[FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions
@ 2024-06-12  4:47 Rémi Denis-Courmont
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function Rémi Denis-Courmont
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12  4:47 UTC (permalink / raw)
  To: ffmpeg-devel

Note that optimised implementations of these functions will be taken
into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
are *not* overloaded by existing optimisations.

---
This adds the plus ones back, saving two branch instructions in C and
one in assembler (at the cost of two unconditional adds).

---
 libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
 libavcodec/h263dsp.h |  4 ++++
 2 files changed, 30 insertions(+)

diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 6a13353499..f4523a68c1 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -19,10 +19,34 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "config.h"
 #include "h263dsp.h"
 
+static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
+                                        int qmul, int qadd)
+{
+    for (size_t i = 0; i < len; i++) {
+        int level = block[i];
+
+        if (level) {
+            if (level < 0)
+                level = level * qmul - qadd;
+            else
+                level = level * qmul + qadd;
+            block[i] = level;
+        }
+    }
+}
+
+static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
+                                        int qmul, int qadd)
+{
+    av_assert1(len >= 1);
+    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
+}
+
 const uint8_t ff_h263_loop_filter_strength[32] = {
     0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
     7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
@@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
 
 av_cold void ff_h263dsp_init(H263DSPContext *ctx)
 {
+    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
+    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
     ctx->h263_h_loop_filter = h263_h_loop_filter_c;
     ctx->h263_v_loop_filter = h263_v_loop_filter_c;
 
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 2dccd23392..0ecbe83314 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -24,6 +24,10 @@
 extern const uint8_t ff_h263_loop_filter_strength[32];
 
 typedef struct H263DSPContext {
+    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
+                                      size_t len, int mul, int add);
+    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
+                                      size_t len, int mul, int add);
     void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
 } H263DSPContext;
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
@ 2024-06-12  4:47 ` Rémi Denis-Courmont
  2024-06-14 14:33   ` James Almer
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter} Rémi Denis-Courmont
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12  4:47 UTC (permalink / raw)
  To: ffmpeg-devel

---
 configure              |  4 ++--
 libavcodec/mpegvideo.c | 46 +++++++++++-------------------------------
 2 files changed, 14 insertions(+), 36 deletions(-)

diff --git a/configure b/configure
index 6baa9b0646..eb9d1b1f5d 100755
--- a/configure
+++ b/configure
@@ -2957,8 +2957,8 @@ ftr_decoder_select="adts_header"
 g2m_decoder_deps="zlib"
 g2m_decoder_select="blockdsp idctdsp jpegtables"
 g729_decoder_select="audiodsp"
-h261_decoder_select="mpegvideodec"
-h261_encoder_select="mpegvideoenc"
+h261_decoder_select="h263dsp mpegvideodec"
+h261_encoder_select="h263dsp mpegvideoenc"
 h263_decoder_select="h263_parser h263dsp mpegvideodec qpeldsp"
 h263_encoder_select="h263dsp mpegvideoenc"
 h263i_decoder_select="h263_decoder"
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 7af823b8bd..b35fd37083 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -201,13 +201,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
 static void dct_unquantize_h263_intra_c(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
 {
-    int i, level, qmul, qadd;
-    int nCoeffs;
+    int qmul = qscale << 1;
+    int qadd, nCoeffs;
 
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
-    qmul = qscale << 1;
-
     if (!s->h263_aic) {
         block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
         qadd = (qscale - 1) | 1;
@@ -215,47 +213,24 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
         qadd = 0;
     }
     if(s->ac_pred)
-        nCoeffs=63;
+        nCoeffs = 64;
     else
-        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
 
-    for(i=1; i<=nCoeffs; i++) {
-        level = block[i];
-        if (level) {
-            if (level < 0) {
-                level = level * qmul - qadd;
-            } else {
-                level = level * qmul + qadd;
-            }
-            block[i] = level;
-        }
-    }
+    s->h263dsp.h263_dct_unquantize_intra(block, nCoeffs, qmul, qadd);
 }
 
 static void dct_unquantize_h263_inter_c(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
 {
-    int i, level, qmul, qadd;
+    int qmul = qscale << 1;
+    int qadd = (qscale - 1) | 1;
     int nCoeffs;
 
     av_assert2(s->block_last_index[n]>=0);
 
-    qadd = (qscale - 1) | 1;
-    qmul = qscale << 1;
-
-    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-    for(i=0; i<=nCoeffs; i++) {
-        level = block[i];
-        if (level) {
-            if (level < 0) {
-                level = level * qmul - qadd;
-            } else {
-                level = level * qmul + qadd;
-            }
-            block[i] = level;
-        }
-    }
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]] + 1;
+    s->h263dsp.h263_dct_unquantize_inter(block, nCoeffs, qmul, qadd);
 }
 
 
@@ -275,6 +250,9 @@ static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
 static av_cold int dct_init(MpegEncContext *s)
 {
     ff_blockdsp_init(&s->bdsp);
+#if CONFIG_H263DSP
+    ff_h263dsp_init(&s->h263dsp);
+#endif
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
 
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function Rémi Denis-Courmont
@ 2024-06-14 14:33   ` James Almer
  2024-06-14 14:45     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 19+ messages in thread
From: James Almer @ 2024-06-14 14:33 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> ---
>   configure              |  4 ++--
>   libavcodec/mpegvideo.c | 46 +++++++++++-------------------------------
>   2 files changed, 14 insertions(+), 36 deletions(-)
> 
> diff --git a/configure b/configure
> index 6baa9b0646..eb9d1b1f5d 100755
> --- a/configure
> +++ b/configure
> @@ -2957,8 +2957,8 @@ ftr_decoder_select="adts_header"
>   g2m_decoder_deps="zlib"
>   g2m_decoder_select="blockdsp idctdsp jpegtables"
>   g729_decoder_select="audiodsp"
> -h261_decoder_select="mpegvideodec"
> -h261_encoder_select="mpegvideoenc"
> +h261_decoder_select="h263dsp mpegvideodec"
> +h261_encoder_select="h263dsp mpegvideoenc"
>   h263_decoder_select="h263_parser h263dsp mpegvideodec qpeldsp"
>   h263_encoder_select="h263dsp mpegvideoenc"
>   h263i_decoder_select="h263_decoder"
> diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
> index 7af823b8bd..b35fd37083 100644
> --- a/libavcodec/mpegvideo.c
> +++ b/libavcodec/mpegvideo.c
> @@ -201,13 +201,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
>   static void dct_unquantize_h263_intra_c(MpegEncContext *s,
>                                     int16_t *block, int n, int qscale)
>   {
> -    int i, level, qmul, qadd;
> -    int nCoeffs;
> +    int qmul = qscale << 1;
> +    int qadd, nCoeffs;
>   
>       av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
>   
> -    qmul = qscale << 1;
> -
>       if (!s->h263_aic) {
>           block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
>           qadd = (qscale - 1) | 1;
> @@ -215,47 +213,24 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
>           qadd = 0;
>       }
>       if(s->ac_pred)
> -        nCoeffs=63;
> +        nCoeffs = 64;
>       else
> -        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
> +        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
>   
> -    for(i=1; i<=nCoeffs; i++) {
> -        level = block[i];
> -        if (level) {
> -            if (level < 0) {
> -                level = level * qmul - qadd;
> -            } else {
> -                level = level * qmul + qadd;
> -            }
> -            block[i] = level;
> -        }
> -    }
> +    s->h263dsp.h263_dct_unquantize_intra(block, nCoeffs, qmul, qadd);

Looking further into this, you're adding a function pointer call in a 
function that's already called from a function pointer. And both x86 and 
arm have asm optimized versions of this entire method, which includes 
all the setup before the loop.

Can't you do the same for riscv? Is there anything preventing you from 
accessing fields at specific offsets within MpegEncContext?

>   }
>   
>   static void dct_unquantize_h263_inter_c(MpegEncContext *s,
>                                     int16_t *block, int n, int qscale)
>   {
> -    int i, level, qmul, qadd;
> +    int qmul = qscale << 1;
> +    int qadd = (qscale - 1) | 1;
>       int nCoeffs;
>   
>       av_assert2(s->block_last_index[n]>=0);
>   
> -    qadd = (qscale - 1) | 1;
> -    qmul = qscale << 1;
> -
> -    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
> -
> -    for(i=0; i<=nCoeffs; i++) {
> -        level = block[i];
> -        if (level) {
> -            if (level < 0) {
> -                level = level * qmul - qadd;
> -            } else {
> -                level = level * qmul + qadd;
> -            }
> -            block[i] = level;
> -        }
> -    }
> +    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]] + 1;
> +    s->h263dsp.h263_dct_unquantize_inter(block, nCoeffs, qmul, qadd);
>   }
>   
>   
> @@ -275,6 +250,9 @@ static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
>   static av_cold int dct_init(MpegEncContext *s)
>   {
>       ff_blockdsp_init(&s->bdsp);
> +#if CONFIG_H263DSP
> +    ff_h263dsp_init(&s->h263dsp);
> +#endif
>       ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
>       ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
>   
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-14 14:33   ` James Almer
@ 2024-06-14 14:45     ` Rémi Denis-Courmont
  2024-06-14 15:11       ` Rémi Denis-Courmont
  0 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-14 14:45 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 14 juin 2024 17:33:16 GMT+03:00, James Almer <jamrial@gmail.com> a écrit :
>On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
>> ---
>>   configure              |  4 ++--
>>   libavcodec/mpegvideo.c | 46 +++++++++++-------------------------------
>>   2 files changed, 14 insertions(+), 36 deletions(-)
>> 
>> diff --git a/configure b/configure
>> index 6baa9b0646..eb9d1b1f5d 100755
>> --- a/configure
>> +++ b/configure
>> @@ -2957,8 +2957,8 @@ ftr_decoder_select="adts_header"
>>   g2m_decoder_deps="zlib"
>>   g2m_decoder_select="blockdsp idctdsp jpegtables"
>>   g729_decoder_select="audiodsp"
>> -h261_decoder_select="mpegvideodec"
>> -h261_encoder_select="mpegvideoenc"
>> +h261_decoder_select="h263dsp mpegvideodec"
>> +h261_encoder_select="h263dsp mpegvideoenc"
>>   h263_decoder_select="h263_parser h263dsp mpegvideodec qpeldsp"
>>   h263_encoder_select="h263dsp mpegvideoenc"
>>   h263i_decoder_select="h263_decoder"
>> diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
>> index 7af823b8bd..b35fd37083 100644
>> --- a/libavcodec/mpegvideo.c
>> +++ b/libavcodec/mpegvideo.c
>> @@ -201,13 +201,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
>>   static void dct_unquantize_h263_intra_c(MpegEncContext *s,
>>                                     int16_t *block, int n, int qscale)
>>   {
>> -    int i, level, qmul, qadd;
>> -    int nCoeffs;
>> +    int qmul = qscale << 1;
>> +    int qadd, nCoeffs;
>>         av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
>>   -    qmul = qscale << 1;
>> -
>>       if (!s->h263_aic) {
>>           block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
>>           qadd = (qscale - 1) | 1;
>> @@ -215,47 +213,24 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
>>           qadd = 0;
>>       }
>>       if(s->ac_pred)
>> -        nCoeffs=63;
>> +        nCoeffs = 64;
>>       else
>> -        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
>> +        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
>>   -    for(i=1; i<=nCoeffs; i++) {
>> -        level = block[i];
>> -        if (level) {
>> -            if (level < 0) {
>> -                level = level * qmul - qadd;
>> -            } else {
>> -                level = level * qmul + qadd;
>> -            }
>> -            block[i] = level;
>> -        }
>> -    }
>> +    s->h263dsp.h263_dct_unquantize_intra(block, nCoeffs, qmul, qadd);
>
>Looking further into this, you're adding a function pointer call in a function that's already called from a function pointer. And both x86 and arm have asm optimized versions of this entire method, which includes all the setup before the loop.

I am not interested in copying existing bad design. Note that the Arm code uses intrinsics. I don't want to use RVV intrinsics especially just for this. And x86 only has MMX.

>Can't you do the same for riscv?

Sure, RV can address fixed offsets up to +/-2 KiB. If someone *else* adds a assembler-friendly header file that defines the offsets to the relevant context fields, and rewrites the checkasm code to match, that is.

> Is there anything preventing you from accessing fields at specific offsets within MpegEncContext?

My strong belief that that would be technically wrong, yes.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-14 14:45     ` Rémi Denis-Courmont
@ 2024-06-14 15:11       ` Rémi Denis-Courmont
  2024-06-14 15:41         ` James Almer
  0 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-14 15:11 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le perjantaina 14. kesäkuuta 2024, 17.45.50 EEST Rémi Denis-Courmont a écrit :
> And x86 only has MMX.

And the x86 code uses inline assembler. That's not viable on any ISA with sane 
conventions such as Arm or RISC-V. The compiler will reject our vector 
clobbers, unless the Vector extension is included in the compilation target. 
But then that breaks run-time detection, and will be rejected by all Linux 
distros.

-- 
Rémi Denis-Courmont
http://www.remlab.net/

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-14 15:11       ` Rémi Denis-Courmont
@ 2024-06-14 15:41         ` James Almer
  2024-06-14 19:03           ` Rémi Denis-Courmont
  0 siblings, 1 reply; 19+ messages in thread
From: James Almer @ 2024-06-14 15:41 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/14/2024 12:11 PM, Rémi Denis-Courmont wrote:
> Le perjantaina 14. kesäkuuta 2024, 17.45.50 EEST Rémi Denis-Courmont a écrit :
>> And x86 only has MMX.
> 
> And the x86 code uses inline assembler. That's not viable on any ISA with sane
> conventions such as Arm or RISC-V. The compiler will reject our vector
> clobbers, unless the Vector extension is included in the compilation target.
> But then that breaks run-time detection, and will be rejected by all Linux
> distros.

I was thinking on removing that inline version and probably replacing it 
with sse2, using the dsp prototypes you introduce in this set.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function
  2024-06-14 15:41         ` James Almer
@ 2024-06-14 19:03           ` Rémi Denis-Courmont
  0 siblings, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-14 19:03 UTC (permalink / raw)
  To: ffmpeg-devel

Le perjantaina 14. kesäkuuta 2024, 18.41.48 EEST James Almer a écrit :
> On 6/14/2024 12:11 PM, Rémi Denis-Courmont wrote:
> > Le perjantaina 14. kesäkuuta 2024, 17.45.50 EEST Rémi Denis-Courmont a 
écrit :
> >> And x86 only has MMX.
> > 
> > And the x86 code uses inline assembler. That's not viable on any ISA with
> > sane conventions such as Arm or RISC-V. The compiler will reject our
> > vector clobbers, unless the Vector extension is included in the
> > compilation target. But then that breaks run-time detection, and will be
> > rejected by all Linux distros.
> 
> I was thinking on removing that inline version and probably replacing it
> with sse2, using the dsp prototypes you introduce in this set.

I saw that, but we can't have the cake and eat it. Those DSP callbacks can't 
exist without adding a layer of indirection.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function Rémi Denis-Courmont
@ 2024-06-12  4:47 ` Rémi Denis-Courmont
  2024-06-12 18:39   ` James Almer
  2024-06-13 19:41   ` [FFmpeg-devel] [PATCHv6 " Rémi Denis-Courmont
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 4/4] lavc/h263dsp: R-V V " Rémi Denis-Courmont
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12  4:47 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/h263dsp.c | 47 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
index 2d0957a90b..8a2cdb34df 100644
--- a/tests/checkasm/h263dsp.c
+++ b/tests/checkasm/h263dsp.c
@@ -18,13 +18,55 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stdbool.h>
 #include <string.h>
 
 #include "checkasm.h"
 
-#include "libavcodec/h263dsp.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
+#include "libavcodec/h263dsp.h"
+#include "libavcodec/mpegvideodata.h"
+
+static uint_fast8_t mpeg_qscale_rnd(void)
+{
+    int n = rnd(), q = (n >> 1) & 31;
+
+    if (n & 1)
+        return ff_mpeg2_non_linear_qscale[q];
+    else
+        return q << 1;
+}
+
+typedef void (*unquantizer)(int16_t *, size_t, int, int);
+
+static void check_dct_unquantize(unquantizer func, const char *name)
+{
+#define LEN 64
+    LOCAL_ALIGNED_16(int16_t, block0, [LEN]);
+    LOCAL_ALIGNED_16(int16_t, block1, [LEN]);
+    size_t len = 1 + (rnd() & (LEN - 1));
+    const int qscale = mpeg_qscale_rnd();
+    const int qmul = qscale << 1;
+    const int qadd = (rnd() & 1) ? (qscale - 1) | 1 : 0;
+
+    declare_func(void, int16_t *, size_t, int, int);
+
+    for (size_t i = 0; i < LEN; i++)
+        block1[i] = block0[i] = (rnd() & 1) ? rnd() : 0;
+
+    if (check_func(func, "h263dsp.dct_unquantize_%s", name)) {
+        av_assert0(len <= LEN);
+        call_ref(block0, len, qmul, qadd);
+        call_new(block1, len, qmul, qadd);
+
+        if (memcmp(block0, block1, len * sizeof (int16_t)))
+            fail();
+
+        bench_new(block1, LEN, qmul, qadd);
+    }
+}
 
 typedef void (*filter)(uint8_t *src, int stride, int qscale);
 
@@ -56,6 +98,9 @@ void checkasm_check_h263dsp(void)
     H263DSPContext ctx;
 
     ff_h263dsp_init(&ctx);
+    check_dct_unquantize(ctx.h263_dct_unquantize_intra, "intra");
+    check_dct_unquantize(ctx.h263_dct_unquantize_inter, "inter");
+    report("dct_unquantize");
     check_loop_filter('h', ctx.h263_h_loop_filter);
     check_loop_filter('v', ctx.h263_v_loop_filter);
     report("loop_filter");
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter} Rémi Denis-Courmont
@ 2024-06-12 18:39   ` James Almer
  2024-06-12 19:15     ` Rémi Denis-Courmont
  2024-06-13 19:41   ` [FFmpeg-devel] [PATCHv6 " Rémi Denis-Courmont
  1 sibling, 1 reply; 19+ messages in thread
From: James Almer @ 2024-06-12 18:39 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> ---
>   tests/checkasm/h263dsp.c | 47 +++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 46 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
> index 2d0957a90b..8a2cdb34df 100644
> --- a/tests/checkasm/h263dsp.c
> +++ b/tests/checkasm/h263dsp.c
> @@ -18,13 +18,55 @@
>    * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>    */
>   
> +#include <stdbool.h>
>   #include <string.h>
>   
>   #include "checkasm.h"
>   
> -#include "libavcodec/h263dsp.h"
> +#include "libavutil/avassert.h"
>   #include "libavutil/mem.h"
>   #include "libavutil/mem_internal.h"
> +#include "libavcodec/h263dsp.h"
> +#include "libavcodec/mpegvideodata.h"
> +
> +static uint_fast8_t mpeg_qscale_rnd(void)
> +{
> +    int n = rnd(), q = (n >> 1) & 31;
> +
> +    if (n & 1)
> +        return ff_mpeg2_non_linear_qscale[q];
> +    else
> +        return q << 1;
> +}
> +
> +typedef void (*unquantizer)(int16_t *, size_t, int, int);
> +
> +static void check_dct_unquantize(unquantizer func, const char *name)
> +{
> +#define LEN 64
> +    LOCAL_ALIGNED_16(int16_t, block0, [LEN]);
> +    LOCAL_ALIGNED_16(int16_t, block1, [LEN]);

These are not padded, and unless I'm reading this wrong, an asm 
implementation loading say 16 bytes at a time will overread/write in 
dct_unquantize_intra (which offsets block by 1).

> +    size_t len = 1 + (rnd() & (LEN - 1));
> +    const int qscale = mpeg_qscale_rnd();
> +    const int qmul = qscale << 1;
> +    const int qadd = (rnd() & 1) ? (qscale - 1) | 1 : 0;
> +
> +    declare_func(void, int16_t *, size_t, int, int);
> +
> +    for (size_t i = 0; i < LEN; i++)
> +        block1[i] = block0[i] = (rnd() & 1) ? rnd() : 0;
> +
> +    if (check_func(func, "h263dsp.dct_unquantize_%s", name)) {
> +        av_assert0(len <= LEN);
> +        call_ref(block0, len, qmul, qadd);
> +        call_new(block1, len, qmul, qadd);
> +
> +        if (memcmp(block0, block1, len * sizeof (int16_t)))
> +            fail();
> +
> +        bench_new(block1, LEN, qmul, qadd);
> +    }
> +}
>   
>   typedef void (*filter)(uint8_t *src, int stride, int qscale);
>   
> @@ -56,6 +98,9 @@ void checkasm_check_h263dsp(void)
>       H263DSPContext ctx;
>   
>       ff_h263dsp_init(&ctx);
> +    check_dct_unquantize(ctx.h263_dct_unquantize_intra, "intra");
> +    check_dct_unquantize(ctx.h263_dct_unquantize_inter, "inter");
> +    report("dct_unquantize");
>       check_loop_filter('h', ctx.h263_h_loop_filter);
>       check_loop_filter('v', ctx.h263_v_loop_filter);
>       report("loop_filter");
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12 18:39   ` James Almer
@ 2024-06-12 19:15     ` Rémi Denis-Courmont
  2024-06-12 19:39       ` James Almer
  0 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12 19:15 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 12. kesäkuuta 2024, 21.39.12 EEST James Almer a écrit :
> On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> > ---
> > 
> >   tests/checkasm/h263dsp.c | 47 +++++++++++++++++++++++++++++++++++++++-
> >   1 file changed, 46 insertions(+), 1 deletion(-)
> > 
> > diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
> > index 2d0957a90b..8a2cdb34df 100644
> > --- a/tests/checkasm/h263dsp.c
> > +++ b/tests/checkasm/h263dsp.c
> > @@ -18,13 +18,55 @@
> > 
> >    * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> >    */
> > 
> > +#include <stdbool.h>
> > 
> >   #include <string.h>
> >   
> >   #include "checkasm.h"
> > 
> > -#include "libavcodec/h263dsp.h"
> > +#include "libavutil/avassert.h"
> > 
> >   #include "libavutil/mem.h"
> >   #include "libavutil/mem_internal.h"
> > 
> > +#include "libavcodec/h263dsp.h"
> > +#include "libavcodec/mpegvideodata.h"
> > +
> > +static uint_fast8_t mpeg_qscale_rnd(void)
> > +{
> > +    int n = rnd(), q = (n >> 1) & 31;
> > +
> > +    if (n & 1)
> > +        return ff_mpeg2_non_linear_qscale[q];
> > +    else
> > +        return q << 1;
> > +}
> > +
> > +typedef void (*unquantizer)(int16_t *, size_t, int, int);
> > +
> > +static void check_dct_unquantize(unquantizer func, const char *name)
> > +{
> > +#define LEN 64
> > +    LOCAL_ALIGNED_16(int16_t, block0, [LEN]);
> > +    LOCAL_ALIGNED_16(int16_t, block1, [LEN]);
> 
> These are not padded, and unless I'm reading this wrong, an asm
> implementation loading say 16 bytes at a time will overread/write in
> dct_unquantize_intra (which offsets block by 1).

AFAIU, there is no padding per se, but the block buffer size is always exactly 
64 elements, regardless of the number of coeffs, hence this code. The old NEON 
intrinsic code seems to assume the block is a multiple of 8 elements, and the 
tail can be overwritten safely (hence not checking in memcmp()).

I have a feeling that I am not grasping the implications of you comment here.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12 19:15     ` Rémi Denis-Courmont
@ 2024-06-12 19:39       ` James Almer
  2024-06-12 19:52         ` Rémi Denis-Courmont
  0 siblings, 1 reply; 19+ messages in thread
From: James Almer @ 2024-06-12 19:39 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/12/2024 4:15 PM, Rémi Denis-Courmont wrote:
> Le keskiviikkona 12. kesäkuuta 2024, 21.39.12 EEST James Almer a écrit :
>> On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
>>> ---
>>>
>>>    tests/checkasm/h263dsp.c | 47 +++++++++++++++++++++++++++++++++++++++-
>>>    1 file changed, 46 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
>>> index 2d0957a90b..8a2cdb34df 100644
>>> --- a/tests/checkasm/h263dsp.c
>>> +++ b/tests/checkasm/h263dsp.c
>>> @@ -18,13 +18,55 @@
>>>
>>>     * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
>>>     */
>>>
>>> +#include <stdbool.h>
>>>
>>>    #include <string.h>
>>>    
>>>    #include "checkasm.h"
>>>
>>> -#include "libavcodec/h263dsp.h"
>>> +#include "libavutil/avassert.h"
>>>
>>>    #include "libavutil/mem.h"
>>>    #include "libavutil/mem_internal.h"
>>>
>>> +#include "libavcodec/h263dsp.h"
>>> +#include "libavcodec/mpegvideodata.h"
>>> +
>>> +static uint_fast8_t mpeg_qscale_rnd(void)
>>> +{
>>> +    int n = rnd(), q = (n >> 1) & 31;
>>> +
>>> +    if (n & 1)
>>> +        return ff_mpeg2_non_linear_qscale[q];
>>> +    else
>>> +        return q << 1;
>>> +}
>>> +
>>> +typedef void (*unquantizer)(int16_t *, size_t, int, int);
>>> +
>>> +static void check_dct_unquantize(unquantizer func, const char *name)
>>> +{
>>> +#define LEN 64
>>> +    LOCAL_ALIGNED_16(int16_t, block0, [LEN]);
>>> +    LOCAL_ALIGNED_16(int16_t, block1, [LEN]);
>>
>> These are not padded, and unless I'm reading this wrong, an asm
>> implementation loading say 16 bytes at a time will overread/write in
>> dct_unquantize_intra (which offsets block by 1).
> 
> AFAIU, there is no padding per se, but the block buffer size is always exactly
> 64 elements, regardless of the number of coeffs, hence this code. The old NEON
> intrinsic code seems to assume the block is a multiple of 8 elements, and the
> tail can be overwritten safely (hence not checking in memcmp()).
> 
> I have a feeling that I am not grasping the implications of you comment here.

An asm function loading 16 bytes at a time from block[1] onwards for 
intra may end up reading two bytes more than available at the end of the 
128 byte wide buffer.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12 19:39       ` James Almer
@ 2024-06-12 19:52         ` Rémi Denis-Courmont
  0 siblings, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12 19:52 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 12. kesäkuuta 2024, 22.39.49 EEST James Almer a écrit :
> >> These are not padded, and unless I'm reading this wrong, an asm
> >> implementation loading say 16 bytes at a time will overread/write in
> >> dct_unquantize_intra (which offsets block by 1).
> > 
> > AFAIU, there is no padding per se, but the block buffer size is always
> > exactly 64 elements, regardless of the number of coeffs, hence this code.
> > The old NEON intrinsic code seems to assume the block is a multiple of 8
> > elements, and the tail can be overwritten safely (hence not checking in
> > memcmp()).
> > 
> > I have a feeling that I am not grasping the implications of you comment
> > here.
> An asm function loading 16 bytes at a time from block[1] onwards for
> intra may end up reading two bytes more than available at the end of the
> 128 byte wide buffer.

Wouldn't that be a bug in the assembler function? Do you mean that checkasm 
should add padding to check against overwrites?

The whole point of separating inter and intra was to preserve alignment for 
those instruction set extensions that want it (C and RVV couldn't care less).

-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [FFmpeg-devel] [PATCHv6 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter}
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter} Rémi Denis-Courmont
  2024-06-12 18:39   ` James Almer
@ 2024-06-13 19:41   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-13 19:41 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/h263dsp.c | 50 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/tests/checkasm/h263dsp.c b/tests/checkasm/h263dsp.c
index 2d0957a90b..69b303a6cc 100644
--- a/tests/checkasm/h263dsp.c
+++ b/tests/checkasm/h263dsp.c
@@ -18,13 +18,58 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stdbool.h>
 #include <string.h>
 
 #include "checkasm.h"
 
-#include "libavcodec/h263dsp.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
+#include "libavcodec/h263dsp.h"
+#include "libavcodec/mpegvideodata.h"
+
+static uint_fast8_t mpeg_qscale_rnd(void)
+{
+    int n = rnd(), q = (n >> 1) & 31;
+
+    if (n & 1)
+        return ff_mpeg2_non_linear_qscale[q];
+    else
+        return q << 1;
+}
+
+typedef void (*unquantizer)(int16_t *, size_t, int, int);
+
+static void check_dct_unquantize(unquantizer func, const char *name)
+{
+#define LEN 64
+    LOCAL_ALIGNED_16(int16_t, block0, [LEN]);
+    LOCAL_ALIGNED_16(int16_t, block1, [LEN]);
+    size_t len = 1 + (rnd() & (LEN - 1));
+    const int qscale = mpeg_qscale_rnd();
+    const int qmul = qscale << 1;
+    const int qadd = (rnd() & 1) ? (qscale - 1) | 1 : 0;
+
+    declare_func(void, int16_t *, size_t, int, int);
+
+    for (size_t i = 0; i < LEN; i++) {
+        int r = rnd();
+
+        block1[i] = block0[i] = (r & 1) ? (r >> 1) : 0;
+    }
+
+    if (check_func(func, "h263dsp.dct_unquantize_%s", name)) {
+        av_assert0(len <= LEN);
+        call_ref(block0, len, qmul, qadd);
+        call_new(block1, len, qmul, qadd);
+
+        if (memcmp(block0, block1, len * sizeof (int16_t)))
+            fail();
+
+        bench_new(block1, LEN, qmul, qadd);
+    }
+}
 
 typedef void (*filter)(uint8_t *src, int stride, int qscale);
 
@@ -56,6 +101,9 @@ void checkasm_check_h263dsp(void)
     H263DSPContext ctx;
 
     ff_h263dsp_init(&ctx);
+    check_dct_unquantize(ctx.h263_dct_unquantize_intra, "intra");
+    check_dct_unquantize(ctx.h263_dct_unquantize_inter, "inter");
+    report("dct_unquantize");
     check_loop_filter('h', ctx.h263_h_loop_filter);
     check_loop_filter('v', ctx.h263_v_loop_filter);
     report("loop_filter");
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] lavc/h263dsp: R-V V dct_unquantize_{intra, inter}
  2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function Rémi Denis-Courmont
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter} Rémi Denis-Courmont
@ 2024-06-12  4:47 ` Rémi Denis-Courmont
  2024-06-13 19:41   ` [FFmpeg-devel] [PATCHv6 " Rémi Denis-Courmont
  2024-06-12 17:40 ` [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions James Almer
  2024-06-13  2:23 ` Andreas Rheinhardt
  4 siblings, 1 reply; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12  4:47 UTC (permalink / raw)
  To: ffmpeg-devel

T-Head C908:
h263dsp.dct_unquantize_inter_c:       3.7
h263dsp.dct_unquantize_inter_rvv_i32: 1.7
h263dsp.dct_unquantize_intra_c:       4.0
h263dsp.dct_unquantize_intra_rvv_i32: 2.0

SpacemiT X60:
h263dsp.dct_unquantize_inter_c:       3.5
h263dsp.dct_unquantize_inter_rvv_i32: 1.5
h263dsp.dct_unquantize_intra_c:       3.5
h263dsp.dct_unquantize_intra_rvv_i32: 1.5
---
 libavcodec/riscv/h263dsp_init.c | 15 ++++++++++++---
 libavcodec/riscv/h263dsp_rvv.S  | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/libavcodec/riscv/h263dsp_init.c b/libavcodec/riscv/h263dsp_init.c
index 21b536366c..5fb12f360b 100644
--- a/libavcodec/riscv/h263dsp_init.c
+++ b/libavcodec/riscv/h263dsp_init.c
@@ -25,6 +25,8 @@
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h263dsp.h"
 
+void ff_h263_dct_unquantize_intra_rvv(int16_t *, size_t len, int, int);
+void ff_h263_dct_unquantize_inter_rvv(int16_t *, size_t len, int, int);
 void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q);
 void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q);
 
@@ -33,9 +35,16 @@ av_cold void ff_h263dsp_init_riscv(H263DSPContext *c)
 #if HAVE_RVV
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
-        c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
-        c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+    if (flags & AV_CPU_FLAG_RVV_I32) {
+        if (ff_rv_vlen_least(128) || (flags & AV_CPU_FLAG_RVB_ADDR)) {
+            c->h263_dct_unquantize_intra = ff_h263_dct_unquantize_intra_rvv;
+            c->h263_dct_unquantize_inter = ff_h263_dct_unquantize_inter_rvv;
+        }
+
+        if (ff_rv_vlen_least(128)) {
+            c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
+            c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+        }
     }
 #endif
 }
diff --git a/libavcodec/riscv/h263dsp_rvv.S b/libavcodec/riscv/h263dsp_rvv.S
index 97503d527c..080091bfb1 100644
--- a/libavcodec/riscv/h263dsp_rvv.S
+++ b/libavcodec/riscv/h263dsp_rvv.S
@@ -20,6 +20,32 @@
 
 #include "libavutil/riscv/asm.S"
 
+func ff_h263_dct_unquantize_intra_rvv, zve32x
+        addi    a1, a1, -1
+        addi    a0, a0, 2
+        # fall through
+endfunc
+
+func ff_h263_dct_unquantize_inter_rvv, zve32x
+1:
+        vsetvli  t0, a1, e16, m8, ta, mu
+        vle16.v  v8, (a0)
+        sub      a1, a1, t0
+        vmv.v.x  v24, a3
+        vmslt.vi v0, v8, 0
+        vmul.vx  v16, v8, a2
+        vneg.v   v24, v24, v0.t
+        vmsne.vi v0, v8, 0
+        vadd.vv  v8, v16, v24, v0.t
+        vse16.v v8, (a0)
+#if defined(__riscv_v_min_vlen) && __riscv_v_min_vlen < 128
+        sh1add  a0, t0, a0
+        bnez    a1, 1b
+#endif
+
+        ret
+endfunc
+
         .option push
         .option norelax
 func ff_h263_h_loop_filter_rvv, zve32x
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [FFmpeg-devel] [PATCHv6 4/4] lavc/h263dsp: R-V V dct_unquantize_{intra, inter}
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 4/4] lavc/h263dsp: R-V V " Rémi Denis-Courmont
@ 2024-06-13 19:41   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-13 19:41 UTC (permalink / raw)
  To: ffmpeg-devel

T-Head C908:
h263dsp.dct_unquantize_inter_c:       3.7
h263dsp.dct_unquantize_inter_rvv_i32: 1.7
h263dsp.dct_unquantize_intra_c:       4.0
h263dsp.dct_unquantize_intra_rvv_i32: 1.5

SpacemiT X60:
h263dsp.dct_unquantize_inter_c:       3.5
h263dsp.dct_unquantize_inter_rvv_i32: 1.2
h263dsp.dct_unquantize_intra_c:       3.5
h263dsp.dct_unquantize_intra_rvv_i32: 1.2
---
 libavcodec/riscv/h263dsp_init.c | 15 ++++++++++++---
 libavcodec/riscv/h263dsp_rvv.S  | 25 +++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/libavcodec/riscv/h263dsp_init.c b/libavcodec/riscv/h263dsp_init.c
index 21b536366c..5fb12f360b 100644
--- a/libavcodec/riscv/h263dsp_init.c
+++ b/libavcodec/riscv/h263dsp_init.c
@@ -25,6 +25,8 @@
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h263dsp.h"
 
+void ff_h263_dct_unquantize_intra_rvv(int16_t *, size_t len, int, int);
+void ff_h263_dct_unquantize_inter_rvv(int16_t *, size_t len, int, int);
 void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q);
 void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q);
 
@@ -33,9 +35,16 @@ av_cold void ff_h263dsp_init_riscv(H263DSPContext *c)
 #if HAVE_RVV
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
-        c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
-        c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+    if (flags & AV_CPU_FLAG_RVV_I32) {
+        if (ff_rv_vlen_least(128) || (flags & AV_CPU_FLAG_RVB_ADDR)) {
+            c->h263_dct_unquantize_intra = ff_h263_dct_unquantize_intra_rvv;
+            c->h263_dct_unquantize_inter = ff_h263_dct_unquantize_inter_rvv;
+        }
+
+        if (ff_rv_vlen_least(128)) {
+            c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
+            c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+        }
     }
 #endif
 }
diff --git a/libavcodec/riscv/h263dsp_rvv.S b/libavcodec/riscv/h263dsp_rvv.S
index 97503d527c..3d6be8f58f 100644
--- a/libavcodec/riscv/h263dsp_rvv.S
+++ b/libavcodec/riscv/h263dsp_rvv.S
@@ -20,6 +20,31 @@
 
 #include "libavutil/riscv/asm.S"
 
+func ff_h263_dct_unquantize_intra_rvv, zve32x
+        addi    a1, a1, -1
+        addi    a0, a0, 2
+        # fall through
+endfunc
+
+func ff_h263_dct_unquantize_inter_rvv, zve32x
+1:
+        vsetvli  t0, a1, e16, m8, ta, mu
+        vle16.v  v8, (a0)
+        sub      a1, a1, t0
+        vmv.v.x  v16, a3
+        vmslt.vi v0, v8, 0
+        vneg.v   v16, v16, v0.t
+        vmsne.vi v0, v8, 0
+        vmadd.vx v8, a2, v16, v0.t
+        vse16.v  v8, (a0)
+#if defined(__riscv_v_min_vlen) && __riscv_v_min_vlen < 128
+        sh1add   a0, t0, a0
+        bnez     a1, 1b
+#endif
+
+        ret
+endfunc
+
         .option push
         .option norelax
 func ff_h263_h_loop_filter_rvv, zve32x
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions
  2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
                   ` (2 preceding siblings ...)
  2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 4/4] lavc/h263dsp: R-V V " Rémi Denis-Courmont
@ 2024-06-12 17:40 ` James Almer
  2024-06-12 18:10   ` Rémi Denis-Courmont
  2024-06-13  2:23 ` Andreas Rheinhardt
  4 siblings, 1 reply; 19+ messages in thread
From: James Almer @ 2024-06-12 17:40 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> Note that optimised implementations of these functions will be taken
> into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> are *not* overloaded by existing optimisations.
> 
> ---
> This adds the plus ones back, saving two branch instructions in C and
> one in assembler (at the cost of two unconditional adds).

See my reply in the previous version. Not sure if it will help with this.

> 
> ---
>   libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
>   libavcodec/h263dsp.h |  4 ++++
>   2 files changed, 30 insertions(+)
> 
> diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
> index 6a13353499..f4523a68c1 100644
> --- a/libavcodec/h263dsp.c
> +++ b/libavcodec/h263dsp.c
> @@ -19,10 +19,34 @@
>   #include <stdint.h>
>   
>   #include "libavutil/attributes.h"
> +#include "libavutil/avassert.h"
>   #include "libavutil/common.h"
>   #include "config.h"
>   #include "h263dsp.h"
>   
> +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    for (size_t i = 0; i < len; i++) {
> +        int level = block[i];
> +
> +        if (level) {
> +            if (level < 0)
> +                level = level * qmul - qadd;
> +            else
> +                level = level * qmul + qadd;
> +            block[i] = level;
> +        }
> +    }
> +}
> +
> +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    av_assert1(len >= 1);
> +    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
> +}
> +
>   const uint8_t ff_h263_loop_filter_strength[32] = {
>       0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
>       7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
> @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
>   
>   av_cold void ff_h263dsp_init(H263DSPContext *ctx)
>   {
> +    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
> +    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
>       ctx->h263_h_loop_filter = h263_h_loop_filter_c;
>       ctx->h263_v_loop_filter = h263_v_loop_filter_c;
>   
> diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
> index 2dccd23392..0ecbe83314 100644
> --- a/libavcodec/h263dsp.h
> +++ b/libavcodec/h263dsp.h
> @@ -24,6 +24,10 @@
>   extern const uint8_t ff_h263_loop_filter_strength[32];
>   
>   typedef struct H263DSPContext {
> +    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
> +    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
>       void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
>       void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
>   } H263DSPContext;
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions
  2024-06-12 17:40 ` [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions James Almer
@ 2024-06-12 18:10   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-12 18:10 UTC (permalink / raw)
  To: ffmpeg-devel

Le keskiviikkona 12. kesäkuuta 2024, 20.40.37 EEST James Almer a écrit :
> On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> > Note that optimised implementations of these functions will be taken
> > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> > are *not* overloaded by existing optimisations.
> > 
> > ---
> > This adds the plus ones back, saving two branch instructions in C and
> > one in assembler (at the cost of two unconditional adds).
> 
> See my reply in the previous version. Not sure if it will help with this.

We can of course avoid the branches - this version avoids the branches, as did 
the initial versions. In C (and in RVV), we can't avoid incrementing the 
pointer and a counter variable.

If you change the loop like yuo suggest:
     for (size_t i = 1; i <= nCoeffs; i++) { 
         int level = block[i];

         if (level) {
             if (level < 0)
                 level = level * qmul - qadd;
             else
                 level = level * qmul + qadd;
             block[i] = level;
         }
    }

... at best, an optimising compiler will reinterpret it to:

     if (nCoeffs >= 1) {
         block++;
         end = block + nCoeffs;

      loop:
            level = *block;
            if (level) {
                 tmp = level * qmul;
                 if (level < 0)
                     tmp -= qadd;
                 else
                     tmp += qadd;
                 *(block++) = tmp;
             }
            if (block <= end)
                goto loop;
    }

Or perhaps the compiler will keep an explicit counter, which is even worse. 
This does not save branches, nor increments. It just looks like it because of 
the syntactic sugar that is the for() loop. In reality, this only duplicates 
code (as we can no longer share between inter/intra).

-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions
  2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
                   ` (3 preceding siblings ...)
  2024-06-12 17:40 ` [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions James Almer
@ 2024-06-13  2:23 ` Andreas Rheinhardt
  2024-06-13 19:22   ` Rémi Denis-Courmont
  4 siblings, 1 reply; 19+ messages in thread
From: Andreas Rheinhardt @ 2024-06-13  2:23 UTC (permalink / raw)
  To: ffmpeg-devel

Rémi Denis-Courmont:
> Note that optimised implementations of these functions will be taken
> into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> are *not* overloaded by existing optimisations.
> 
> ---
> This adds the plus ones back, saving two branch instructions in C and
> one in assembler (at the cost of two unconditional adds).
> 

I don't see how this saves branch instructions: len - 1 is allowed to be
0 in h263_dct_unquantize_intra and therefore the initial check can't be
avoided (presuming the compiler produces an initial check+do-loop). It
seems to me that your intra assembly simply ignores this.

If you were to check for the case of nCoeffs == 0 in
dct_unquantize_h263_intra_c (before the call!), you could write assembly
with this restriction in mind. It would also allow the compiler to avoid
the branch in case it is known that nCoeffs == 63.

> ---
>  libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
>  libavcodec/h263dsp.h |  4 ++++
>  2 files changed, 30 insertions(+)
> 
> diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
> index 6a13353499..f4523a68c1 100644
> --- a/libavcodec/h263dsp.c
> +++ b/libavcodec/h263dsp.c
> @@ -19,10 +19,34 @@
>  #include <stdint.h>
>  
>  #include "libavutil/attributes.h"
> +#include "libavutil/avassert.h"
>  #include "libavutil/common.h"
>  #include "config.h"
>  #include "h263dsp.h"
>  
> +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    for (size_t i = 0; i < len; i++) {
> +        int level = block[i];
> +
> +        if (level) {
> +            if (level < 0)
> +                level = level * qmul - qadd;
> +            else
> +                level = level * qmul + qadd;
> +            block[i] = level;
> +        }
> +    }
> +}
> +
> +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    av_assert1(len >= 1);
> +    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
> +}
> +
>  const uint8_t ff_h263_loop_filter_strength[32] = {
>      0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
>      7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
> @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
>  
>  av_cold void ff_h263dsp_init(H263DSPContext *ctx)
>  {
> +    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
> +    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
>      ctx->h263_h_loop_filter = h263_h_loop_filter_c;
>      ctx->h263_v_loop_filter = h263_v_loop_filter_c;
>  
> diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
> index 2dccd23392..0ecbe83314 100644
> --- a/libavcodec/h263dsp.h
> +++ b/libavcodec/h263dsp.h
> @@ -24,6 +24,10 @@
>  extern const uint8_t ff_h263_loop_filter_strength[32];
>  
>  typedef struct H263DSPContext {
> +    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
> +    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
>      void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
>      void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
>  } H263DSPContext;

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions
  2024-06-13  2:23 ` Andreas Rheinhardt
@ 2024-06-13 19:22   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 19+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-13 19:22 UTC (permalink / raw)
  To: ffmpeg-devel

Le torstaina 13. kesäkuuta 2024, 5.23.12 EEST Andreas Rheinhardt a écrit :
> Rémi Denis-Courmont:
> > Note that optimised implementations of these functions will be taken
> > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> > are *not* overloaded by existing optimisations.
> > 
> > ---
> > This adds the plus ones back, saving two branch instructions in C and
> > one in assembler (at the cost of two unconditional adds).
> 
> I don't see how this saves branch instructions:

You can compare versions 4 and 5. The C had the extra if() that you complained 
about and the assembler had an explicit extra branch.

That being said, if someone wants to microoptimise the C version of DSP 
function, there are much greater and much less debatable savings to be had 
elsewhere - for instance adding missing restrict keywords (I don't mean in 
this particular case).

The point of this patchset is not to optimise the C code, rather to enable 
checkasm and avoid copying the same scalar prologues in RVV and SSE2.

> It seems to me that your intra assembly simply ignores [that] len -1 is
> allowed to be 0.

> If you were to check for the case of nCoeffs == 0 in
> dct_unquantize_h263_intra_c (before the call!), you could write assembly
> with this restriction in mind.

If it really is a common case worth optimising for, then indeed it could be 
checked in the common C calling code. But that is not a call that I can 
credibly make, and is outside the scope of this MR. If someone has data to 
back that optimisation, they are welcome to submit it as a separate patch.

The assembler works fine with 0 length. The only assumption is that the length 
is the actual and unsigned length.

> It would also allow the compiler to avoid
> the branch in case it is known that nCoeffs == 63.

-- 
Rémi Denis-Courmont
http://www.remlab.net/

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2024-06-14 19:03 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-12  4:47 [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions Rémi Denis-Courmont
2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 2/4] lavc/mpegvideo: use H263DSP dequant function Rémi Denis-Courmont
2024-06-14 14:33   ` James Almer
2024-06-14 14:45     ` Rémi Denis-Courmont
2024-06-14 15:11       ` Rémi Denis-Courmont
2024-06-14 15:41         ` James Almer
2024-06-14 19:03           ` Rémi Denis-Courmont
2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 3/4] checkasm/h263dsp: test dct_unquantize_{intra, inter} Rémi Denis-Courmont
2024-06-12 18:39   ` James Almer
2024-06-12 19:15     ` Rémi Denis-Courmont
2024-06-12 19:39       ` James Almer
2024-06-12 19:52         ` Rémi Denis-Courmont
2024-06-13 19:41   ` [FFmpeg-devel] [PATCHv6 " Rémi Denis-Courmont
2024-06-12  4:47 ` [FFmpeg-devel] [PATCH 4/4] lavc/h263dsp: R-V V " Rémi Denis-Courmont
2024-06-13 19:41   ` [FFmpeg-devel] [PATCHv6 " Rémi Denis-Courmont
2024-06-12 17:40 ` [FFmpeg-devel] [PATCHv5 1/4] lavc/h263dsp: add DCT dequantisation functions James Almer
2024-06-12 18:10   ` Rémi Denis-Courmont
2024-06-13  2:23 ` Andreas Rheinhardt
2024-06-13 19:22   ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git