Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
@ 2023-09-08  8:15 Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters Christophe Gisquet
                   ` (8 more replies)
  0 siblings, 9 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

Summary of changes
- move back to regular, non-macro, get_bits API
- reduce the lookup to switch the coding method
- shorter reads wherever possible, in particular for the end of bitstream
  (16 bits instead of 32, as per the above)

There are cases that really need longer lengths (larger EG codes) of up
to 27 bits.

Win64: 6.10 -> 4.87 (~20% speedup)

Reference for an hypothetical 32bits version of the cached reader:
Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
---
 libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 9297860946..6e243cfc17 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -24,9 +24,7 @@
  * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
  */
 
-//#define DEBUG
-
-#define LONG_BITSTREAM_READER
+#define CACHED_BITSTREAM_READER 1
 
 #include "config_components.h"
 
@@ -422,35 +420,37 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
     return pic_data_size;
 }
 
-#define DECODE_CODEWORD(val, codebook, SKIP)                            \
+/* bitstream_read may fail on 32bits ARCHS for >24 bits, so use long version there */
+#if 0 //BITSTREAM_BITS == 32
+# define READ_BITS get_bits_long
+#else
+# define READ_BITS get_bits
+#endif
+
+#define DECODE_CODEWORD(val, codebook)                                  \
     do {                                                                \
         unsigned int rice_order, exp_order, switch_bits;                \
         unsigned int q, buf, bits;                                      \
                                                                         \
-        UPDATE_CACHE(re, gb);                                           \
-        buf = GET_CACHE(re, gb);                                        \
+        buf = show_bits(gb, 14);                                        \
                                                                         \
         /* number of bits to switch between rice and exp golomb */      \
         switch_bits =  codebook & 3;                                    \
         rice_order  =  codebook >> 5;                                   \
         exp_order   = (codebook >> 2) & 7;                              \
                                                                         \
-        q = 31 - av_log2(buf);                                          \
+        q = 13 - av_log2(buf);                                          \
                                                                         \
         if (q > switch_bits) { /* exp golomb */                         \
             bits = exp_order - switch_bits + (q<<1);                    \
-            if (bits > FFMIN(MIN_CACHE_BITS, 31))                       \
-                return AVERROR_INVALIDDATA;                             \
-            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+            val = READ_BITS(gb, bits) - (1 << exp_order) +              \
                 ((switch_bits + 1) << rice_order);                      \
-            SKIP(re, gb, bits);                                         \
         } else if (rice_order) {                                        \
-            SKIP_BITS(re, gb, q+1);                                     \
-            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
-            SKIP(re, gb, rice_order);                                   \
+            skip_remaining(gb, q+1);                                    \
+            val = (q << rice_order) + get_bits(gb, rice_order);         \
         } else {                                                        \
             val = q;                                                    \
-            SKIP(re, gb, q+1);                                          \
+            skip_remaining(gb, q+1);                                    \
         }                                                               \
     } while (0)
 
@@ -466,9 +466,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     int16_t prev_dc;
     int code, i, sign;
 
-    OPEN_READER(re, gb);
-
-    DECODE_CODEWORD(code, FIRST_DC_CB, LAST_SKIP_BITS);
+    DECODE_CODEWORD(code, FIRST_DC_CB);
     prev_dc = TOSIGNED(code);
     out[0] = prev_dc;
 
@@ -477,13 +475,12 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     code = 5;
     sign = 0;
     for (i = 1; i < blocks_per_slice; i++, out += 64) {
-        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)], LAST_SKIP_BITS);
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
         if(code) sign ^= -(code & 1);
         else     sign  = 0;
         prev_dc += (((code + 1) >> 1) ^ sign) - sign;
         out[0] = prev_dc;
     }
-    CLOSE_READER(re, gb);
     return 0;
 }
 
@@ -497,11 +494,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     const ProresContext *ctx = avctx->priv_data;
     int block_mask, sign;
     unsigned pos, run, level;
-    int max_coeffs, i, bits_left;
+    int max_coeffs, i, bits_rem;
     int log2_block_count = av_log2(blocks_per_slice);
 
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);                                           \
     run   = 4;
     level = 2;
 
@@ -509,28 +504,26 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
-        bits_left = gb->size_in_bits - re_index;
-        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
+        bits_rem = get_bits_left(gb);
+        if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
             break;
 
-        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)], LAST_SKIP_BITS);
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
         pos += run + 1;
         if (pos >= max_coeffs) {
             av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
             return AVERROR_INVALIDDATA;
         }
 
-        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)], SKIP_BITS);
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
         level += 1;
 
         i = pos >> log2_block_count;
 
-        sign = SHOW_SBITS(re, gb, 1);
-        SKIP_BITS(re, gb, 1);
+        sign = -get_bits1(gb);
         out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
     }
 
-    CLOSE_READER(re, gb);
     return 0;
 }
 
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  8:39   ` Andreas Rheinhardt
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch Christophe Gisquet
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

Having the various orders and offsets stored in a codebook is compact
but causes additional computations. Using instead a table for the
precomputed results achieve some speedups at the cost of ~132 bytes.

Around 5% speedup.
---
 libavcodec/proresdec2.c | 54 +++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 6e243cfc17..65e8b01755 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -427,6 +427,7 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
 # define READ_BITS get_bits
 #endif
 
+/* Kept for reference and because clearer for first DC */
 #define DECODE_CODEWORD(val, codebook)                                  \
     do {                                                                \
         unsigned int rice_order, exp_order, switch_bits;                \
@@ -454,18 +455,41 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
         }                                                               \
     } while (0)
 
+/* number of bits to switch between rice and exp golomb */
+#define DECODE_CODEWORD2(val, switch_bits, rice_order, diff, offset)    \
+    do {                                                                \
+        unsigned int q, buf, bits;                                      \
+                                                                        \
+        buf = show_bits(gb, 14);                                        \
+        q = 13 - av_log2(buf);                                          \
+                                                                        \
+        if (q > switch_bits) { /* exp golomb */                         \
+            bits = (q<<1) + (int)diff;                                  \
+            val = READ_BITS(gb, bits) + (int)offset;                    \
+        } else if (rice_order) {                                        \
+            skip_remaining(gb, q+1);                                    \
+            val = (q << rice_order) + get_bits(gb, rice_order);         \
+        } else {                                                        \
+            val = q;                                                    \
+            skip_remaining(gb, q+1);                                    \
+        }                                                               \
+    } while (0)
+
+
 #define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1)))
 
 #define FIRST_DC_CB 0xB8
 
-static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+static const char dc_codebook[7][4] = {
+    { 0, 0, 1, -1 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
+    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }, { 0, 3, 4, -8 }
+};
 
 static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
                                               int blocks_per_slice)
 {
     int16_t prev_dc;
     int code, i, sign;
-
     DECODE_CODEWORD(code, FIRST_DC_CB);
     prev_dc = TOSIGNED(code);
     out[0] = prev_dc;
@@ -475,7 +499,9 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     code = 5;
     sign = 0;
     for (i = 1; i < blocks_per_slice; i++, out += 64) {
-        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
+        unsigned int dccb = FFMIN(code, 6U);
+        DECODE_CODEWORD2(code, dc_codebook[dccb][0], dc_codebook[dccb][1],
+                               dc_codebook[dccb][2], dc_codebook[dccb][3]);
         if(code) sign ^= -(code & 1);
         else     sign  = 0;
         prev_dc += (((code + 1) >> 1) ^ sign) - sign;
@@ -485,8 +511,18 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
 }
 
 // adaptive codebook switching lut according to previous run/level values
-static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
-static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const char run_to_cb[16][4] = {
+    { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
+    { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
+    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
+    { 0, 2,  3, -4 }
+};
+
+static const char lev_to_cb[10][4] = {
+    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
+    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
+    { 0, 2,  3, -4 }
+};
 
 static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
                                              int16_t *out, int blocks_per_slice)
@@ -504,18 +540,22 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
+        unsigned int runcb = FFMIN(run,  15);
+        unsigned int levcb = FFMIN(level, 9);
         bits_rem = get_bits_left(gb);
         if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
             break;
 
-        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
+        DECODE_CODEWORD2(run, run_to_cb[runcb][0], run_to_cb[runcb][1],
+                              run_to_cb[runcb][2], run_to_cb[runcb][3]);
         pos += run + 1;
         if (pos >= max_coeffs) {
             av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
             return AVERROR_INVALIDDATA;
         }
 
-        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
+        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
+                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
         level += 1;
 
         i = pos >> log2_block_count;
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  8:44   ` Andreas Rheinhardt
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 4/7] proresdec2: offset VLCs by 1 to avoid 1 add Christophe Gisquet
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

x86/x64: 61/52 -> 55/46
Around 7-10% speedup.

Run and DC do not lend themselves to such changes, likely because
their distribution is less skewed, and need larger average vlc read
iterations.
---
 libavcodec/proresdec.h  |  1 +
 libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
index 1e48752e6f..7ebacaeb21 100644
--- a/libavcodec/proresdec.h
+++ b/libavcodec/proresdec.h
@@ -22,6 +22,7 @@
 #ifndef AVCODEC_PRORESDEC_H
 #define AVCODEC_PRORESDEC_H
 
+#define CACHED_BITSTREAM_READER 1
 #include "get_bits.h"
 #include "blockdsp.h"
 #include "proresdsp.h"
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 65e8b01755..91c689d9ef 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -24,17 +24,17 @@
  * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
  */
 
-#define CACHED_BITSTREAM_READER 1
+//#define DEBUG
 
 #include "config_components.h"
 
 #include "libavutil/internal.h"
 #include "libavutil/mem_internal.h"
+#include "libavutil/thread.h"
 
 #include "avcodec.h"
 #include "codec_internal.h"
 #include "decode.h"
-#include "get_bits.h"
 #include "hwaccel_internal.h"
 #include "hwconfig.h"
 #include "idctdsp.h"
@@ -129,8 +129,64 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
     }
 }
 
+#define AC_BITS 12
+#define PRORES_LEV_BITS 9
+
+static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
+static VLC ac_vlc[6];
+
+static av_cold void init_vlcs(void)
+{
+    int i;
+    for (i = 0; i < sizeof(ac_info); i++) {
+        uint32_t ac_codes[1<<AC_BITS];
+        uint8_t ac_bits[1<<AC_BITS];
+        unsigned int rice_order, exp_order, switch_bits, switch_val;
+        int ac, max_bits = 0, codebook = ac_info[i];
+
+        /* number of prefix bits to switch between Rice and expGolomb */
+        switch_bits = (codebook & 3);
+        rice_order  =  codebook >> 5;       /* rice code order */
+        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
+
+        switch_val  = (switch_bits+1) << rice_order;
+
+        // Values are actually transformed, but this is more a wrapping
+        for (ac = 0; ac <1<<AC_BITS; ac++) {
+            int exponent, bits, val = ac;
+            unsigned int code;
+
+            if (val >= switch_val) {
+                val += (1 << exp_order) - switch_val;
+                exponent = av_log2(val);
+                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
+                code = val;
+            } else if (rice_order) {
+                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
+                code = (1 << rice_order) | val;
+            } else {
+                bits = val/*0*/ + 1/*1*/;
+                code = 1;
+            }
+            if (bits > max_bits) max_bits = bits;
+            ac_bits [ac] = bits;
+            ac_codes[ac] = code;
+        }
+
+        ff_free_vlc(ac_vlc+i);
+
+        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
+                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
+            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
+                   i, codebook, max_bits);
+            break; //return AVERROR_BUG;
+        }
+    }
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     int ret = 0;
     ProresContext *ctx = avctx->priv_data;
     uint8_t idct_permutation[64];
@@ -184,6 +240,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ctx->pix_fmt = AV_PIX_FMT_NONE;
 
+    // init dc_tables
+    ff_thread_once(&init_static_once, init_vlcs);
+
     if (avctx->bits_per_raw_sample == 10){
         ctx->unpack_alpha = unpack_alpha_10;
     } else if (avctx->bits_per_raw_sample == 12){
@@ -510,7 +569,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     return 0;
 }
 
-// adaptive codebook switching lut according to previous run/level values
+// adaptive codebook switching lut according to previous run values
 static const char run_to_cb[16][4] = {
     { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
     { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
@@ -518,12 +577,6 @@ static const char run_to_cb[16][4] = {
     { 0, 2,  3, -4 }
 };
 
-static const char lev_to_cb[10][4] = {
-    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
-    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
-    { 0, 2,  3, -4 }
-};
-
 static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
                                              int16_t *out, int blocks_per_slice)
 {
@@ -540,8 +593,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
+        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
+        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
         unsigned int runcb = FFMIN(run,  15);
-        unsigned int levcb = FFMIN(level, 9);
         bits_rem = get_bits_left(gb);
         if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
             break;
@@ -554,8 +608,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
             return AVERROR_INVALIDDATA;
         }
 
-        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
-                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
+        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
         level += 1;
 
         i = pos >> log2_block_count;
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 4/7] proresdec2: offset VLCs by 1 to avoid 1 add
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 5/7] proresdec2: use VLC for small runs and levels Christophe Gisquet
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

Pretty harmless, but not much gained either.
---
 libavcodec/proresdec2.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 91c689d9ef..e3cef402d7 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -152,7 +152,9 @@ static av_cold void init_vlcs(void)
         switch_val  = (switch_bits+1) << rice_order;
 
         // Values are actually transformed, but this is more a wrapping
-        for (ac = 0; ac <1<<AC_BITS; ac++) {
+        ac_codes[0] = 0;
+        ac_bits[0] = 0;
+        for (ac = 0; ac < (1<<AC_BITS)-1; ac++) {
             int exponent, bits, val = ac;
             unsigned int code;
 
@@ -169,8 +171,8 @@ static av_cold void init_vlcs(void)
                 code = 1;
             }
             if (bits > max_bits) max_bits = bits;
-            ac_bits [ac] = bits;
-            ac_codes[ac] = code;
+            ac_bits [ac+1] = bits;
+            ac_codes[ac+1] = code;
         }
 
         ff_free_vlc(ac_vlc+i);
@@ -609,7 +611,6 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
         }
 
         level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
-        level += 1;
 
         i = pos >> log2_block_count;
 
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 5/7] proresdec2: use VLC for small runs and levels
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (2 preceding siblings ...)
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 4/7] proresdec2: offset VLCs by 1 to avoid 1 add Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry Christophe Gisquet
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

Basically, the catch-all codebook is for on average long codewords,
and with a distribution such that the 3-step VLC reading is not
efficient. Furthermore, the complete unrolling make the actual code
smaller than the macro, and as the maximum codelength is smaller,
smaller amounts of bits, optimized for run and for level, can be read.
---
 libavcodec/proresdec2.c | 53 +++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index e3cef402d7..02e1d82d00 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -132,7 +132,7 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
 #define AC_BITS 12
 #define PRORES_LEV_BITS 9
 
-static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
+static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x29 };
 static VLC ac_vlc[6];
 
 static av_cold void init_vlcs(void)
@@ -152,9 +152,7 @@ static av_cold void init_vlcs(void)
         switch_val  = (switch_bits+1) << rice_order;
 
         // Values are actually transformed, but this is more a wrapping
-        ac_codes[0] = 0;
-        ac_bits[0] = 0;
-        for (ac = 0; ac < (1<<AC_BITS)-1; ac++) {
+        for (ac = 0; ac <1<<AC_BITS; ac++) {
             int exponent, bits, val = ac;
             unsigned int code;
 
@@ -171,8 +169,8 @@ static av_cold void init_vlcs(void)
                 code = 1;
             }
             if (bits > max_bits) max_bits = bits;
-            ac_bits [ac+1] = bits;
-            ac_codes[ac+1] = code;
+            ac_bits [ac] = bits;
+            ac_codes[ac] = code;
         }
 
         ff_free_vlc(ac_vlc+i);
@@ -507,12 +505,9 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
             bits = exp_order - switch_bits + (q<<1);                    \
             val = READ_BITS(gb, bits) - (1 << exp_order) +              \
                 ((switch_bits + 1) << rice_order);                      \
-        } else if (rice_order) {                                        \
-            skip_remaining(gb, q+1);                                    \
-            val = (q << rice_order) + get_bits(gb, rice_order);         \
         } else {                                                        \
-            val = q;                                                    \
             skip_remaining(gb, q+1);                                    \
+            val = rice_order ? (q << rice_order) + get_bits(gb, rice_order) : q;\
         }                                                               \
     } while (0)
 
@@ -527,12 +522,10 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
         if (q > switch_bits) { /* exp golomb */                         \
             bits = (q<<1) + (int)diff;                                  \
             val = READ_BITS(gb, bits) + (int)offset;                    \
-        } else if (rice_order) {                                        \
-            skip_remaining(gb, q+1);                                    \
-            val = (q << rice_order) + get_bits(gb, rice_order);         \
         } else {                                                        \
-            val = q;                                                    \
             skip_remaining(gb, q+1);                                    \
+            val = rice_order ? (q << rice_order) + show_bits(gb, rice_order) : q;   \
+            skip_remaining(gb, rice_order);                             \
         }                                                               \
     } while (0)
 
@@ -571,14 +564,6 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     return 0;
 }
 
-// adaptive codebook switching lut according to previous run values
-static const char run_to_cb[16][4] = {
-    { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
-    { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
-    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
-    { 0, 2,  3, -4 }
-};
-
 static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
                                              int16_t *out, int blocks_per_slice)
 {
@@ -595,22 +580,32 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
-        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
-        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
-        unsigned int runcb = FFMIN(run,  15);
         bits_rem = get_bits_left(gb);
-        if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
+        if (!bits_rem || (bits_rem < 14 && !show_bits(gb, bits_rem)))
             break;
 
-        DECODE_CODEWORD2(run, run_to_cb[runcb][0], run_to_cb[runcb][1],
-                              run_to_cb[runcb][2], run_to_cb[runcb][3]);
+        if (run < 15) {
+            static const uint8_t ctx_to_tbl[] = { 3, 3, 2, 2, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4 };
+            const VLC* tbl = ac_vlc + ctx_to_tbl[run];
+            run = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
+        } else {
+            unsigned int bits = 21 - 2*av_log2(show_bits(gb, 10));
+            run = READ_BITS(gb, bits) - 4; // up to 17 bits
+        }
         pos += run + 1;
         if (pos >= max_coeffs) {
             av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
             return AVERROR_INVALIDDATA;
         }
 
-        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
+        if (level < 9) {
+            static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4 };
+            const VLC* tbl = ac_vlc + ctx_to_tbl[level];
+            level = 1+get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
+        } else {
+            unsigned int bits = 25 - 2*av_log2(show_bits(gb, 12));
+            level = READ_BITS(gb, bits) - 4 + 1; // up to 21 bits
+        }
 
         i = pos >> log2_block_count;
 
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (3 preceding siblings ...)
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 5/7] proresdec2: use VLC for small runs and levels Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  9:08   ` Andreas Rheinhardt
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs Christophe Gisquet
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libavcodec/proresdec2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 02e1d82d00..b20021c622 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -534,9 +534,9 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
 
 #define FIRST_DC_CB 0xB8
 
-static const char dc_codebook[7][4] = {
+static const char dc_codebook[6][4] = {
     { 0, 0, 1, -1 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
-    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }, { 0, 3, 4, -8 }
+    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }
 };
 
 static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
@@ -553,7 +553,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     code = 5;
     sign = 0;
     for (i = 1; i < blocks_per_slice; i++, out += 64) {
-        unsigned int dccb = FFMIN(code, 6U);
+        unsigned int dccb = FFMIN(code, 5U);
         DECODE_CODEWORD2(code, dc_codebook[dccb][0], dc_codebook[dccb][1],
                                dc_codebook[dccb][2], dc_codebook[dccb][3]);
         if(code) sign ^= -(code & 1);
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (4 preceding siblings ...)
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry Christophe Gisquet
@ 2023-09-08  8:15 ` Christophe Gisquet
  2023-09-08  9:20   ` Andreas Rheinhardt
  2023-09-08  8:20 ` [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:15 UTC (permalink / raw)
  To: ffmpeg-devel

One indirection less, around 1% speedup.
---
 libavcodec/proresdec2.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index b20021c622..85f81d92d3 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -561,12 +561,18 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
         prev_dc += (((code + 1) >> 1) ^ sign) - sign;
         out[0] = prev_dc;
     }
-    return 0;
+    return 0;	
 }
 
+#include "libavutil/timer.h"
+
+
 static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
                                              int16_t *out, int blocks_per_slice)
 {
+	static VLC* lvl_vlc[9] = { &ac_vlc[0], &ac_vlc[1], &ac_vlc[2], &ac_vlc[3], &ac_vlc[0], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], };
+	static VLC* run_vlc[15] = { &ac_vlc[3], &ac_vlc[3], &ac_vlc[2], &ac_vlc[2], &ac_vlc[0], &ac_vlc[5], &ac_vlc[5], &ac_vlc[5], &ac_vlc[5],
+	                            &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], };
     const ProresContext *ctx = avctx->priv_data;
     int block_mask, sign;
     unsigned pos, run, level;
@@ -585,9 +591,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
             break;
 
         if (run < 15) {
-            static const uint8_t ctx_to_tbl[] = { 3, 3, 2, 2, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4 };
-            const VLC* tbl = ac_vlc + ctx_to_tbl[run];
-            run = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
+            run = get_vlc2(gb, run_vlc[run]->table, PRORES_LEV_BITS, 3);
         } else {
             unsigned int bits = 21 - 2*av_log2(show_bits(gb, 10));
             run = READ_BITS(gb, bits) - 4; // up to 17 bits
@@ -599,9 +603,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
         }
 
         if (level < 9) {
-            static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4 };
-            const VLC* tbl = ac_vlc + ctx_to_tbl[level];
-            level = 1+get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
+            level = 1+get_vlc2(gb, lvl_vlc[level]->table, PRORES_LEV_BITS, 3);
         } else {
             unsigned int bits = 25 - 2*av_log2(show_bits(gb, 12));
             level = READ_BITS(gb, bits) - 4 + 1; // up to 21 bits
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (5 preceding siblings ...)
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs Christophe Gisquet
@ 2023-09-08  8:20 ` Christophe Gisquet
  2023-09-08  8:30   ` Andreas Rheinhardt
  2023-09-11 20:54   ` Christophe Gisquet
  2023-09-08  8:36 ` Andreas Rheinhardt
  2023-09-08 21:00 ` Michael Niedermayer
  8 siblings, 2 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  8:20 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
<christophe.gisquet@gmail.com> a écrit :
>
> Summary of changes

git send-email --cover-letter apparently didn't let me edit one, so here goes.

This patchset requires my previous one improving the cached bitstream
reader, and serves as its justification. It, basically, moves to using
VLC wherever possible, and in particular when codewords are
sufficiently short/there's some kind of well-behaved laplacian
distribution for codewords that make VLCs efficient.

Total speedup is around 40% here.

-- 
Christophe
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:20 ` [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
@ 2023-09-08  8:30   ` Andreas Rheinhardt
  2023-09-08  8:34     ` Andreas Rheinhardt
  2023-09-11 20:54   ` Christophe Gisquet
  1 sibling, 1 reply; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  8:30 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
> <christophe.gisquet@gmail.com> a écrit :
>>
>> Summary of changes
> 
> git send-email --cover-letter apparently didn't let me edit one, so here goes.
> 

Use --compose.

> This patchset requires my previous one improving the cached bitstream
> reader, and serves as its justification. It, basically, moves to using
> VLC wherever possible, and in particular when codewords are
> sufficiently short/there's some kind of well-behaved laplacian
> distribution for codewords that make VLCs efficient.
> 
> Total speedup is around 40% here.
> 

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:30   ` Andreas Rheinhardt
@ 2023-09-08  8:34     ` Andreas Rheinhardt
  0 siblings, 0 replies; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  8:34 UTC (permalink / raw)
  To: ffmpeg-devel

Andreas Rheinhardt:
> Christophe Gisquet:
>> Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
>> <christophe.gisquet@gmail.com> a écrit :
>>>
>>> Summary of changes
>>
>> git send-email --cover-letter apparently didn't let me edit one, so here goes.
>>
> 
> Use --compose.

Or even better: --cover-letter --annotate
That way you get a default cover-letter to annotate

> 
>> This patchset requires my previous one improving the cached bitstream
>> reader, and serves as its justification. It, basically, moves to using
>> VLC wherever possible, and in particular when codewords are
>> sufficiently short/there's some kind of well-behaved laplacian
>> distribution for codewords that make VLCs efficient.
>>
>> Total speedup is around 40% here.
>>

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (6 preceding siblings ...)
  2023-09-08  8:20 ` [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
@ 2023-09-08  8:36 ` Andreas Rheinhardt
  2023-09-08 21:00 ` Michael Niedermayer
  8 siblings, 0 replies; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  8:36 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> Summary of changes
> - move back to regular, non-macro, get_bits API
> - reduce the lookup to switch the coding method
> - shorter reads wherever possible, in particular for the end of bitstream
>   (16 bits instead of 32, as per the above)
> 
> There are cases that really need longer lengths (larger EG codes) of up
> to 27 bits.
> 
> Win64: 6.10 -> 4.87 (~20% speedup)
> 
> Reference for an hypothetical 32bits version of the cached reader:
> Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
> ---

The commit message claims to fix something; what does it fix?
Also, changing to the non-macro API should be done in a separate commit
to the one changing the type of bitstream reader.

Furthermore, you should also provide information about the code size
impact when switching the type of reader.

>  libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
>  1 file changed, 23 insertions(+), 30 deletions(-)
> 
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 9297860946..6e243cfc17 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -24,9 +24,7 @@
>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>   */
>  
> -//#define DEBUG
> -
> -#define LONG_BITSTREAM_READER
> +#define CACHED_BITSTREAM_READER 1
>  
>  #include "config_components.h"
>  
> @@ -422,35 +420,37 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
>      return pic_data_size;
>  }
>  
> -#define DECODE_CODEWORD(val, codebook, SKIP)                            \
> +/* bitstream_read may fail on 32bits ARCHS for >24 bits, so use long version there */
> +#if 0 //BITSTREAM_BITS == 32
> +# define READ_BITS get_bits_long
> +#else
> +# define READ_BITS get_bits
> +#endif
> +
> +#define DECODE_CODEWORD(val, codebook)                                  \
>      do {                                                                \
>          unsigned int rice_order, exp_order, switch_bits;                \
>          unsigned int q, buf, bits;                                      \
>                                                                          \
> -        UPDATE_CACHE(re, gb);                                           \
> -        buf = GET_CACHE(re, gb);                                        \
> +        buf = show_bits(gb, 14);                                        \
>                                                                          \
>          /* number of bits to switch between rice and exp golomb */      \
>          switch_bits =  codebook & 3;                                    \
>          rice_order  =  codebook >> 5;                                   \
>          exp_order   = (codebook >> 2) & 7;                              \
>                                                                          \
> -        q = 31 - av_log2(buf);                                          \
> +        q = 13 - av_log2(buf);                                          \
>                                                                          \
>          if (q > switch_bits) { /* exp golomb */                         \
>              bits = exp_order - switch_bits + (q<<1);                    \
> -            if (bits > FFMIN(MIN_CACHE_BITS, 31))                       \
> -                return AVERROR_INVALIDDATA;                             \
> -            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
> +            val = READ_BITS(gb, bits) - (1 << exp_order) +              \
>                  ((switch_bits + 1) << rice_order);                      \
> -            SKIP(re, gb, bits);                                         \
>          } else if (rice_order) {                                        \
> -            SKIP_BITS(re, gb, q+1);                                     \
> -            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
> -            SKIP(re, gb, rice_order);                                   \
> +            skip_remaining(gb, q+1);                                    \
> +            val = (q << rice_order) + get_bits(gb, rice_order);         \
>          } else {                                                        \
>              val = q;                                                    \
> -            SKIP(re, gb, q+1);                                          \
> +            skip_remaining(gb, q+1);                                    \
>          }                                                               \
>      } while (0)
>  
> @@ -466,9 +466,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      int16_t prev_dc;
>      int code, i, sign;
>  
> -    OPEN_READER(re, gb);
> -
> -    DECODE_CODEWORD(code, FIRST_DC_CB, LAST_SKIP_BITS);
> +    DECODE_CODEWORD(code, FIRST_DC_CB);
>      prev_dc = TOSIGNED(code);
>      out[0] = prev_dc;
>  
> @@ -477,13 +475,12 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      code = 5;
>      sign = 0;
>      for (i = 1; i < blocks_per_slice; i++, out += 64) {
> -        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)], LAST_SKIP_BITS);
> +        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
>          if(code) sign ^= -(code & 1);
>          else     sign  = 0;
>          prev_dc += (((code + 1) >> 1) ^ sign) - sign;
>          out[0] = prev_dc;
>      }
> -    CLOSE_READER(re, gb);
>      return 0;
>  }
>  
> @@ -497,11 +494,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      const ProresContext *ctx = avctx->priv_data;
>      int block_mask, sign;
>      unsigned pos, run, level;
> -    int max_coeffs, i, bits_left;
> +    int max_coeffs, i, bits_rem;
>      int log2_block_count = av_log2(blocks_per_slice);
>  
> -    OPEN_READER(re, gb);
> -    UPDATE_CACHE(re, gb);                                           \
>      run   = 4;
>      level = 2;
>  
> @@ -509,28 +504,26 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      block_mask = blocks_per_slice - 1;
>  
>      for (pos = block_mask;;) {
> -        bits_left = gb->size_in_bits - re_index;
> -        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
> +        bits_rem = get_bits_left(gb);
> +        if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>              break;
>  
> -        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)], LAST_SKIP_BITS);
> +        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
>          pos += run + 1;
>          if (pos >= max_coeffs) {
>              av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
>              return AVERROR_INVALIDDATA;
>          }
>  
> -        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)], SKIP_BITS);
> +        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
>          level += 1;
>  
>          i = pos >> log2_block_count;
>  
> -        sign = SHOW_SBITS(re, gb, 1);
> -        SKIP_BITS(re, gb, 1);
> +        sign = -get_bits1(gb);
>          out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
>      }
>  
> -    CLOSE_READER(re, gb);
>      return 0;
>  }
>  

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters Christophe Gisquet
@ 2023-09-08  8:39   ` Andreas Rheinhardt
  0 siblings, 0 replies; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  8:39 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> Having the various orders and offsets stored in a codebook is compact
> but causes additional computations. Using instead a table for the
> precomputed results achieve some speedups at the cost of ~132 bytes.
> 
> Around 5% speedup.
> ---
>  libavcodec/proresdec2.c | 54 +++++++++++++++++++++++++++++++++++------
>  1 file changed, 47 insertions(+), 7 deletions(-)
> 
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 6e243cfc17..65e8b01755 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -427,6 +427,7 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
>  # define READ_BITS get_bits
>  #endif
>  
> +/* Kept for reference and because clearer for first DC */
>  #define DECODE_CODEWORD(val, codebook)                                  \
>      do {                                                                \
>          unsigned int rice_order, exp_order, switch_bits;                \
> @@ -454,18 +455,41 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
>          }                                                               \
>      } while (0)
>  
> +/* number of bits to switch between rice and exp golomb */
> +#define DECODE_CODEWORD2(val, switch_bits, rice_order, diff, offset)    \
> +    do {                                                                \
> +        unsigned int q, buf, bits;                                      \
> +                                                                        \
> +        buf = show_bits(gb, 14);                                        \
> +        q = 13 - av_log2(buf);                                          \
> +                                                                        \
> +        if (q > switch_bits) { /* exp golomb */                         \
> +            bits = (q<<1) + (int)diff;                                  \
> +            val = READ_BITS(gb, bits) + (int)offset;                    \
> +        } else if (rice_order) {                                        \
> +            skip_remaining(gb, q+1);                                    \
> +            val = (q << rice_order) + get_bits(gb, rice_order);         \
> +        } else {                                                        \
> +            val = q;                                                    \
> +            skip_remaining(gb, q+1);                                    \
> +        }                                                               \
> +    } while (0)
> +
> +
>  #define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1)))
>  
>  #define FIRST_DC_CB 0xB8
>  
> -static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
> +static const char dc_codebook[7][4] = {
> +    { 0, 0, 1, -1 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
> +    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }, { 0, 3, 4, -8 }
> +};
>  
>  static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>                                                int blocks_per_slice)
>  {
>      int16_t prev_dc;
>      int code, i, sign;
> -
>      DECODE_CODEWORD(code, FIRST_DC_CB);
>      prev_dc = TOSIGNED(code);
>      out[0] = prev_dc;
> @@ -475,7 +499,9 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      code = 5;
>      sign = 0;
>      for (i = 1; i < blocks_per_slice; i++, out += 64) {
> -        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
> +        unsigned int dccb = FFMIN(code, 6U);
> +        DECODE_CODEWORD2(code, dc_codebook[dccb][0], dc_codebook[dccb][1],
> +                               dc_codebook[dccb][2], dc_codebook[dccb][3]);
>          if(code) sign ^= -(code & 1);
>          else     sign  = 0;
>          prev_dc += (((code + 1) >> 1) ^ sign) - sign;
> @@ -485,8 +511,18 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>  }
>  
>  // adaptive codebook switching lut according to previous run/level values
> -static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
> -static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
> +static const char run_to_cb[16][4] = {
> +    { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
> +    { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
> +    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
> +    { 0, 2,  3, -4 }
> +};
> +
> +static const char lev_to_cb[10][4] = {

You seem to require signed chars here, but this is not a given on many
platforms. Better use int8_t.

> +    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
> +    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
> +    { 0, 2,  3, -4 }
> +};
>  
>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>                                               int16_t *out, int blocks_per_slice)
> @@ -504,18 +540,22 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      block_mask = blocks_per_slice - 1;
>  
>      for (pos = block_mask;;) {
> +        unsigned int runcb = FFMIN(run,  15);
> +        unsigned int levcb = FFMIN(level, 9);
>          bits_rem = get_bits_left(gb);
>          if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>              break;
>  
> -        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
> +        DECODE_CODEWORD2(run, run_to_cb[runcb][0], run_to_cb[runcb][1],
> +                              run_to_cb[runcb][2], run_to_cb[runcb][3]);
>          pos += run + 1;
>          if (pos >= max_coeffs) {
>              av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
>              return AVERROR_INVALIDDATA;
>          }
>  
> -        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
> +        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
> +                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
>          level += 1;
>  
>          i = pos >> log2_block_count;

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch Christophe Gisquet
@ 2023-09-08  8:44   ` Andreas Rheinhardt
  2023-09-08  9:58     ` Andreas Rheinhardt
  0 siblings, 1 reply; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  8:44 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> x86/x64: 61/52 -> 55/46
> Around 7-10% speedup.
> 
> Run and DC do not lend themselves to such changes, likely because
> their distribution is less skewed, and need larger average vlc read
> iterations.
> ---
>  libavcodec/proresdec.h  |  1 +
>  libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
>  2 files changed, 66 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
> index 1e48752e6f..7ebacaeb21 100644
> --- a/libavcodec/proresdec.h
> +++ b/libavcodec/proresdec.h
> @@ -22,6 +22,7 @@
>  #ifndef AVCODEC_PRORESDEC_H
>  #define AVCODEC_PRORESDEC_H
>  
> +#define CACHED_BITSTREAM_READER 1

This should be in the commit switching to the cached bitstream reader.

>  #include "get_bits.h"
>  #include "blockdsp.h"
>  #include "proresdsp.h"
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 65e8b01755..91c689d9ef 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -24,17 +24,17 @@
>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>   */
>  
> -#define CACHED_BITSTREAM_READER 1
> +//#define DEBUG
>  
>  #include "config_components.h"
>  
>  #include "libavutil/internal.h"
>  #include "libavutil/mem_internal.h"
> +#include "libavutil/thread.h"
>  
>  #include "avcodec.h"
>  #include "codec_internal.h"
>  #include "decode.h"
> -#include "get_bits.h"
>  #include "hwaccel_internal.h"
>  #include "hwconfig.h"
>  #include "idctdsp.h"
> @@ -129,8 +129,64 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
>      }
>  }
>  
> +#define AC_BITS 12
> +#define PRORES_LEV_BITS 9
> +
> +static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
> +static VLC ac_vlc[6];
> +
> +static av_cold void init_vlcs(void)
> +{
> +    int i;
> +    for (i = 0; i < sizeof(ac_info); i++) {

FF_ARRAY_ELEMS() is cleaner; also we support and prefer declarations
inside for-loops: for (int i = 0;

> +        uint32_t ac_codes[1<<AC_BITS];
> +        uint8_t ac_bits[1<<AC_BITS];
> +        unsigned int rice_order, exp_order, switch_bits, switch_val;
> +        int ac, max_bits = 0, codebook = ac_info[i];
> +
> +        /* number of prefix bits to switch between Rice and expGolomb */
> +        switch_bits = (codebook & 3);
> +        rice_order  =  codebook >> 5;       /* rice code order */
> +        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
> +
> +        switch_val  = (switch_bits+1) << rice_order;
> +
> +        // Values are actually transformed, but this is more a wrapping
> +        for (ac = 0; ac <1<<AC_BITS; ac++) {
> +            int exponent, bits, val = ac;
> +            unsigned int code;
> +
> +            if (val >= switch_val) {
> +                val += (1 << exp_order) - switch_val;
> +                exponent = av_log2(val);
> +                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
> +                code = val;
> +            } else if (rice_order) {
> +                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
> +                code = (1 << rice_order) | val;
> +            } else {
> +                bits = val/*0*/ + 1/*1*/;
> +                code = 1;
> +            }
> +            if (bits > max_bits) max_bits = bits;
> +            ac_bits [ac] = bits;
> +            ac_codes[ac] = code;
> +        }
> +
> +        ff_free_vlc(ac_vlc+i);

This is unnecessary, as the VLC is initially blank and is not
initialized multiple times.

> +
> +        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
> +                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
> +            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
> +                   i, codebook, max_bits);
> +            break; //return AVERROR_BUG;

This is not how you initialize a static table (you miss the
INIT_VLC_USE_NEW_STATIC flag and don't set the static store buffer).
Search for INIT_VLC_STATIC_OVERLONG for an idea of how to do it.

> +        }
> +    }
> +}
> +
>  static av_cold int decode_init(AVCodecContext *avctx)
>  {
> +    static AVOnce init_static_once = AV_ONCE_INIT;
>      int ret = 0;
>      ProresContext *ctx = avctx->priv_data;
>      uint8_t idct_permutation[64];
> @@ -184,6 +240,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  
>      ctx->pix_fmt = AV_PIX_FMT_NONE;
>  
> +    // init dc_tables
> +    ff_thread_once(&init_static_once, init_vlcs);
> +
>      if (avctx->bits_per_raw_sample == 10){
>          ctx->unpack_alpha = unpack_alpha_10;
>      } else if (avctx->bits_per_raw_sample == 12){
> @@ -510,7 +569,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      return 0;
>  }
>  
> -// adaptive codebook switching lut according to previous run/level values
> +// adaptive codebook switching lut according to previous run values
>  static const char run_to_cb[16][4] = {
>      { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
>      { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
> @@ -518,12 +577,6 @@ static const char run_to_cb[16][4] = {
>      { 0, 2,  3, -4 }
>  };
>  
> -static const char lev_to_cb[10][4] = {
> -    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
> -    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
> -    { 0, 2,  3, -4 }
> -};
> -
>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>                                               int16_t *out, int blocks_per_slice)
>  {
> @@ -540,8 +593,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      block_mask = blocks_per_slice - 1;
>  
>      for (pos = block_mask;;) {
> +        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
> +        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
>          unsigned int runcb = FFMIN(run,  15);
> -        unsigned int levcb = FFMIN(level, 9);
>          bits_rem = get_bits_left(gb);
>          if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>              break;
> @@ -554,8 +608,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>              return AVERROR_INVALIDDATA;
>          }
>  
> -        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
> -                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
> +        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
>          level += 1;
>  
>          i = pos >> log2_block_count;

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry Christophe Gisquet
@ 2023-09-08  9:08   ` Andreas Rheinhardt
  0 siblings, 0 replies; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  9:08 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> ---
>  libavcodec/proresdec2.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 02e1d82d00..b20021c622 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -534,9 +534,9 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
>  
>  #define FIRST_DC_CB 0xB8
>  
> -static const char dc_codebook[7][4] = {
> +static const char dc_codebook[6][4] = {

You would not need this change if you omitted the length.

>      { 0, 0, 1, -1 }, { 0, 1, 2, -2 }, { 0, 1, 2, -2 },
> -    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }, { 0, 3, 4, -8 }
> +    { 1, 2, 2,  0 }, { 1, 2, 2,  0 }, { 0, 3, 4, -8 }
>  };
>  
>  static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
> @@ -553,7 +553,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      code = 5;
>      sign = 0;
>      for (i = 1; i < blocks_per_slice; i++, out += 64) {
> -        unsigned int dccb = FFMIN(code, 6U);
> +        unsigned int dccb = FFMIN(code, 5U);

You wouldn't need this change if you used FF_ARRAY_ELEMS(dc_codebook) -
1 here.

Btw: Why is this codebook entry useless? Can code never be 6?

>          DECODE_CODEWORD2(code, dc_codebook[dccb][0], dc_codebook[dccb][1],
>                                 dc_codebook[dccb][2], dc_codebook[dccb][3]);
>          if(code) sign ^= -(code & 1);

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs
  2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs Christophe Gisquet
@ 2023-09-08  9:20   ` Andreas Rheinhardt
  2023-09-08  9:58     ` Christophe Gisquet
  0 siblings, 1 reply; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  9:20 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> One indirection less, around 1% speedup.
> ---
>  libavcodec/proresdec2.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index b20021c622..85f81d92d3 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -561,12 +561,18 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>          prev_dc += (((code + 1) >> 1) ^ sign) - sign;
>          out[0] = prev_dc;
>      }
> -    return 0;
> +    return 0;	

You are adding trailing whitespace.

>  }
>  
> +#include "libavutil/timer.h"

You really need to look over your patches once more before you send
them. Both of these changes are obviously not ok to commit.

> +
> +
>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>                                               int16_t *out, int blocks_per_slice)
>  {
> +	static VLC* lvl_vlc[9] = { &ac_vlc[0], &ac_vlc[1], &ac_vlc[2], &ac_vlc[3], &ac_vlc[0], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], };
> +	static VLC* run_vlc[15] = { &ac_vlc[3], &ac_vlc[3], &ac_vlc[2], &ac_vlc[2], &ac_vlc[0], &ac_vlc[5], &ac_vlc[5], &ac_vlc[5], &ac_vlc[5],
> +	                            &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], &ac_vlc[4], };

This still incurs an unnecessary indirection. The LUT should not point
to the VLC's, but rather to the VLC tables (as this is the only thing
needed from them lateron given that the number of bits is a compile-time
constant. The LUT should be initialized when the VLCs are initialized.

In fact, this is so common that I always pondered adding an explicit
function for it. Will probably do so soon.

(Apart from that: This could be "static const VLC *const run_vlc[15]".)

>      const ProresContext *ctx = avctx->priv_data;
>      int block_mask, sign;
>      unsigned pos, run, level;
> @@ -585,9 +591,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>              break;
>  
>          if (run < 15) {
> -            static const uint8_t ctx_to_tbl[] = { 3, 3, 2, 2, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4 };
> -            const VLC* tbl = ac_vlc + ctx_to_tbl[run];
> -            run = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
> +            run = get_vlc2(gb, run_vlc[run]->table, PRORES_LEV_BITS, 3);
>          } else {
>              unsigned int bits = 21 - 2*av_log2(show_bits(gb, 10));
>              run = READ_BITS(gb, bits) - 4; // up to 17 bits
> @@ -599,9 +603,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>          }
>  
>          if (level < 9) {
> -            static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4 };
> -            const VLC* tbl = ac_vlc + ctx_to_tbl[level];
> -            level = 1+get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
> +            level = 1+get_vlc2(gb, lvl_vlc[level]->table, PRORES_LEV_BITS, 3);

Seems like these VLCs should be offset by 1 to avoid the "1+".

>          } else {
>              unsigned int bits = 25 - 2*av_log2(show_bits(gb, 12));
>              level = READ_BITS(gb, bits) - 4 + 1; // up to 21 bits

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-08  8:44   ` Andreas Rheinhardt
@ 2023-09-08  9:58     ` Andreas Rheinhardt
  2023-09-10 15:28       ` Christophe Gisquet
  0 siblings, 1 reply; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-08  9:58 UTC (permalink / raw)
  To: ffmpeg-devel

Andreas Rheinhardt:
> Christophe Gisquet:
>> x86/x64: 61/52 -> 55/46
>> Around 7-10% speedup.
>>
>> Run and DC do not lend themselves to such changes, likely because
>> their distribution is less skewed, and need larger average vlc read
>> iterations.
>> ---
>>  libavcodec/proresdec.h  |  1 +
>>  libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
>>  2 files changed, 66 insertions(+), 12 deletions(-)
>>
>> diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
>> index 1e48752e6f..7ebacaeb21 100644
>> --- a/libavcodec/proresdec.h
>> +++ b/libavcodec/proresdec.h
>> @@ -22,6 +22,7 @@
>>  #ifndef AVCODEC_PRORESDEC_H
>>  #define AVCODEC_PRORESDEC_H
>>  
>> +#define CACHED_BITSTREAM_READER 1
> 
> This should be in the commit switching to the cached bitstream reader.

Correction: This header is included in videotoolbox.c and there is other
stuff that also includes get_bits.h included in said file (and currently
gets included before proresdec.h). This means that proresdec2.c and
videotoolbox.c will have different opinions on what a GetBitContext is:
It will be the non-cached one in videotoolbox.c and the cached one in
proresdec2.c. This will work in practice, because ProresContext does not
need the complete GetBitContext type at all (it does not contain a
GetBitContext at all), so offsets are not affected. But it is
nevertheless undefined behaviour and could become dangerous when using LTO.

So you should switch the type of the pointer to BitstreamContextBE* in
proresdec2.h. Furthermore, you can either include bitstream.h in
proresdec.h or (IMO better) use a forward declaration and struct
BitstreamContextBE* in the function pointer without including get_bits.h
in the header at all.

> 
>>  #include "get_bits.h"
>>  #include "blockdsp.h"
>>  #include "proresdsp.h"
>> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
>> index 65e8b01755..91c689d9ef 100644
>> --- a/libavcodec/proresdec2.c
>> +++ b/libavcodec/proresdec2.c
>> @@ -24,17 +24,17 @@
>>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>>   */
>>  
>> -#define CACHED_BITSTREAM_READER 1
>> +//#define DEBUG
>>  
>>  #include "config_components.h"
>>  
>>  #include "libavutil/internal.h"
>>  #include "libavutil/mem_internal.h"
>> +#include "libavutil/thread.h"
>>  
>>  #include "avcodec.h"
>>  #include "codec_internal.h"
>>  #include "decode.h"
>> -#include "get_bits.h"
>>  #include "hwaccel_internal.h"
>>  #include "hwconfig.h"
>>  #include "idctdsp.h"
>> @@ -129,8 +129,64 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
>>      }
>>  }
>>  
>> +#define AC_BITS 12
>> +#define PRORES_LEV_BITS 9
>> +
>> +static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
>> +static VLC ac_vlc[6];
>> +
>> +static av_cold void init_vlcs(void)
>> +{
>> +    int i;
>> +    for (i = 0; i < sizeof(ac_info); i++) {
> 
> FF_ARRAY_ELEMS() is cleaner; also we support and prefer declarations
> inside for-loops: for (int i = 0;
> 
>> +        uint32_t ac_codes[1<<AC_BITS];
>> +        uint8_t ac_bits[1<<AC_BITS];
>> +        unsigned int rice_order, exp_order, switch_bits, switch_val;
>> +        int ac, max_bits = 0, codebook = ac_info[i];
>> +
>> +        /* number of prefix bits to switch between Rice and expGolomb */
>> +        switch_bits = (codebook & 3);
>> +        rice_order  =  codebook >> 5;       /* rice code order */
>> +        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
>> +
>> +        switch_val  = (switch_bits+1) << rice_order;
>> +
>> +        // Values are actually transformed, but this is more a wrapping
>> +        for (ac = 0; ac <1<<AC_BITS; ac++) {
>> +            int exponent, bits, val = ac;
>> +            unsigned int code;
>> +
>> +            if (val >= switch_val) {
>> +                val += (1 << exp_order) - switch_val;
>> +                exponent = av_log2(val);
>> +                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
>> +                code = val;
>> +            } else if (rice_order) {
>> +                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
>> +                code = (1 << rice_order) | val;
>> +            } else {
>> +                bits = val/*0*/ + 1/*1*/;
>> +                code = 1;
>> +            }
>> +            if (bits > max_bits) max_bits = bits;
>> +            ac_bits [ac] = bits;
>> +            ac_codes[ac] = code;
>> +        }
>> +
>> +        ff_free_vlc(ac_vlc+i);
> 
> This is unnecessary, as the VLC is initially blank and is not
> initialized multiple times.
> 
>> +
>> +        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
>> +                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
>> +            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
>> +                   i, codebook, max_bits);
>> +            break; //return AVERROR_BUG;
> 
> This is not how you initialize a static table (you miss the
> INIT_VLC_USE_NEW_STATIC flag and don't set the static store buffer).
> Search for INIT_VLC_STATIC_OVERLONG for an idea of how to do it.
> 
>> +        }
>> +    }
>> +}
>> +
>>  static av_cold int decode_init(AVCodecContext *avctx)
>>  {
>> +    static AVOnce init_static_once = AV_ONCE_INIT;
>>      int ret = 0;
>>      ProresContext *ctx = avctx->priv_data;
>>      uint8_t idct_permutation[64];
>> @@ -184,6 +240,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>  
>>      ctx->pix_fmt = AV_PIX_FMT_NONE;
>>  
>> +    // init dc_tables
>> +    ff_thread_once(&init_static_once, init_vlcs);
>> +
>>      if (avctx->bits_per_raw_sample == 10){
>>          ctx->unpack_alpha = unpack_alpha_10;
>>      } else if (avctx->bits_per_raw_sample == 12){
>> @@ -510,7 +569,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>>      return 0;
>>  }
>>  
>> -// adaptive codebook switching lut according to previous run/level values
>> +// adaptive codebook switching lut according to previous run values
>>  static const char run_to_cb[16][4] = {
>>      { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
>>      { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
>> @@ -518,12 +577,6 @@ static const char run_to_cb[16][4] = {
>>      { 0, 2,  3, -4 }
>>  };
>>  
>> -static const char lev_to_cb[10][4] = {
>> -    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
>> -    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
>> -    { 0, 2,  3, -4 }
>> -};
>> -
>>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>>                                               int16_t *out, int blocks_per_slice)
>>  {
>> @@ -540,8 +593,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>>      block_mask = blocks_per_slice - 1;
>>  
>>      for (pos = block_mask;;) {
>> +        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
>> +        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
>>          unsigned int runcb = FFMIN(run,  15);
>> -        unsigned int levcb = FFMIN(level, 9);
>>          bits_rem = get_bits_left(gb);
>>          if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>>              break;
>> @@ -554,8 +608,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>>              return AVERROR_INVALIDDATA;
>>          }
>>  
>> -        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
>> -                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
>> +        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
>>          level += 1;
>>  
>>          i = pos >> log2_block_count;
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs
  2023-09-08  9:20   ` Andreas Rheinhardt
@ 2023-09-08  9:58     ` Christophe Gisquet
  0 siblings, 0 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-08  9:58 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le ven. 8 sept. 2023 à 11:19, Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> a écrit :
> > -    return 0;
> > +    return 0;
>
> You are adding trailing whitespace.

Sorry, will fix. I had to do some of this work on a misconfigured machine.

> > +#include "libavutil/timer.h"
>
> You really need to look over your patches once more before you send
> them. Both of these changes are obviously not ok to commit.

I know the drill. Again, trying my best to help moving a situation
that had been rotting for 6 years.

> This still incurs an unnecessary indirection. The LUT should not point
> to the VLC's, but rather to the VLC tables (as this is the only thing
> needed from them lateron given that the number of bits is a compile-time
> constant. The LUT should be initialized when the VLCs are initialized.

You're right, and by the same logic from my comment, that should save
things further.

> Seems like these VLCs should be offset by 1 to avoid the "1+".

That's what I did in a previous commit, but that was before I could
share the tables. I didn't consider creating 5 more tables for this
beneficial.

-- 
Christophe
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
                   ` (7 preceding siblings ...)
  2023-09-08  8:36 ` Andreas Rheinhardt
@ 2023-09-08 21:00 ` Michael Niedermayer
  8 siblings, 0 replies; 22+ messages in thread
From: Michael Niedermayer @ 2023-09-08 21:00 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 3869 bytes --]

On Fri, Sep 08, 2023 at 10:15:02AM +0200, Christophe Gisquet wrote:
> Summary of changes
> - move back to regular, non-macro, get_bits API
> - reduce the lookup to switch the coding method
> - shorter reads wherever possible, in particular for the end of bitstream
>   (16 bits instead of 32, as per the above)
> 
> There are cases that really need longer lengths (larger EG codes) of up
> to 27 bits.
> 
> Win64: 6.10 -> 4.87 (~20% speedup)
> 
> Reference for an hypothetical 32bits version of the cached reader:
> Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
> ---
>  libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
>  1 file changed, 23 insertions(+), 30 deletions(-)

causes assertion failure:

Assertion n <= 32 failed at libavcodec/bitstream_template.h:338/0

Thread 36 "read_thread" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fff56ffd700 (LWP 12751)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51	../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x00007fffefd097f1 in __GI_abort () at abort.c:79
#2  0x0000555556035277 in bits_peek_be (bc=0x7fff56ffb270, n=4294967295) at libavcodec/bitstream_template.h:338
#3  0x0000555556036b35 in decode_ac_coeffs (avctx=0x7fff28002680, gb=0x7fff56ffb270, out=0x7fff56ffb2a0, blocks_per_slice=32) at libavcodec/proresdec2.c:590
#4  0x000055555603715c in decode_slice_chroma (avctx=0x7fff28002680, slice=0x7fff28055c20, dst=0x7fff9450b940, dst_stride=7680, buf=0x7fff2803d072 "#\a`\002g\240", <incomplete sequence \323>, buf_size=43, qmat=0x7fff56ffc450, log2_blocks_per_mb=2) at libavcodec/proresdec2.c:674
#5  0x0000555556037ae2 in decode_slice_thread (avctx=0x7fff28002680, arg=0x0, jobnr=145, threadnr=0) at libavcodec/proresdec2.c:807
#6  0x0000555555c815a0 in avcodec_default_execute2 (c=0x7fff28002680, func=0x555556037465 <decode_slice_thread>, arg=0x0, ret=0x0, count=510) at libavcodec/avcodec.c:74
#7  0x0000555556037d1e in decode_picture (avctx=0x7fff28002680) at libavcodec/proresdec2.c:846
#8  0x0000555556037fda in decode_frame (avctx=0x7fff28002680, frame=0x7fff28007440, got_frame=0x7fff56ffc5f0, avpkt=0x7fff28007dc0) at libavcodec/proresdec2.c:912
#9  0x0000555555d732ec in decode_simple_internal (avctx=0x7fff28002680, frame=0x7fff28007440, discarded_samples=0x7fff56ffc650) at libavcodec/decode.c:433
#10 0x0000555555d73843 in decode_simple_receive_frame (avctx=0x7fff28002680, frame=0x7fff28007440) at libavcodec/decode.c:607
#11 0x0000555555d739b3 in decode_receive_frame_internal (avctx=0x7fff28002680, frame=0x7fff28007440) at libavcodec/decode.c:635
#12 0x0000555555d73d78 in avcodec_send_packet (avctx=0x7fff28002680, avpkt=0x7fff28007208) at libavcodec/decode.c:732
#13 0x0000555555a6ef78 in try_decode_frame (s=0x7fff28000c80, st=0x7fff28002240, pkt=0x7fff28007208, options=0x7fff280071c0) at libavformat/demux.c:2075
#14 0x0000555555a71d92 in avformat_find_stream_info (ic=0x7fff28000c80, options=0x7fff280071c0) at libavformat/demux.c:2771
#15 0x0000555555699823 in read_thread (arg=0x7fffd45ca040) at fftools/ffplay.c:2806
#16 0x00007ffff5deed6c in ?? () from /usr/lib/x86_64-linux-gnu/libSDL2-2.0.so.0
#17 0x00007ffff5e640f9 in ?? () from /usr/lib/x86_64-linux-gnu/libSDL2-2.0.so.0
#18 0x00007ffff00c16db in start_thread (arg=0x7fff56ffd700) at pthread_create.c:463
#19 0x00007fffefdea61f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The day soldiers stop bringing you their problems is the day you have stopped 
leading them. They have either lost confidence that you can help or concluded 
you do not care. Either case is a failure of leadership. - Colin Powell

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-08  9:58     ` Andreas Rheinhardt
@ 2023-09-10 15:28       ` Christophe Gisquet
  2023-09-10 15:41         ` Andreas Rheinhardt
  0 siblings, 1 reply; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-10 15:28 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hello,

Le ven. 8 sept. 2023 à 11:57, Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> a écrit :
> >> +#define CACHED_BITSTREAM_READER 1
> >
> > This should be in the commit switching to the cached bitstream reader.
>
> Correction: This header is included in videotoolbox.c and there is other
> stuff that also includes get_bits.h included in said file (and currently
> gets included before proresdec.h). This means that proresdec2.c and
> videotoolbox.c will have different opinions on what a GetBitContext is:
> It will be the non-cached one in videotoolbox.c and the cached one in
> proresdec2.c. This will work in practice, because ProresContext does not
> need the complete GetBitContext type at all (it does not contain a
> GetBitContext at all), so offsets are not affected. But it is
> nevertheless undefined behaviour and could become dangerous when using LTO.
>
> So you should switch the type of the pointer to BitstreamContextBE* in
> proresdec2.h. Furthermore, you can either include bitstream.h in
> proresdec.h or (IMO better) use a forward declaration and struct
> BitstreamContextBE* in the function pointer without including get_bits.h
> in the header at all.

On that point only: I don't recall (yes that's 3+ years old) the issue
being videotoolbox, it didn't have that include back when I wrote this
code.

It's a very faint recollection, and I don't find proof in the ffmpeg
code today or of 3+ years ago, but the problem you mention was
happening with the encoder instead. So maybe the fix now is needed
only by videotoolbox then.

-- 
Christophe
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-10 15:28       ` Christophe Gisquet
@ 2023-09-10 15:41         ` Andreas Rheinhardt
  2023-09-10 15:56           ` Christophe Gisquet
  0 siblings, 1 reply; 22+ messages in thread
From: Andreas Rheinhardt @ 2023-09-10 15:41 UTC (permalink / raw)
  To: ffmpeg-devel

Christophe Gisquet:
> Hello,
> 
> Le ven. 8 sept. 2023 à 11:57, Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> a écrit :
>>>> +#define CACHED_BITSTREAM_READER 1
>>>
>>> This should be in the commit switching to the cached bitstream reader.
>>
>> Correction: This header is included in videotoolbox.c and there is other
>> stuff that also includes get_bits.h included in said file (and currently
>> gets included before proresdec.h). This means that proresdec2.c and
>> videotoolbox.c will have different opinions on what a GetBitContext is:
>> It will be the non-cached one in videotoolbox.c and the cached one in
>> proresdec2.c. This will work in practice, because ProresContext does not
>> need the complete GetBitContext type at all (it does not contain a
>> GetBitContext at all), so offsets are not affected. But it is
>> nevertheless undefined behaviour and could become dangerous when using LTO.
>>
>> So you should switch the type of the pointer to BitstreamContextBE* in
>> proresdec2.h. Furthermore, you can either include bitstream.h in
>> proresdec.h or (IMO better) use a forward declaration and struct
>> BitstreamContextBE* in the function pointer without including get_bits.h
>> in the header at all.
> 
> On that point only: I don't recall (yes that's 3+ years old) the issue
> being videotoolbox, it didn't have that include back when I wrote this
> code.
> 
> It's a very faint recollection, and I don't find proof in the ffmpeg
> code today or of 3+ years ago, but the problem you mention was
> happening with the encoder instead. So maybe the fix now is needed
> only by videotoolbox then.
> 

The videotoolbox prores hwaccel has only been added in November 2021;
before that, nobody but proresdec2.c included proresdec.h, so that there
was no issue with GetBitContext being cached or not in different files.

Another solution would be to use void* instead of GetBitContext* in the
header and in the implementation and then convert this void* to
GetBitContext* in the function.

I do not know what you mean by "the encoder instead". What problem
happens with the encoder? Why would the encoder include proresdec.h at
all and why would it be affected by changes to the decoder?

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch
  2023-09-10 15:41         ` Andreas Rheinhardt
@ 2023-09-10 15:56           ` Christophe Gisquet
  0 siblings, 0 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-10 15:56 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hello

Le dim. 10 sept. 2023 à 17:40, Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> a écrit :
> Another solution would be to use void* instead of GetBitContext* in the
> header and in the implementation and then convert this void* to
> GetBitContext* in the function.

The forward declaration will be enough.

> I do not know what you mean by "the encoder instead". What problem
> happens with the encoder? Why would the encoder include proresdec.h at
> all and why would it be affected by changes to the decoder?

The keyword was "faint", and it's a non-issue now. Just explaining why
I would have had that code change that now appears as an attempted fix
for videotoolbox, but really was never meant that way: pure luck, the
actual reason being lost to time.

-- 
Christophe
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader
  2023-09-08  8:20 ` [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
  2023-09-08  8:30   ` Andreas Rheinhardt
@ 2023-09-11 20:54   ` Christophe Gisquet
  1 sibling, 0 replies; 22+ messages in thread
From: Christophe Gisquet @ 2023-09-11 20:54 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le ven. 8 sept. 2023 à 10:20, Christophe Gisquet
<christophe.gisquet@gmail.com> a écrit :
> This patchset requires my previous one improving the cached bitstream
> reader, and serves as its justification. It, basically, moves to using
> VLC wherever possible, and in particular when codewords are
> sufficiently short/there's some kind of well-behaved laplacian
> distribution for codewords that make VLCs efficient.
>
> Total speedup is around 40% here.

It's unfortunate I cannot devote as much time and effort to fix some
fundamental problems. But as I don't want Andreas to have wasted his
time reviewing, and me answering as best as possible, the last state
(maybe addressing 90% of the review?) can be obtained from repo at
https://github.com/cgisquet/ffmpeg.git branch prores.

Best regards,
-- 
Christophe
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2023-09-11 20:55 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-08  8:15 [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 2/7] proresdec2: store precomputed EC parameters Christophe Gisquet
2023-09-08  8:39   ` Andreas Rheinhardt
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 3/7] proresdec2: use VLC for level instead of EC switch Christophe Gisquet
2023-09-08  8:44   ` Andreas Rheinhardt
2023-09-08  9:58     ` Andreas Rheinhardt
2023-09-10 15:28       ` Christophe Gisquet
2023-09-10 15:41         ` Andreas Rheinhardt
2023-09-10 15:56           ` Christophe Gisquet
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 4/7] proresdec2: offset VLCs by 1 to avoid 1 add Christophe Gisquet
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 5/7] proresdec2: use VLC for small runs and levels Christophe Gisquet
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 6/7] proresdec2: remove a useless DC codebook entry Christophe Gisquet
2023-09-08  9:08   ` Andreas Rheinhardt
2023-09-08  8:15 ` [FFmpeg-devel] [PATCH 7/7] prores: use VLC LUTs Christophe Gisquet
2023-09-08  9:20   ` Andreas Rheinhardt
2023-09-08  9:58     ` Christophe Gisquet
2023-09-08  8:20 ` [FFmpeg-devel] [PATCH 1/7] proresdec2: port and fix for cached reader Christophe Gisquet
2023-09-08  8:30   ` Andreas Rheinhardt
2023-09-08  8:34     ` Andreas Rheinhardt
2023-09-11 20:54   ` Christophe Gisquet
2023-09-08  8:36 ` Andreas Rheinhardt
2023-09-08 21:00 ` Michael Niedermayer

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git