[FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code

From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code
Date: Wed, 12 Mar 2025 04:10:01 +0100
Message-ID: <GV1P250MB0737DF2E0F59612C1DEA44958FD02@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM> (raw)

[-- Attachment #1: Type: text/plain, Size: 29 bytes --]

Patches attached.

- Andreas

[-- Attachment #2: 0001-avcodec-vc2enc-Use-LUT-to-assemble-interleaved-golom.patch --]
[-- Type: text/x-patch, Size: 2977 bytes --]

From 362a2cdad8717c016cd05e8d782260bd1aa0751a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Mar 2025 03:26:09 +0100
Subject: [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb
 code

Up until now, the encoder assembled the interleaved golomb code one bit
at a time. With this patch, it processes eight bits at a time using
lookup tables.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vc2enc.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
index 4728a48938..1fe973f4cd 100644
--- a/libavcodec/vc2enc.c
+++ b/libavcodec/vc2enc.c
@@ -22,6 +22,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/opt.h"
+#include "libavutil/thread.h"
 #include "libavutil/version.h"
 #include "codec_internal.h"
 #include "dirac.h"
@@ -186,22 +187,39 @@ typedef struct VC2EncContext {
     enum DiracParseCodes last_parse_code;
 } VC2EncContext;
 
+/// x_k x_{k-1} ... x_0 -> 0 x_k 0 x_{k-1} ... 0 x_0
+static uint16_t interleaved_ue_golomb_tab[256];
+/// 1 x_{k-1} ... x_0 -> 0 0 0 x_{k-1} ... 0 x_0
+static uint16_t top_interleaved_ue_golomb_tab[256];
+/// 1 x_{k-1} ... x_0 -> 2 * k
+static uint8_t golomb_len_tab[256];
+
+static av_cold void vc2_init_static_data(void)
+{
+    interleaved_ue_golomb_tab[1] = 1;
+    for (unsigned i = 2; i < 256; ++i) {
+        golomb_len_tab[i] = golomb_len_tab[i >> 1] + 2;
+        interleaved_ue_golomb_tab[i] = (interleaved_ue_golomb_tab[i >> 1] << 2) | (i & 1);
+        top_interleaved_ue_golomb_tab[i] = interleaved_ue_golomb_tab[i] ^ (1 << golomb_len_tab[i]);
+    }
+}
+
 static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
 {
-    int i;
-    int bits = av_log2(++val);
-    unsigned topbit = 1 << bits;
-    uint64_t pbits = 0;
-
-    for (i = 0; i < bits; i++) {
-        topbit >>= 1;
-        av_assert2(pbits <= UINT64_MAX>>3);
-        pbits <<= 2;
-        if (val & topbit)
-            pbits |= 0x1;
+    uint64_t pbits = 1;
+    int bits = 1;
+
+    ++val;
+
+    while (val >> 8) {
+        pbits |= (uint64_t)interleaved_ue_golomb_tab[val & 0xff] << bits;
+        val  >>= 8;
+        bits  += 16;
     }
+    pbits |= (uint64_t)top_interleaved_ue_golomb_tab[val] << bits;
+    bits  += golomb_len_tab[val];
 
-    put_bits63(pb, 2 * bits + 1, (pbits << 1) | 1);
+    put_bits63(pb, bits, pbits);
 }
 
 static av_always_inline int count_vc2_ue_uint(uint32_t val)
@@ -1003,6 +1021,7 @@ static av_cold int vc2_encode_end(AVCodecContext *avctx)
 
 static av_cold int vc2_encode_init(AVCodecContext *avctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     Plane *p;
     SubBand *b;
     int i, level, o, shift;
@@ -1165,6 +1184,8 @@ static av_cold int vc2_encode_init(AVCodecContext *avctx)
         }
     }
 
+    ff_thread_once(&init_static_once, vc2_init_static_data);
+
     return 0;
 }
 
-- 
2.45.2


[-- Attachment #3: 0002-avcodec-vc2enc-Avoid-excessive-inlining.patch --]
[-- Type: text/x-patch, Size: 1832 bytes --]

From 389a64c00bc8244186db1abb25be8dd5ec452df7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Mar 2025 03:56:03 +0100
Subject: [PATCH 2/2] avcodec/vc2enc: Avoid excessive inlining

There is no reason to inline put_vc2_ue_uint() everywhere;
only one call site is actually hot: The one in encode_subband()
(which accounts for 35735040 of 35739495 calls to said function
in a FATE run). Uninline all the others.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vc2enc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
index 1fe973f4cd..d05df64911 100644
--- a/libavcodec/vc2enc.c
+++ b/libavcodec/vc2enc.c
@@ -204,7 +204,7 @@ static av_cold void vc2_init_static_data(void)
     }
 }
 
-static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+static av_always_inline void put_vc2_ue_uint_inline(PutBitContext *pb, uint32_t val)
 {
     uint64_t pbits = 1;
     int bits = 1;
@@ -222,6 +222,11 @@ static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
     put_bits63(pb, bits, pbits);
 }
 
+static av_noinline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+{
+    put_vc2_ue_uint_inline(pb, val);
+}
+
 static av_always_inline int count_vc2_ue_uint(uint32_t val)
 {
     return 2 * av_log2(val + 1) + 1;
@@ -545,7 +550,7 @@ static void encode_subband(const VC2EncContext *s, PutBitContext *pb,
     for (y = top; y < bottom; y++) {
         for (x = left; x < right; x++) {
             uint32_t c_abs = QUANT(FFABS(coeff[x]), q_m, q_a, q_s);
-            put_vc2_ue_uint(pb, c_abs);
+            put_vc2_ue_uint_inline(pb, c_abs);
             if (c_abs)
                 put_bits(pb, 1, coeff[x] < 0);
         }
-- 
2.45.2


[-- Attachment #4: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".