Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code
@ 2025-03-12  3:10 Andreas Rheinhardt
  2025-03-12  4:50 ` Lynne
  0 siblings, 1 reply; 4+ messages in thread
From: Andreas Rheinhardt @ 2025-03-12  3:10 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 29 bytes --]

Patches attached.

- Andreas

[-- Attachment #2: 0001-avcodec-vc2enc-Use-LUT-to-assemble-interleaved-golom.patch --]
[-- Type: text/x-patch, Size: 2977 bytes --]

From 362a2cdad8717c016cd05e8d782260bd1aa0751a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Mar 2025 03:26:09 +0100
Subject: [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb
 code

Up until now, the encoder processed only one bit at a time.
With this patch, it is eight bits.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vc2enc.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
index 4728a48938..1fe973f4cd 100644
--- a/libavcodec/vc2enc.c
+++ b/libavcodec/vc2enc.c
@@ -22,6 +22,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/opt.h"
+#include "libavutil/thread.h"
 #include "libavutil/version.h"
 #include "codec_internal.h"
 #include "dirac.h"
@@ -186,22 +187,39 @@ typedef struct VC2EncContext {
     enum DiracParseCodes last_parse_code;
 } VC2EncContext;
 
+/// x_k x_{k-1} ... x_0 -> 0 x_k 0 x_{k - 1} ... 0 x_0
+static uint16_t interleaved_ue_golomb_tab[256];
+/// 1 x_{k-1} ... x_0 -> 0 0 0 x_{k - 1} ... 0 x_0
+static uint16_t top_interleaved_ue_golomb_tab[256];
+/// 1 x_{k-1} ... x_0 -> 2 * k
+static uint8_t golomb_len_tab[256];
+
+static av_cold void vc2_init_static_data(void)
+{
+    interleaved_ue_golomb_tab[1] = 1;
+    for (unsigned i = 2; i < 256; ++i) {
+        golomb_len_tab[i] = golomb_len_tab[i >> 1] + 2;
+        interleaved_ue_golomb_tab[i] = (interleaved_ue_golomb_tab[i >> 1] << 2) | (i & 1);
+        top_interleaved_ue_golomb_tab[i] = interleaved_ue_golomb_tab[i] ^ (1 << golomb_len_tab[i]);
+    }
+}
+
 static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
 {
-    int i;
-    int bits = av_log2(++val);
-    unsigned topbit = 1 << bits;
-    uint64_t pbits = 0;
-
-    for (i = 0; i < bits; i++) {
-        topbit >>= 1;
-        av_assert2(pbits <= UINT64_MAX>>3);
-        pbits <<= 2;
-        if (val & topbit)
-            pbits |= 0x1;
+    uint64_t pbits = 1;
+    int bits = 1;
+
+    ++val;
+
+    while (val >> 8) {
+        pbits |= (uint64_t)interleaved_ue_golomb_tab[val & 0xff] << bits;
+        val  >>= 8;
+        bits  += 16;
     }
+    pbits |= (uint64_t)top_interleaved_ue_golomb_tab[val] << bits;
+    bits  += golomb_len_tab[val];
 
-    put_bits63(pb, 2 * bits + 1, (pbits << 1) | 1);
+    put_bits63(pb, bits, pbits);
 }
 
 static av_always_inline int count_vc2_ue_uint(uint32_t val)
@@ -1003,6 +1021,7 @@ static av_cold int vc2_encode_end(AVCodecContext *avctx)
 
 static av_cold int vc2_encode_init(AVCodecContext *avctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     Plane *p;
     SubBand *b;
     int i, level, o, shift;
@@ -1165,6 +1184,8 @@ static av_cold int vc2_encode_init(AVCodecContext *avctx)
         }
     }
 
+    ff_thread_once(&init_static_once, vc2_init_static_data);
+
     return 0;
 }
 
-- 
2.45.2


[-- Attachment #3: 0002-avcodec-vc2enc-Avoid-excessive-inlining.patch --]
[-- Type: text/x-patch, Size: 1832 bytes --]

From 389a64c00bc8244186db1abb25be8dd5ec452df7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Mar 2025 03:56:03 +0100
Subject: [PATCH 2/2] avcodec/vc2enc: Avoid excessive inlining

There is no reason to inline put_vc2_ue_uint() everywhere;
only one call site is actually hot: The one in encode_subband()
(which accounts for 35735040 of 35739495 calls to said function
in a FATE run). Uninline all the others.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/vc2enc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
index 1fe973f4cd..d05df64911 100644
--- a/libavcodec/vc2enc.c
+++ b/libavcodec/vc2enc.c
@@ -204,7 +204,7 @@ static av_cold void vc2_init_static_data(void)
     }
 }
 
-static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+static av_always_inline void put_vc2_ue_uint_inline(PutBitContext *pb, uint32_t val)
 {
     uint64_t pbits = 1;
     int bits = 1;
@@ -222,6 +222,11 @@ static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
     put_bits63(pb, bits, pbits);
 }
 
+static av_noinline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+{
+    put_vc2_ue_uint_inline(pb, val);
+}
+
 static av_always_inline int count_vc2_ue_uint(uint32_t val)
 {
     return 2 * av_log2(val + 1) + 1;
@@ -545,7 +550,7 @@ static void encode_subband(const VC2EncContext *s, PutBitContext *pb,
     for (y = top; y < bottom; y++) {
         for (x = left; x < right; x++) {
             uint32_t c_abs = QUANT(FFABS(coeff[x]), q_m, q_a, q_s);
-            put_vc2_ue_uint(pb, c_abs);
+            put_vc2_ue_uint_inline(pb, c_abs);
             if (c_abs)
                 put_bits(pb, 1, coeff[x] < 0);
         }
-- 
2.45.2


[-- Attachment #4: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code
  2025-03-12  3:10 [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code Andreas Rheinhardt
@ 2025-03-12  4:50 ` Lynne
  2025-03-12  5:27   ` Andreas Rheinhardt
  0 siblings, 1 reply; 4+ messages in thread
From: Lynne @ 2025-03-12  4:50 UTC (permalink / raw)
  To: ffmpeg-devel


[-- Attachment #1.1.1.1: Type: text/plain, Size: 742 bytes --]

On 12/03/2025 04:10, Andreas Rheinhardt wrote:
> Patches attached.
> 
> - Andreas

First patch is wild, it's surprising no one considered inverting the way 
the decoder parses codes for an encoder yet.
Rather than ORing and using put_bits63, I think it would make more sense 
to write out each chunk using put_bits sequentially. It might be 
possible to reverse the lookups such that you get the MSBs first so you 
wouldn't need to reverse them out of place in a small array.
But either way, LGTM. Feel free to explore this in a follow-up.

Second patch seems a bit pointless. It's just one single call you're 
uninlining? Trying to save a few extra bytes of binary surely doesn't 
deserve having a wrapper function for uninlining.

[-- Attachment #1.1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 637 bytes --]

[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 236 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code
  2025-03-12  4:50 ` Lynne
@ 2025-03-12  5:27   ` Andreas Rheinhardt
  2025-03-12 13:46     ` Lynne
  0 siblings, 1 reply; 4+ messages in thread
From: Andreas Rheinhardt @ 2025-03-12  5:27 UTC (permalink / raw)
  To: ffmpeg-devel

Lynne:
> On 12/03/2025 04:10, Andreas Rheinhardt wrote:
>> Patches attached.
>>
>> - Andreas
> 
> First patch is wild, it's surprising no one considered inverting the way
> the decoder parses codes for an encoder yet.

I didn't even look at the decoder.
(It is actually surprising that it took until
512e597932dfe05cf5665192efbe2c93c2e36af2 for the original code to be
improved.)

> Rather than ORing and using put_bits63, I think it would make more sense
> to write out each chunk using put_bits sequentially. It might be
> possible to reverse the lookups such that you get the MSBs first so you
> wouldn't need to reverse them out of place in a small array.
> But either way, LGTM. Feel free to explore this in a follow-up.

I don't think that writing them sequentially will improve anything: In
order to be able to use a LUT, I would have to shift the bits starting
with the MSBs into position; and then there would be the internal shifts
and checks inside put_bits().
Apart from that: put_bits63() is the same as put_bits() when BUF_BITS is
64 (see ede2b391cc516f4f93621f6a214b3410b231f582).

> 
> Second patch seems a bit pointless. It's just one single call you're
> uninlining? Trying to save a few extra bytes of binary surely doesn't
> deserve having a wrapper function for uninlining.
> 

I am uninlining all calls besides the hot one. 31 callsites.
For GCC, this reduced the code size from 0x2c36 to 0x25b1 (15% saved),
for clang from 0x4b08 to 0x3338 (32% saved).

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code
  2025-03-12  5:27   ` Andreas Rheinhardt
@ 2025-03-12 13:46     ` Lynne
  0 siblings, 0 replies; 4+ messages in thread
From: Lynne @ 2025-03-12 13:46 UTC (permalink / raw)
  To: ffmpeg-devel

On 12/03/2025 06:27, Andreas Rheinhardt wrote:
> Lynne:
>> On 12/03/2025 04:10, Andreas Rheinhardt wrote:
>>> Patches attached.
>>>
>>> - Andreas
>>
>> First patch is wild, it's surprising no one considered inverting the way
>> the decoder parses codes for an encoder yet.
> 
> I didn't even look at the decoder.
> (It is actually surprising that it took until
> 512e597932dfe05cf5665192efbe2c93c2e36af2 for the original code to be
> improved.)
> 
>> Rather than ORing and using put_bits63, I think it would make more sense
>> to write out each chunk using put_bits sequentially. It might be
>> possible to reverse the lookups such that you get the MSBs first so you
>> wouldn't need to reverse them out of place in a small array.
>> But either way, LGTM. Feel free to explore this in a follow-up.
> 
> I don't think that writing them sequentially will improve anything: In
> order to be able to use a LUT, I would have to shift the bits starting
> with the MSBs into position; and then there would be the internal shifts
> and checks inside put_bits().
> Apart from that: put_bits63() is the same as put_bits() when BUF_BITS is
> 64 (see ede2b391cc516f4f93621f6a214b3410b231f582).
> 
>>
>> Second patch seems a bit pointless. It's just one single call you're
>> uninlining? Trying to save a few extra bytes of binary surely doesn't
>> deserve having a wrapper function for uninlining.
>>
> 
> I am uninlining all calls besides the hot one. 31 callsites.
> For GCC, this reduced the code size from 0x2c36 to 0x25b1 (15% saved),
> for clang from 0x4b08 to 0x3338 (32% saved).

Oh, it was late and I didn't read carefully.
Both patches LGTM.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-03-12 13:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-12  3:10 [FFmpeg-devel] [PATCH 1/2] avcodec/vc2enc: Use LUT to assemble interleaved golomb code Andreas Rheinhardt
2025-03-12  4:50 ` Lynne
2025-03-12  5:27   ` Andreas Rheinhardt
2025-03-12 13:46     ` Lynne

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git