[FFmpeg-devel] [PATCH 1/3] avcodec/asvenc: Don't waste bits encoding non-visible part

* [FFmpeg-devel] [PATCH 1/3] avcodec/asvenc: Don't waste bits encoding non-visible part
@ 2025-05-22 23:04 Andreas Rheinhardt
  2025-05-25  4:18 ` Andreas Rheinhardt
  0 siblings, 1 reply; 2+ messages in thread
From: Andreas Rheinhardt @ 2025-05-22 23:04 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 29 bytes --]

Patches attached.

- Andreas

[-- Attachment #2: 0001-avcodec-asvenc-Don-t-waste-bits-encoding-non-visible.patch --]
[-- Type: text/x-patch, Size: 8512 bytes --]

From 1105cb797c67d05bf4666e2e33140debcfec12a7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 22 May 2025 15:57:13 +0200
Subject: [PATCH 1/3] avcodec/asvenc: Don't waste bits encoding non-visible
 part

Up until now, the encoder replicated all the border pixels
for incomplete 16x16 macroblocks. In case the available width
or height is <= 8, some of the luma blocks of the MB
do not correspond to actual input, so that we should encode
them using the least amount of bits. Zeroing the block coefficients
(as this commit does) achieves this; replicating the pixels
and performing an FDCT does not.

This commit also removes the frame copying code for insufficiently
aligned dimensions.

The vsynth3-asv[12] FATE tests use a 34x34 input file and are
therefore affected by this. As the ref updates show, the size
and checksum of the encoded file change, yet the decoded output
stays the same.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/asvenc.c           | 131 +++++++++++++++++++++-------------
 tests/ref/vsynth/vsynth3-asv1 |   4 +-
 tests/ref/vsynth/vsynth3-asv2 |   4 +-
 3 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index 52666ee547..a53dc7c670 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -26,6 +26,7 @@
 #include "config_components.h"
 
 #include "libavutil/attributes.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 
@@ -228,55 +229,65 @@ static inline void dct_get(ASVEncContext *a, const AVFrame *frame,
     }
 }
 
-static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+static void handle_partial_mb(ASVEncContext *a, const uint8_t *const data[3],
+                              const int linesizes[3],
+                              int valid_width, int valid_height)
 {
-    ASVEncContext *const a = avctx->priv_data;
-    const ASVCommonContext *const c = &a->c;
-    int size, ret;
-
-    if (pict->width % 16 || pict->height % 16) {
-        AVFrame *clone = av_frame_alloc();
-        int i;
-
-        if (!clone)
-            return AVERROR(ENOMEM);
-        clone->format = pict->format;
-        clone->width  = FFALIGN(pict->width, 16);
-        clone->height = FFALIGN(pict->height, 16);
-        ret = av_frame_get_buffer(clone, 0);
-        if (ret < 0) {
-            av_frame_free(&clone);
-            return ret;
+    const int nb_blocks = a->c.avctx->flags & AV_CODEC_FLAG_GRAY ? 4 : 6;
+    static const struct Descriptor {
+        uint8_t x_offset, y_offset;
+        uint8_t component, subsampling;
+    } block_descriptor[] = {
+        { 0, 0, 0, 0 }, { 8, 0, 0, 0 }, { 0, 8, 0, 0 }, { 8, 8, 0, 0 },
+        { 0, 0, 1, 1 }, { 0, 0, 2, 1 },
+    };
+
+    for (int i = 0; i < nb_blocks; ++i) {
+        const struct Descriptor *const desc = block_descriptor + i;
+        int width_avail  = AV_CEIL_RSHIFT(valid_width,  desc->subsampling) - desc->x_offset;
+        int height_avail = AV_CEIL_RSHIFT(valid_height, desc->subsampling) - desc->y_offset;
+
+        if (width_avail <= 0 || height_avail <= 0) {
+            // This block is outside of the visible part; don't replicate pixels,
+            // just zero the block, so that only the dc value will be coded.
+            memset(a->block[i], 0, sizeof(a->block[i]));
+            continue;
         }
-
-        ret = av_frame_copy(clone, pict);
-        if (ret < 0) {
-            av_frame_free(&clone);
-            return ret;
+        width_avail  = FFMIN(width_avail,  8);
+        height_avail = FFMIN(height_avail, 8);
+
+        ptrdiff_t linesize = linesizes[desc->component];
+        const uint8_t *src = data[desc->component] + desc->y_offset * linesize + desc->x_offset;
+        int16_t *block = a->block[i];
+
+        for (int h = 0;; block += 8, src += linesize) {
+            int16_t last;
+            for (int w = 0; w < width_avail; ++w)
+                last = block[w] = src[w];
+            for (int w = width_avail; w < 8; ++w)
+                block[w] = last;
+            if (++h == height_avail)
+                break;
         }
-
-        for (i = 0; i<3; i++) {
-            int x, y;
-            int w  = AV_CEIL_RSHIFT(pict->width, !!i);
-            int h  = AV_CEIL_RSHIFT(pict->height, !!i);
-            int w2 = AV_CEIL_RSHIFT(clone->width, !!i);
-            int h2 = AV_CEIL_RSHIFT(clone->height, !!i);
-            for (y=0; y<h; y++)
-                for (x=w; x<w2; x++)
-                    clone->data[i][x + y*clone->linesize[i]] =
-                        clone->data[i][w - 1 + y*clone->linesize[i]];
-            for (y=h; y<h2; y++)
-                for (x=0; x<w2; x++)
-                    clone->data[i][x + y*clone->linesize[i]] =
-                        clone->data[i][x + (h-1)*clone->linesize[i]];
+        const int16_t *const last_row = block;
+        for (int h = height_avail; h < 8; ++h) {
+            block += 8;
+            AV_COPY128(block, last_row);
         }
-        ret = encode_frame(avctx, pkt, clone, got_packet);
 
-        av_frame_free(&clone);
-        return ret;
+        a->fdsp.fdct(a->block[i]);
     }
 
+    encode_mb(a, a->block);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    ASVEncContext *const a = avctx->priv_data;
+    const ASVCommonContext *const c = &a->c;
+    int size, ret;
+
     ret = ff_alloc_packet(avctx, pkt, c->mb_height * c->mb_width * MAX_MB_SIZE + 3);
     if (ret < 0)
         return ret;
@@ -290,19 +301,37 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    if (c->mb_width2 != c->mb_width) {
-        int mb_x = c->mb_width2;
+    if (avctx->width & 15) {
+        const uint8_t *src[3] = {
+            pict->data[0] + c->mb_width2 * 16,
+            pict->data[1] + c->mb_width2 *  8,
+            pict->data[2] + c->mb_width2 *  8,
+        };
+        int available_width = avctx->width & 15;
+
         for (int mb_y = 0; mb_y < c->mb_height2; mb_y++) {
-            dct_get(a, pict, mb_x, mb_y);
-            encode_mb(a, a->block);
+            handle_partial_mb(a, src, pict->linesize, available_width, 16);
+            src[0] += 16 * pict->linesize[0];
+            src[1] +=  8 * pict->linesize[1];
+            src[2] +=  8 * pict->linesize[2];
         }
     }
 
-    if (c->mb_height2 != c->mb_height) {
-        int mb_y = c->mb_height2;
-        for (int mb_x = 0; mb_x < c->mb_width; mb_x++) {
-            dct_get(a, pict, mb_x, mb_y);
-            encode_mb(a, a->block);
+    if (avctx->height & 15) {
+        const uint8_t *src[3] = {
+            pict->data[0] + c->mb_height2 * 16 * pict->linesize[0],
+            pict->data[1] + c->mb_height2 *  8 * pict->linesize[1],
+            pict->data[2] + c->mb_height2 *  8 * pict->linesize[2],
+        };
+        int available_height = avctx->height & 15;
+
+        for (int remaining = avctx->width;; remaining -= 16) {
+            handle_partial_mb(a, src, pict->linesize, remaining, available_height);
+            if (remaining <= 16)
+                break;
+            src[0] += 16;
+            src[1] +=  8;
+            src[2] +=  8;
         }
     }
 
diff --git a/tests/ref/vsynth/vsynth3-asv1 b/tests/ref/vsynth/vsynth3-asv1
index 0abbf787ec..af1dc644b0 100644
--- a/tests/ref/vsynth/vsynth3-asv1
+++ b/tests/ref/vsynth/vsynth3-asv1
@@ -1,4 +1,4 @@
-81eeea0d0e6219b2f381cf2100e9a12f *tests/data/fate/vsynth3-asv1.avi
-34704 tests/data/fate/vsynth3-asv1.avi
+69ae6df10440e68c53bee4e713851199 *tests/data/fate/vsynth3-asv1.avi
+31524 tests/data/fate/vsynth3-asv1.avi
 3c8636e22a96267451684f42d7a6f608 *tests/data/fate/vsynth3-asv1.out.rawvideo
 stddev:   13.16 PSNR: 25.74 MAXDIFF:  112 bytes:    86700/    86700
diff --git a/tests/ref/vsynth/vsynth3-asv2 b/tests/ref/vsynth/vsynth3-asv2
index 90b8a47f34..9fa9822c0b 100644
--- a/tests/ref/vsynth/vsynth3-asv2
+++ b/tests/ref/vsynth/vsynth3-asv2
@@ -1,4 +1,4 @@
-8402fb1112fb8119c019154a472b5cd0 *tests/data/fate/vsynth3-asv2.avi
-36208 tests/data/fate/vsynth3-asv2.avi
+63000eaedeb60bede8baeb090f02881a *tests/data/fate/vsynth3-asv2.avi
+33696 tests/data/fate/vsynth3-asv2.avi
 5469c0735b7c9279e5e8e3439fc6acab *tests/data/fate/vsynth3-asv2.out.rawvideo
 stddev:    9.07 PSNR: 28.97 MAXDIFF:   51 bytes:    86700/    86700
-- 
2.45.2


[-- Attachment #3: 0002-avcodec-asvenc-Combine-writing-bits.patch --]
[-- Type: text/x-patch, Size: 3840 bytes --]

From 10448faa75c648e7be7a910c210121e2609da6c9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 22 May 2025 19:38:24 +0200
Subject: [PATCH 2/3] avcodec/asvenc: Combine writing bits

Removes implicit checks for "do I need to output the buffer now?".
Codesize with Clang 19 with -O3 decreased from 7136B to 6108B
(although asv2_put_level() is now inlined).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/asvenc.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index a53dc7c670..2f81d6c74b 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -62,40 +62,43 @@ enum {
 static inline void asv1_put_level(PutBitContext *pb, int level)
 {
     unsigned int index = level + 3;
+    unsigned n, code;
 
     if (index <= 6) {
-        put_bits(pb, ff_asv_level_tab[index][1], ff_asv_level_tab[index][0]);
+        n    = ff_asv_level_tab[index][1];
+        code = ff_asv_level_tab[index][0];
     } else {
-        put_bits(pb, 3, 0); /* Escape code */
-        put_sbits(pb, 8, level);
+        n    = 3 + 8;
+        code = (0 /* Escape code */ << 8)  | (level & 0xFF);
     }
+    put_bits(pb, n, code);
 }
 
 static inline void asv2_put_level(ASVEncContext *a, PutBitContext *pb, int level)
 {
     unsigned int index = level + 31;
+    unsigned n, code;
 
     if (index <= 62) {
-        put_bits_le(pb, ff_asv2_level_tab[index][1], ff_asv2_level_tab[index][0]);
+        n    = ff_asv2_level_tab[index][1];
+        code = ff_asv2_level_tab[index][0];
     } else {
-        put_bits_le(pb, 5, 0); /* Escape code */
         if (level < -128 || level > 127) {
             av_log(a->c.avctx, AV_LOG_WARNING, "Clipping level %d, increase qscale\n", level);
             level = av_clip_int8(level);
         }
-        put_bits_le(pb, 8, level & 0xFF);
+        n    = 5 + 8;
+        code = (level & 0xFF) << 5 | /* Escape code */ 0;
     }
+    put_bits_le(pb, n, code);
 }
 
 static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64])
 {
-    int i;
-    int nc_count = 0;
-
     put_bits(&a->pb, 8, (block[0] + 32) >> 6);
     block[0] = 0;
 
-    for (i = 0; i < 10; i++) {
+    for (unsigned i = 0, nc_bits = 0, nc_val = 0; i < 10; i++) {
         const int index = ff_asv_scantab[4 * i];
         int ccp         = 0;
 
@@ -113,10 +116,11 @@ static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64])
             ccp |= 1;
 
         if (ccp) {
-            for (; nc_count; nc_count--)
-                put_bits(&a->pb, 2, 2); /* Skip */
-
-            put_bits(&a->pb, ff_asv_ccp_tab[ccp][1], ff_asv_ccp_tab[ccp][0]);
+            put_bits(&a->pb, nc_bits + ff_asv_ccp_tab[ccp][1],
+                             nc_val << ff_asv_ccp_tab[ccp][1] /* Skip */ |
+                             ff_asv_ccp_tab[ccp][0]);
+            nc_bits = 0;
+            nc_val  = 0;
 
             if (ccp & 8)
                 asv1_put_level(&a->pb, block[index + 0]);
@@ -127,7 +131,8 @@ static inline void asv1_encode_block(ASVEncContext *a, int16_t block[64])
             if (ccp & 1)
                 asv1_put_level(&a->pb, block[index + 9]);
         } else {
-            nc_count++;
+            nc_bits += 2;
+            nc_val   = (nc_val << 2) | 2;
         }
     }
     put_bits(&a->pb, 5, 0xF); /* End of block */
@@ -146,8 +151,8 @@ static inline void asv2_encode_block(ASVEncContext *a, int16_t block[64])
 
     count >>= 2;
 
-    put_bits_le(&a->pb, 4, count);
-    put_bits_le(&a->pb, 8, (block[0] + 32) >> 6);
+    put_bits_le(&a->pb, 4 + 8, count /* 4 bits */ |
+                               (/* DC */(block[0] + 32) >> 6) << 4);
     block[0] = 0;
 
     for (i = 0; i <= count; i++) {
-- 
2.45.2


[-- Attachment #4: 0003-avcodec-asvenc-Simplify-writing-extradata.patch --]
[-- Type: text/x-patch, Size: 1457 bytes --]

From f58d68b2f4d8d15841e27d6af05dc24ea32efddd Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 22 May 2025 22:44:56 +0200
Subject: [PATCH 3/3] avcodec/asvenc: Simplify writing extradata

The current code is confusing, because the AV_RL32("ASUS") already
returns an endian-independent value, so converting
it via av_le2ne32() makes no real sense: one would need
to transform the native value to le and write it as
a native-endian uint32_t for it to make sense (the current
code only works because le2ne32 and ne2le32 are the same
for both endiannesses that we care about). Or one can just
use AV_RL32 and create the number via MKTAG().

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/asvenc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index 2f81d6c74b..bcdb5cfbe2 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -379,8 +379,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
     avctx->extradata_size              = 8;
-    AV_WLA(32, avctx->extradata, inv_qscale);
-    ((uint32_t *) avctx->extradata)[1] = av_le2ne32(AV_RL32("ASUS"));
+    AV_WL32A(avctx->extradata, inv_qscale);
+    AV_WL32A(avctx->extradata + 4, MKTAG('A', 'S', 'U', 'S'));
 
     for (i = 0; i < 64; i++) {
         if (a->fdsp.fdct == ff_fdct_ifast) {
-- 
2.45.2


[-- Attachment #5: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread