From 3c0df27067d0e0758fc72bbe31c5cab7eb2c8ed2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 25 Mar 2025 18:00:25 +0100
Subject: [PATCH 04/11] avcodec/mpegvideo_enc: Don't use unnecessarily much
 stack

encode_thread() puts two MPVEncContexts (2*6516B here)
on the stack and zeroes one of them in order to
temporarily store the variables that get changed
during encoding a macroblock (when there is more than
one candidate type for a macroblock). This is wasteful
and therefore this commit adds a small (328B here) structure
to store exactly the fields that actually need to be backed
up. Then one can extend MPVEncContext without fearing
to use up too much stack.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/mpegvideo_enc.c | 158 +++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 69 deletions(-)

diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 16086bf067..fb3ad2e25c 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -2631,80 +2631,100 @@ static void encode_mb(MPVEncContext *const s, int motion_x, int motion_y)
         encode_mb_internal(s, motion_x, motion_y, 16, 16, 12, 0, 0, CHROMA_444);
 }
 
-static inline void copy_context_before_encode(MPVEncContext *const d,
-                                              const MPVEncContext *const s)
-{
-    int i;
-
-    memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop?
-
-    /* MPEG-1 */
-    d->c.mb_skip_run = s->c.mb_skip_run;
-    for(i=0; i<3; i++)
-        d->c.last_dc[i] = s->c.last_dc[i];
-
-    /* statistics */
-    d->mv_bits= s->mv_bits;
-    d->i_tex_bits= s->i_tex_bits;
-    d->p_tex_bits= s->p_tex_bits;
-    d->i_count= s->i_count;
-    d->misc_bits= s->misc_bits;
-    d->last_bits= 0;
-
-    d->c.mb_skipped = 0;
-    d->c.qscale = s->c.qscale;
-    d->dquant= s->dquant;
-
-    d->esc3_level_length= s->esc3_level_length;
+typedef struct MBBackup {
+    struct {
+        int mv[2][4][2];
+        int last_mv[2][2][2];
+        int mv_type, mv_dir;
+        int last_dc[3];
+        int mb_intra, mb_skipped, mb_skip_run;
+        int qscale;
+        int block_last_index[8];
+        int interlaced_dct;
+        int16_t (*block)[64];
+    } c;
+    int mv_bits, i_tex_bits, p_tex_bits, i_count, misc_bits, last_bits;
+    int dquant;
+    int esc3_level_length;
+    PutBitContext pb, pb2, tex_pb;
+} MBBackup;
+
+#define COPY_CONTEXT(BEFORE, AFTER, DST_TYPE, SRC_TYPE)                     \
+static inline void BEFORE ##_context_before_encode(DST_TYPE *const d,       \
+                                                   const SRC_TYPE *const s) \
+{                                                                           \
+    /* FIXME is memcpy faster than a loop? */                               \
+    memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int));                  \
+                                                                            \
+    /* MPEG-1 */                                                            \
+    d->c.mb_skip_run = s->c.mb_skip_run;                                    \
+    for (int i = 0; i < 3; i++)                                             \
+        d->c.last_dc[i] = s->c.last_dc[i];                                  \
+                                                                            \
+    /* statistics */                                                        \
+    d->mv_bits    = s->mv_bits;                                             \
+    d->i_tex_bits = s->i_tex_bits;                                          \
+    d->p_tex_bits = s->p_tex_bits;                                          \
+    d->i_count    = s->i_count;                                             \
+    d->misc_bits  = s->misc_bits;                                           \
+    d->last_bits  = 0;                                                      \
+                                                                            \
+    d->c.mb_skipped = 0;                                                    \
+    d->c.qscale = s->c.qscale;                                              \
+    d->dquant   = s->dquant;                                                \
+                                                                            \
+    d->esc3_level_length = s->esc3_level_length;                            \
+}                                                                           \
+                                                                            \
+static inline void AFTER ## _context_after_encode(DST_TYPE *const d,        \
+                                                  const SRC_TYPE *const s,  \
+                                                  int data_partitioning)    \
+{                                                                           \
+    /* FIXME is memcpy faster than a loop? */                               \
+    memcpy(d->c.mv, s->c.mv, 2*4*2*sizeof(int));                            \
+    memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int));                  \
+                                                                            \
+    /* MPEG-1 */                                                            \
+    d->c.mb_skip_run = s->c.mb_skip_run;                                    \
+    for (int i = 0; i < 3; i++)                                             \
+        d->c.last_dc[i] = s->c.last_dc[i];                                  \
+                                                                            \
+    /* statistics */                                                        \
+    d->mv_bits    = s->mv_bits;                                             \
+    d->i_tex_bits = s->i_tex_bits;                                          \
+    d->p_tex_bits = s->p_tex_bits;                                          \
+    d->i_count    = s->i_count;                                             \
+    d->misc_bits  = s->misc_bits;                                           \
+                                                                            \
+    d->c.mb_intra   = s->c.mb_intra;                                        \
+    d->c.mb_skipped = s->c.mb_skipped;                                      \
+    d->c.mv_type    = s->c.mv_type;                                         \
+    d->c.mv_dir     = s->c.mv_dir;                                          \
+    d->pb = s->pb;                                                          \
+    if (data_partitioning) {                                                \
+        d->pb2    = s->pb2;                                                 \
+        d->tex_pb = s->tex_pb;                                              \
+    }                                                                       \
+    d->c.block = s->c.block;                                                \
+    for (int i = 0; i < 8; i++)                                             \
+        d->c.block_last_index[i] = s->c.block_last_index[i];                \
+    d->c.interlaced_dct = s->c.interlaced_dct;                              \
+    d->c.qscale = s->c.qscale;                                              \
+                                                                            \
+    d->esc3_level_length = s->esc3_level_length;                            \
 }
 
-static inline void copy_context_after_encode(MPVEncContext *const d,
-                                             const MPVEncContext *const s,
-                                             int data_partitioning)
-{
-    int i;
-
-    memcpy(d->c.mv, s->c.mv, 2*4*2*sizeof(int));
-    memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop?
-
-    /* MPEG-1 */
-    d->c.mb_skip_run = s->c.mb_skip_run;
-    for(i=0; i<3; i++)
-        d->c.last_dc[i] = s->c.last_dc[i];
-
-    /* statistics */
-    d->mv_bits= s->mv_bits;
-    d->i_tex_bits= s->i_tex_bits;
-    d->p_tex_bits= s->p_tex_bits;
-    d->i_count= s->i_count;
-    d->misc_bits= s->misc_bits;
-
-    d->c.mb_intra   = s->c.mb_intra;
-    d->c.mb_skipped = s->c.mb_skipped;
-    d->c.mv_type    = s->c.mv_type;
-    d->c.mv_dir     = s->c.mv_dir;
-    d->pb= s->pb;
-    if (data_partitioning) {
-        d->pb2= s->pb2;
-        d->tex_pb= s->tex_pb;
-    }
-    d->c.block = s->c.block;
-    for(i=0; i<8; i++)
-        d->c.block_last_index[i] = s->c.block_last_index[i];
-    d->c.interlaced_dct = s->c.interlaced_dct;
-    d->c.qscale = s->c.qscale;
-
-    d->esc3_level_length= s->esc3_level_length;
-}
+COPY_CONTEXT(backup, save, MBBackup, MPVEncContext)
+COPY_CONTEXT(reset, store, MPVEncContext, MBBackup)
 
-static void encode_mb_hq(MPVEncContext *const s, MPVEncContext *const backup, MPVEncContext *const best,
+static void encode_mb_hq(MPVEncContext *const s, MBBackup *const backup, MBBackup *const best,
                          PutBitContext pb[2], PutBitContext pb2[2], PutBitContext tex_pb[2],
                          int *dmin, int *next_block, int motion_x, int motion_y)
 {
     int score;
     uint8_t *dest_backup[3];
 
-    copy_context_before_encode(s, backup);
+    reset_context_before_encode(s, backup);
 
     s->c.block = s->c.blocks[*next_block];
     s->pb      = pb[*next_block];
@@ -2744,7 +2764,7 @@ static void encode_mb_hq(MPVEncContext *const s, MPVEncContext *const backup, MP
         *dmin= score;
         *next_block^=1;
 
-        copy_context_after_encode(best, s, s->c.data_partitioning);
+        save_context_after_encode(best, s, s->c.data_partitioning);
     }
 }
 
@@ -2962,7 +2982,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
     MPVEncContext *const s = *(void**)arg;
     int chr_h = 16 >> s->c.chroma_y_shift;
     int i;
-    MPVEncContext best_s = { 0 }, backup_s;
+    MBBackup best_s = { 0 }, backup_s;
     uint8_t bit_buf[2][MAX_MB_BYTES];
     uint8_t bit_buf2[2][MAX_MB_BYTES];
     uint8_t bit_buf_tex[2][MAX_MB_BYTES];
@@ -3163,7 +3183,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 int next_block=0;
                 int pb_bits_count, pb2_bits_count, tex_pb_bits_count;
 
-                copy_context_before_encode(&backup_s, s);
+                backup_context_before_encode(&backup_s, s);
                 backup_s.pb= s->pb;
                 if (s->c.data_partitioning) {
                     backup_s.pb2= s->pb2;
@@ -3388,7 +3408,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     }
                 }
 
-                copy_context_after_encode(s, &best_s, s->c.data_partitioning);
+                store_context_after_encode(s, &best_s, s->c.data_partitioning);
 
                 pb_bits_count= put_bits_count(&s->pb);
                 flush_put_bits(&s->pb);
-- 
2.45.2