From 3c0df27067d0e0758fc72bbe31c5cab7eb2c8ed2 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Date: Tue, 25 Mar 2025 18:00:25 +0100 Subject: [PATCH 04/11] avcodec/mpegvideo_enc: Don't use unnecessarily much stack encode_thread() puts two MPVEncContexts (2*6516B here) on the stack and zeroes one of them in order to temporarily store the variables that get changed during encoding a macroblock (when there is more than one candidate type for a macroblock). This is wasteful and therefore this commit adds a small (328B here) structure to store exactly the fields that actually need to be backed up. Then one can extend MPVEncContext without fearing to use up too much stack. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/mpegvideo_enc.c | 158 +++++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 69 deletions(-) diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 16086bf067..fb3ad2e25c 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -2631,80 +2631,100 @@ static void encode_mb(MPVEncContext *const s, int motion_x, int motion_y) encode_mb_internal(s, motion_x, motion_y, 16, 16, 12, 0, 0, CHROMA_444); } -static inline void copy_context_before_encode(MPVEncContext *const d, - const MPVEncContext *const s) -{ - int i; - - memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop? 
- - /* MPEG-1 */ - d->c.mb_skip_run = s->c.mb_skip_run; - for(i=0; i<3; i++) - d->c.last_dc[i] = s->c.last_dc[i]; - - /* statistics */ - d->mv_bits= s->mv_bits; - d->i_tex_bits= s->i_tex_bits; - d->p_tex_bits= s->p_tex_bits; - d->i_count= s->i_count; - d->misc_bits= s->misc_bits; - d->last_bits= 0; - - d->c.mb_skipped = 0; - d->c.qscale = s->c.qscale; - d->dquant= s->dquant; - - d->esc3_level_length= s->esc3_level_length; +typedef struct MBBackup { + struct { + int mv[2][4][2]; + int last_mv[2][2][2]; + int mv_type, mv_dir; + int last_dc[3]; + int mb_intra, mb_skipped, mb_skip_run; + int qscale; + int block_last_index[8]; + int interlaced_dct; + int16_t (*block)[64]; + } c; + int mv_bits, i_tex_bits, p_tex_bits, i_count, misc_bits, last_bits; + int dquant; + int esc3_level_length; + PutBitContext pb, pb2, tex_pb; +} MBBackup; + +#define COPY_CONTEXT(BEFORE, AFTER, DST_TYPE, SRC_TYPE) \ +static inline void BEFORE ##_context_before_encode(DST_TYPE *const d, \ + const SRC_TYPE *const s) \ +{ \ + /* FIXME is memcpy faster than a loop? */ \ + memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); \ + \ + /* MPEG-1 */ \ + d->c.mb_skip_run = s->c.mb_skip_run; \ + for (int i = 0; i < 3; i++) \ + d->c.last_dc[i] = s->c.last_dc[i]; \ + \ + /* statistics */ \ + d->mv_bits = s->mv_bits; \ + d->i_tex_bits = s->i_tex_bits; \ + d->p_tex_bits = s->p_tex_bits; \ + d->i_count = s->i_count; \ + d->misc_bits = s->misc_bits; \ + d->last_bits = 0; \ + \ + d->c.mb_skipped = 0; \ + d->c.qscale = s->c.qscale; \ + d->dquant = s->dquant; \ + \ + d->esc3_level_length = s->esc3_level_length; \ +} \ + \ +static inline void AFTER ## _context_after_encode(DST_TYPE *const d, \ + const SRC_TYPE *const s, \ + int data_partitioning) \ +{ \ + /* FIXME is memcpy faster than a loop? 
*/ \ + memcpy(d->c.mv, s->c.mv, 2*4*2*sizeof(int)); \ + memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); \ + \ + /* MPEG-1 */ \ + d->c.mb_skip_run = s->c.mb_skip_run; \ + for (int i = 0; i < 3; i++) \ + d->c.last_dc[i] = s->c.last_dc[i]; \ + \ + /* statistics */ \ + d->mv_bits = s->mv_bits; \ + d->i_tex_bits = s->i_tex_bits; \ + d->p_tex_bits = s->p_tex_bits; \ + d->i_count = s->i_count; \ + d->misc_bits = s->misc_bits; \ + \ + d->c.mb_intra = s->c.mb_intra; \ + d->c.mb_skipped = s->c.mb_skipped; \ + d->c.mv_type = s->c.mv_type; \ + d->c.mv_dir = s->c.mv_dir; \ + d->pb = s->pb; \ + if (data_partitioning) { \ + d->pb2 = s->pb2; \ + d->tex_pb = s->tex_pb; \ + } \ + d->c.block = s->c.block; \ + for (int i = 0; i < 8; i++) \ + d->c.block_last_index[i] = s->c.block_last_index[i]; \ + d->c.interlaced_dct = s->c.interlaced_dct; \ + d->c.qscale = s->c.qscale; \ + \ + d->esc3_level_length = s->esc3_level_length; \ } -static inline void copy_context_after_encode(MPVEncContext *const d, - const MPVEncContext *const s, - int data_partitioning) -{ - int i; - - memcpy(d->c.mv, s->c.mv, 2*4*2*sizeof(int)); - memcpy(d->c.last_mv, s->c.last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop? 
- - /* MPEG-1 */ - d->c.mb_skip_run = s->c.mb_skip_run; - for(i=0; i<3; i++) - d->c.last_dc[i] = s->c.last_dc[i]; - - /* statistics */ - d->mv_bits= s->mv_bits; - d->i_tex_bits= s->i_tex_bits; - d->p_tex_bits= s->p_tex_bits; - d->i_count= s->i_count; - d->misc_bits= s->misc_bits; - - d->c.mb_intra = s->c.mb_intra; - d->c.mb_skipped = s->c.mb_skipped; - d->c.mv_type = s->c.mv_type; - d->c.mv_dir = s->c.mv_dir; - d->pb= s->pb; - if (data_partitioning) { - d->pb2= s->pb2; - d->tex_pb= s->tex_pb; - } - d->c.block = s->c.block; - for(i=0; i<8; i++) - d->c.block_last_index[i] = s->c.block_last_index[i]; - d->c.interlaced_dct = s->c.interlaced_dct; - d->c.qscale = s->c.qscale; - - d->esc3_level_length= s->esc3_level_length; -} +COPY_CONTEXT(backup, save, MBBackup, MPVEncContext) +COPY_CONTEXT(reset, store, MPVEncContext, MBBackup) -static void encode_mb_hq(MPVEncContext *const s, MPVEncContext *const backup, MPVEncContext *const best, +static void encode_mb_hq(MPVEncContext *const s, MBBackup *const backup, MBBackup *const best, PutBitContext pb[2], PutBitContext pb2[2], PutBitContext tex_pb[2], int *dmin, int *next_block, int motion_x, int motion_y) { int score; uint8_t *dest_backup[3]; - copy_context_before_encode(s, backup); + reset_context_before_encode(s, backup); s->c.block = s->c.blocks[*next_block]; s->pb = pb[*next_block]; @@ -2744,7 +2764,7 @@ static void encode_mb_hq(MPVEncContext *const s, MPVEncContext *const backup, MP *dmin= score; *next_block^=1; - copy_context_after_encode(best, s, s->c.data_partitioning); + save_context_after_encode(best, s, s->c.data_partitioning); } } @@ -2962,7 +2982,7 @@ static int encode_thread(AVCodecContext *c, void *arg){ MPVEncContext *const s = *(void**)arg; int chr_h = 16 >> s->c.chroma_y_shift; int i; - MPVEncContext best_s = { 0 }, backup_s; + MBBackup best_s = { 0 }, backup_s; uint8_t bit_buf[2][MAX_MB_BYTES]; uint8_t bit_buf2[2][MAX_MB_BYTES]; uint8_t bit_buf_tex[2][MAX_MB_BYTES]; @@ -3163,7 +3183,7 @@ static int 
encode_thread(AVCodecContext *c, void *arg){ int next_block=0; int pb_bits_count, pb2_bits_count, tex_pb_bits_count; - copy_context_before_encode(&backup_s, s); + backup_context_before_encode(&backup_s, s); backup_s.pb= s->pb; if (s->c.data_partitioning) { backup_s.pb2= s->pb2; @@ -3388,7 +3408,7 @@ static int encode_thread(AVCodecContext *c, void *arg){ } } - copy_context_after_encode(s, &best_s, s->c.data_partitioning); + store_context_after_encode(s, &best_s, s->c.data_partitioning); pb_bits_count= put_bits_count(&s->pb); flush_put_bits(&s->pb); -- 2.45.2