From: Nuo Mi <nuomi2021@gmail.com> To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH 11/11] avcodec/vvcdec: move frame tab memset from the main thread to worker threads Date: Sun, 11 Aug 2024 22:01:17 +0800 Message-ID: <CAFXK13eY5+pyYyUEn-OJnJACwY1MS7yvakkTsdvXRTPo5-a47A@mail.gmail.com> (raw) In-Reply-To: <TYSPR06MB64331FC9D2749A2C9FE37EE7AAB62@TYSPR06MB6433.apcprd06.prod.outlook.com> On Sun, Jul 28, 2024 at 11:19 AM Nuo Mi <nuomi2021@gmail.com> wrote: > memset tables in the main thread can become a bottleneck for the decoder. > For example, if it takes 1% of the processing time for one core, the > maximum achievable FPS will be 100. > Moving the memset to worker threads will fix the issue. > will apply next week if there are no objections > --- > libavcodec/vvc/dec.c | 13 ++++- > libavcodec/vvc/thread.c | 122 ++++++++++++++++++++++++---------------- > libavcodec/vvc/thread.h | 1 + > 3 files changed, 85 insertions(+), 51 deletions(-) > > diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c > index 575bcfa33d..d34713296d 100644 > --- a/libavcodec/vvc/dec.c > +++ b/libavcodec/vvc/dec.c > @@ -82,7 +82,13 @@ static int tl_create(TabList *l) > if (!*t->tab) > return AVERROR(ENOMEM); > } > - } else if (l->zero) { > + } > + return 0; > +} > + > +static int tl_zero(TabList *l) > +{ > + if (l->zero) { > for (int i = 0; i < l->nb_tabs; i++) { > Tab *t = l->tabs + i; > memset(*t->tab, 0, t->size); > @@ -404,6 +410,11 @@ static int pic_arrays_init(VVCContext *s, > VVCFrameContext *fc) > return 0; > } > > +int ff_vvc_per_frame_init(VVCFrameContext *fc) > +{ > + return frame_context_for_each_tl(fc, tl_zero); > +} > + > static int min_positive(const int idx, const int diff, const int min_diff) > { > return diff > 0 && (idx < 0 || diff < min_diff); > diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c > index 28065d726f..74f8e4e9d0 100644 > --- a/libavcodec/vvc/thread.c > +++ b/libavcodec/vvc/thread.c > @@ -40,6 +40,7 @@ typedef struct ProgressListener { 
> } ProgressListener; > > typedef enum VVCTaskStage { > + VVC_TASK_STAGE_INIT, // for CTU(0, 0) only > VVC_TASK_STAGE_PARSE, > VVC_TASK_STAGE_INTER, > VVC_TASK_STAGE_RECON, > @@ -175,10 +176,14 @@ static int task_has_target_score(VVCTask *t, const > VVCTaskStage stage, const uin > uint8_t target = 0; > VVCFrameContext *fc = t->fc; > > + if (stage == VVC_TASK_STAGE_INIT) > + return 1; > + > if (stage == VVC_TASK_STAGE_PARSE) { > - const H266RawSPS *rsps = fc->ps.sps->r; > - const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && > !is_first_row(fc, t->rx, t->ry); > - target = 2 + wpp - 1; //left parse + > colocation + wpp - no previous stage > + const H266RawSPS *rsps = fc->ps.sps->r; > + const int wpp = > rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, > t->ry); > + const int no_prev_stage = t->rs > 0; > + target = 2 + wpp - no_prev_stage; > //left parse + colocation + wpp - no_prev_stage > } else if (stage == VVC_TASK_STAGE_INTER) { > target = atomic_load(&t->target_inter_score); > } else { > @@ -399,6 +404,55 @@ static int task_priority_higher(const AVTask *_a, > const AVTask *_b) > return a->ry < b->ry; > } > > +static void check_colocation(VVCContext *s, VVCTask *t) > +{ > + const VVCFrameContext *fc = t->fc; > + > + if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || > fc->ps.sps->r->sps_sbtmvp_enabled_flag) { > + VVCFrame *col = fc->ref->collocated_ref; > + const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]; > + if (col && first_col) { > + //we depend on bottom and right boundary, do not - 1 for y > + const int y = (t->ry << fc->ps.sps->ctb_log2_size_y); > + add_progress_listener(col, &t->col_listener, t, s, > VVC_PROGRESS_MV, y); > + return; > + } > + } > + frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); > +} > + > +static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, > SliceContext *sc, EntryPoint *ep) > +{ > + const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start]; > + VVCTask 
*t = ft->tasks + rs; > + > + frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); > +} > + > +static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t) > +{ > + VVCFrameContext *fc = lc->fc; > + VVCFrameThread *ft = fc->ft; > + const int ret = ff_vvc_per_frame_init(fc); > + > + if (ret < 0) > + return ret; > + > + for (int i = 0; i < fc->nb_slices; i++) { > + SliceContext *sc = fc->slices[i]; > + for (int j = 0; j < sc->nb_eps; j++) { > + EntryPoint *ep = sc->eps + j; > + for (int k = ep->ctu_start; k < ep->ctu_end; k++) { > + const int rs = sc->sh.ctb_addr_in_curr_slice[k]; > + VVCTask *t = ft->tasks + rs; > + check_colocation(s, t); > + } > + submit_entry_point(s, ft, sc, ep); > + } > + } > + return 0; > +} > + > static void report_frame_progress(VVCFrameContext *fc, > const int ry, const VVCProgress idx) > { > @@ -547,6 +601,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, > VVCTask *t) > #define VVC_THREAD_DEBUG > #ifdef VVC_THREAD_DEBUG > const static char* task_name[] = { > + "INIT", > "P", > "I", > "R", > @@ -567,6 +622,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, > VVCLocalContext *lc) > VVCFrameThread *ft = fc->ft; > const VVCTaskStage stage = t->stage; > static const run_func run[] = { > + run_init, > run_parse, > run_inter, > run_recon, > @@ -726,7 +782,7 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc) > > for (int rs = 0; rs < ft->ctu_count; rs++) { > VVCTask *t = ft->tasks + rs; > - task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs / > ft->ctu_width); > + task_init(t, rs ? 
VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT, fc, > rs % ft->ctu_width, rs / ft->ctu_width); > } > > memset(&ft->row_progress[0], 0, sizeof(ft->row_progress)); > @@ -745,59 +801,25 @@ fail: > return AVERROR(ENOMEM); > } > > -static void check_colocation(VVCContext *s, VVCTask *t) > -{ > - const VVCFrameContext *fc = t->fc; > - > - if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || > fc->ps.sps->r->sps_sbtmvp_enabled_flag) { > - VVCFrame *col = fc->ref->collocated_ref; > - const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]; > - if (col && first_col) { > - //we depend on bottom and right boundary, do not - 1 for y > - const int y = (t->ry << fc->ps.sps->ctb_log2_size_y); > - add_progress_listener(col, &t->col_listener, t, s, > VVC_PROGRESS_MV, y); > - return; > - } > - } > - frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); > -} > - > -static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, > SliceContext *sc, EntryPoint *ep) > -{ > - const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start]; > - VVCTask *t = ft->tasks + rs; > - > - frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); > -} > - > int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc) > { > VVCFrameThread *ft = fc->ft; > > - // We'll handle this in two passes: > - // Pass 0 to initialize tasks with parser, this will help detect bit > stream error > - // Pass 1 to shedule location check and submit the entry point > - for (int pass = 0; pass < 2; pass++) { > - for (int i = 0; i < fc->nb_slices; i++) { > - SliceContext *sc = fc->slices[i]; > - for (int j = 0; j < sc->nb_eps; j++) { > - EntryPoint *ep = sc->eps + j; > - for (int k = ep->ctu_start; k < ep->ctu_end; k++) { > - const int rs = sc->sh.ctb_addr_in_curr_slice[k]; > - VVCTask *t = ft->tasks + rs; > - if (pass) { > - check_colocation(s, t); > - } else { > - const int ret = task_init_parse(t, sc, ep, k); > - if (ret < 0) > - return ret; > - } > - } > - if (pass) > - submit_entry_point(s, 
ft, sc, ep); > + for (int i = 0; i < fc->nb_slices; i++) { > + SliceContext *sc = fc->slices[i]; > + for (int j = 0; j < sc->nb_eps; j++) { > + EntryPoint *ep = sc->eps + j; > + for (int k = ep->ctu_start; k < ep->ctu_end; k++) { > + const int rs = sc->sh.ctb_addr_in_curr_slice[k]; > + VVCTask *t = ft->tasks + rs; > + const int ret = task_init_parse(t, sc, ep, k); > + if (ret < 0) > + return ret; > } > } > } > + frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT); > + > return 0; > } > > diff --git a/libavcodec/vvc/thread.h b/libavcodec/vvc/thread.h > index 8ac59b2ecf..7b15dbee59 100644 > --- a/libavcodec/vvc/thread.h > +++ b/libavcodec/vvc/thread.h > @@ -32,5 +32,6 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc); > void ff_vvc_frame_thread_free(VVCFrameContext *fc); > int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc); > int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc); > +int ff_vvc_per_frame_init(VVCFrameContext *fc); > > #endif // AVCODEC_VVC_THREAD_H > -- > 2.34.1 > > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-08-11 14:01 UTC|newest] Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <20240728031807.462810-1-nuomi2021@gmail.com> 2024-07-28 3:17 ` [FFmpeg-devel] [PATCH 02/11] avcodec/vvcdec: refact, combine bs tab with tu tab Nuo Mi 2024-07-28 3:17 ` [FFmpeg-devel] [PATCH 03/11] avcodec/vvcdec: remove unnecessary perframe initializations Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 04/11] avcodec/vvcdec: split ctu table to zero init and no zero init parts Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 05/11] avcodec/vvcdec: refact out is_available from is_a0_available Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 06/11] avcodec/vvcdec: do not zero frame mvf table Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 07/11] avcodec/vvcdec: check_available, use && instead of &= for shortcut evaluation Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 08/11] avcodec/vvcdec: do not zero frame cpm table Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 09/11] avcodec/vvcdec: do not zero frame msf mmi table Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 10/11] avcodec/vvcdec: do not zero frame qp table Nuo Mi 2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 11/11] avcodec/vvcdec: move frame tab memset from the main thread to worker threads Nuo Mi 2024-08-11 14:01 ` Nuo Mi [this message] 2024-08-15 12:45 ` Nuo Mi
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=CAFXK13eY5+pyYyUEn-OJnJACwY1MS7yvakkTsdvXRTPo5-a47A@mail.gmail.com \ --to=nuomi2021@gmail.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git