From: Nuo Mi <nuomi2021@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH 11/11] avcodec/vvcdec: move frame tab memset from the main thread to worker threads
Date: Sun, 11 Aug 2024 22:01:17 +0800
Message-ID: <CAFXK13eY5+pyYyUEn-OJnJACwY1MS7yvakkTsdvXRTPo5-a47A@mail.gmail.com> (raw)
In-Reply-To: <TYSPR06MB64331FC9D2749A2C9FE37EE7AAB62@TYSPR06MB6433.apcprd06.prod.outlook.com>
On Sun, Jul 28, 2024 at 11:19 AM Nuo Mi <nuomi2021@gmail.com> wrote:
> memset tables in the main thread can become a bottleneck for the decoder.
> For example, if it takes 1% of the processing time for one core, the
> maximum achievable FPS will be 100.
> Moving the memset to worker threads will fix the issue.
>
will apply next week if there are no objections
> ---
> libavcodec/vvc/dec.c | 13 ++++-
> libavcodec/vvc/thread.c | 122 ++++++++++++++++++++++++----------------
> libavcodec/vvc/thread.h | 1 +
> 3 files changed, 85 insertions(+), 51 deletions(-)
>
> diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c
> index 575bcfa33d..d34713296d 100644
> --- a/libavcodec/vvc/dec.c
> +++ b/libavcodec/vvc/dec.c
> @@ -82,7 +82,13 @@ static int tl_create(TabList *l)
> if (!*t->tab)
> return AVERROR(ENOMEM);
> }
> - } else if (l->zero) {
> + }
> + return 0;
> +}
> +
> +static int tl_zero(TabList *l)
> +{
> + if (l->zero) {
> for (int i = 0; i < l->nb_tabs; i++) {
> Tab *t = l->tabs + i;
> memset(*t->tab, 0, t->size);
> @@ -404,6 +410,11 @@ static int pic_arrays_init(VVCContext *s,
> VVCFrameContext *fc)
> return 0;
> }
>
> +int ff_vvc_per_frame_init(VVCFrameContext *fc)
> +{
> + return frame_context_for_each_tl(fc, tl_zero);
> +}
> +
> static int min_positive(const int idx, const int diff, const int min_diff)
> {
> return diff > 0 && (idx < 0 || diff < min_diff);
> diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
> index 28065d726f..74f8e4e9d0 100644
> --- a/libavcodec/vvc/thread.c
> +++ b/libavcodec/vvc/thread.c
> @@ -40,6 +40,7 @@ typedef struct ProgressListener {
> } ProgressListener;
>
> typedef enum VVCTaskStage {
> + VVC_TASK_STAGE_INIT, // for CTU(0, 0) only
> VVC_TASK_STAGE_PARSE,
> VVC_TASK_STAGE_INTER,
> VVC_TASK_STAGE_RECON,
> @@ -175,10 +176,14 @@ static int task_has_target_score(VVCTask *t, const
> VVCTaskStage stage, const uin
> uint8_t target = 0;
> VVCFrameContext *fc = t->fc;
>
> + if (stage == VVC_TASK_STAGE_INIT)
> + return 1;
> +
> if (stage == VVC_TASK_STAGE_PARSE) {
> - const H266RawSPS *rsps = fc->ps.sps->r;
> - const int wpp = rsps->sps_entropy_coding_sync_enabled_flag &&
> !is_first_row(fc, t->rx, t->ry);
> - target = 2 + wpp - 1; //left parse +
> colocation + wpp - no previous stage
> + const H266RawSPS *rsps = fc->ps.sps->r;
> + const int wpp =
> rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx,
> t->ry);
> + const int no_prev_stage = t->rs > 0;
> + target = 2 + wpp - no_prev_stage;
> //left parse + colocation + wpp - no_prev_stage
> } else if (stage == VVC_TASK_STAGE_INTER) {
> target = atomic_load(&t->target_inter_score);
> } else {
> @@ -399,6 +404,55 @@ static int task_priority_higher(const AVTask *_a,
> const AVTask *_b)
> return a->ry < b->ry;
> }
>
> +static void check_colocation(VVCContext *s, VVCTask *t)
> +{
> + const VVCFrameContext *fc = t->fc;
> +
> + if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag ||
> fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
> + VVCFrame *col = fc->ref->collocated_ref;
> + const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
> + if (col && first_col) {
> + //we depend on bottom and right boundary, do not - 1 for y
> + const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
> + add_progress_listener(col, &t->col_listener, t, s,
> VVC_PROGRESS_MV, y);
> + return;
> + }
> + }
> + frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
> +}
> +
> +static void submit_entry_point(VVCContext *s, VVCFrameThread *ft,
> SliceContext *sc, EntryPoint *ep)
> +{
> + const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
> + VVCTask *t = ft->tasks + rs;
> +
> + frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
> +}
> +
> +static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
> +{
> + VVCFrameContext *fc = lc->fc;
> + VVCFrameThread *ft = fc->ft;
> + const int ret = ff_vvc_per_frame_init(fc);
> +
> + if (ret < 0)
> + return ret;
> +
> + for (int i = 0; i < fc->nb_slices; i++) {
> + SliceContext *sc = fc->slices[i];
> + for (int j = 0; j < sc->nb_eps; j++) {
> + EntryPoint *ep = sc->eps + j;
> + for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
> + const int rs = sc->sh.ctb_addr_in_curr_slice[k];
> + VVCTask *t = ft->tasks + rs;
> + check_colocation(s, t);
> + }
> + submit_entry_point(s, ft, sc, ep);
> + }
> + }
> + return 0;
> +}
> +
> static void report_frame_progress(VVCFrameContext *fc,
> const int ry, const VVCProgress idx)
> {
> @@ -547,6 +601,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc,
> VVCTask *t)
> #define VVC_THREAD_DEBUG
> #ifdef VVC_THREAD_DEBUG
> const static char* task_name[] = {
> + "INIT",
> "P",
> "I",
> "R",
> @@ -567,6 +622,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s,
> VVCLocalContext *lc)
> VVCFrameThread *ft = fc->ft;
> const VVCTaskStage stage = t->stage;
> static const run_func run[] = {
> + run_init,
> run_parse,
> run_inter,
> run_recon,
> @@ -726,7 +782,7 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc)
>
> for (int rs = 0; rs < ft->ctu_count; rs++) {
> VVCTask *t = ft->tasks + rs;
> - task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs /
> ft->ctu_width);
> + task_init(t, rs ? VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT, fc,
> rs % ft->ctu_width, rs / ft->ctu_width);
> }
>
> memset(&ft->row_progress[0], 0, sizeof(ft->row_progress));
> @@ -745,59 +801,25 @@ fail:
> return AVERROR(ENOMEM);
> }
>
> -static void check_colocation(VVCContext *s, VVCTask *t)
> -{
> - const VVCFrameContext *fc = t->fc;
> -
> - if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag ||
> fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
> - VVCFrame *col = fc->ref->collocated_ref;
> - const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
> - if (col && first_col) {
> - //we depend on bottom and right boundary, do not - 1 for y
> - const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
> - add_progress_listener(col, &t->col_listener, t, s,
> VVC_PROGRESS_MV, y);
> - return;
> - }
> - }
> - frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
> -}
> -
> -static void submit_entry_point(VVCContext *s, VVCFrameThread *ft,
> SliceContext *sc, EntryPoint *ep)
> -{
> - const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
> - VVCTask *t = ft->tasks + rs;
> -
> - frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
> -}
> -
> int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc)
> {
> VVCFrameThread *ft = fc->ft;
>
> - // We'll handle this in two passes:
> - // Pass 0 to initialize tasks with parser, this will help detect bit
> stream error
> - // Pass 1 to shedule location check and submit the entry point
> - for (int pass = 0; pass < 2; pass++) {
> - for (int i = 0; i < fc->nb_slices; i++) {
> - SliceContext *sc = fc->slices[i];
> - for (int j = 0; j < sc->nb_eps; j++) {
> - EntryPoint *ep = sc->eps + j;
> - for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
> - const int rs = sc->sh.ctb_addr_in_curr_slice[k];
> - VVCTask *t = ft->tasks + rs;
> - if (pass) {
> - check_colocation(s, t);
> - } else {
> - const int ret = task_init_parse(t, sc, ep, k);
> - if (ret < 0)
> - return ret;
> - }
> - }
> - if (pass)
> - submit_entry_point(s, ft, sc, ep);
> + for (int i = 0; i < fc->nb_slices; i++) {
> + SliceContext *sc = fc->slices[i];
> + for (int j = 0; j < sc->nb_eps; j++) {
> + EntryPoint *ep = sc->eps + j;
> + for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
> + const int rs = sc->sh.ctb_addr_in_curr_slice[k];
> + VVCTask *t = ft->tasks + rs;
> + const int ret = task_init_parse(t, sc, ep, k);
> + if (ret < 0)
> + return ret;
> }
> }
> }
> + frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT);
> +
> return 0;
> }
>
> diff --git a/libavcodec/vvc/thread.h b/libavcodec/vvc/thread.h
> index 8ac59b2ecf..7b15dbee59 100644
> --- a/libavcodec/vvc/thread.h
> +++ b/libavcodec/vvc/thread.h
> @@ -32,5 +32,6 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc);
> void ff_vvc_frame_thread_free(VVCFrameContext *fc);
> int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc);
> int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc);
> +int ff_vvc_per_frame_init(VVCFrameContext *fc);
>
> #endif // AVCODEC_VVC_THREAD_H
> --
> 2.34.1
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2024-08-11 14:01 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20240728031807.462810-1-nuomi2021@gmail.com>
2024-07-28 3:17 ` [FFmpeg-devel] [PATCH 02/11] avcodec/vvcdec: refact, combine bs tab with tu tab Nuo Mi
2024-07-28 3:17 ` [FFmpeg-devel] [PATCH 03/11] avcodec/vvcdec: remove unnecessary perframe initializations Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 04/11] avcodec/vvcdec: split ctu table to zero init and no zero init parts Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 05/11] avcodec/vvcdec: refact out is_available from is_a0_available Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 06/11] avcodec/vvcdec: do not zero frame mvf table Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 07/11] avcodec/vvcdec: check_available, use && instead of &= for shortcut evaluation Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 08/11] avcodec/vvcdec: do not zero frame cpm table Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 09/11] avcodec/vvcdec: do not zero frame msf mmi table Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 10/11] avcodec/vvcdec: do not zero frame qp table Nuo Mi
2024-07-28 3:18 ` [FFmpeg-devel] [PATCH 11/11] avcodec/vvcdec: move frame tab memset from the main thread to worker threads Nuo Mi
2024-08-11 14:01 ` Nuo Mi [this message]
2024-08-15 12:45 ` Nuo Mi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFXK13eY5+pyYyUEn-OJnJACwY1MS7yvakkTsdvXRTPo5-a47A@mail.gmail.com \
--to=nuomi2021@gmail.com \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git