Re: [FFmpeg-devel] [PATCH 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance

From: Dennis Mungai <dmngaie@gmail.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance
Date: Sat, 25 Dec 2021 08:49:14 +0300
Message-ID: <CAKKYfmHNj_fjnEZd7tUfhRv=V=5jxjTDnrFZ0et4re6qo=scWA@mail.gmail.com> (raw)
In-Reply-To: <0100017deec0f420-2dd9f1a7-c38f-455f-8195-caf2195f7643-000000@email.amazonses.com>

On Sat, 25 Dec 2021, 02:23 Ed Martin, <lists@edman007.com> wrote:

> On 10/31/21 22:14, Chen, Wenbin wrote:
> >> Add async_depth to increase the encoder's performance. Reuse encode_fifo as
> >> the async buffer. The encoder submits all reordered frames to the HW and then
> >> checks the fifo size. If the fifo holds fewer than async_depth entries and the
> >> top frame is not ready, it returns AVERROR(EAGAIN) to request more frames.
> >>
> >> 1080p transcoding (no B frames) with -async_depth=4 can increase
> >> performance by 20% in my environment.
> >> Asynchronous operation increases performance but also introduces frame delay.
> >>
> >> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> >> ---
> >>   libavcodec/vaapi_encode.c | 20 +++++++++++++++-----
> >>   libavcodec/vaapi_encode.h | 12 ++++++++++--
> >>   2 files changed, 25 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> >> index db0ae136a1..616fb7c089 100644
> >> --- a/libavcodec/vaapi_encode.c
> >> +++ b/libavcodec/vaapi_encode.c
> >> @@ -1158,7 +1158,8 @@ static int
> >> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
> >>           if (ctx->input_order == ctx->decode_delay)
> >>               ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> >>           if (ctx->output_delay > 0)
> >> -            ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] =
> pic->pts;
> >> +            ctx->ts_ring[ctx->input_order %
> >> +                        (3 * ctx->output_delay + ctx->async_depth)] =
> pic->pts;
> >>
> >>           pic->display_order = ctx->input_order;
> >>           ++ctx->input_order;
> >> @@ -1212,7 +1213,8 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>               return AVERROR(EAGAIN);
> >>       }
> >>
> >> -    while (av_fifo_size(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES *
> >> sizeof(VAAPIEncodePicture *)) {
> >> +    while (av_fifo_size(ctx->encode_fifo) <
> >> +            MAX_ASYNC_DEPTH * sizeof(VAAPIEncodePicture *)) {
> >>           pic = NULL;
> >>           err = vaapi_encode_pick_next(avctx, &pic);
> >>           if (err < 0)
> >> @@ -1234,6 +1236,14 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>       if (!av_fifo_size(ctx->encode_fifo))
> >>           return err;
> >>
> >> +    if (av_fifo_size(ctx->encode_fifo) < ctx->async_depth *
> >> sizeof(VAAPIEncodePicture *) &&
> >> +        !ctx->end_of_stream) {
> >> +        av_fifo_generic_peek(ctx->encode_fifo, &pic, sizeof(pic),
> NULL);
> >> +        err = vaapi_encode_wait(avctx, pic, 0);
> >> +        if (err < 0)
> >> +            return err;
> >> +    }
> >> +
> >>       av_fifo_generic_read(ctx->encode_fifo, &pic, sizeof(pic), NULL);
> >>       ctx->encode_order = pic->encode_order + 1;
> >>
> >> @@ -1252,7 +1262,7 @@ int
> >> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> >>               pkt->dts = ctx->ts_ring[pic->encode_order] -
> ctx->dts_pts_diff;
> >>       } else {
> >>           pkt->dts = ctx->ts_ring[(pic->encode_order -
> ctx->decode_delay) %
> >> -                                (3 * ctx->output_delay)];
> >> +                                (3 * ctx->output_delay +
> ctx->async_depth)];
> >>       }
> >>       av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> >> dts %"PRId64".\n",
> >>              pkt->pts, pkt->dts);
> >> @@ -2566,8 +2576,8 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> >> *avctx)
> >>           }
> >>       }
> >>
> >> -    ctx->encode_fifo = av_fifo_alloc((MAX_PICTURE_REFERENCES + 1) *
> >> -                                      sizeof(VAAPIEncodePicture *));
> >> +    ctx->encode_fifo = av_fifo_alloc(MAX_ASYNC_DEPTH *
> >> +                                     sizeof(VAAPIEncodePicture *));
> >>       if (!ctx->encode_fifo)
> >>           return AVERROR(ENOMEM);
> >>
> >> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> >> index 89fe8de466..1bf5d7c337 100644
> >> --- a/libavcodec/vaapi_encode.h
> >> +++ b/libavcodec/vaapi_encode.h
> >> @@ -48,6 +48,7 @@ enum {
> >>       MAX_TILE_ROWS          = 22,
> >>       // A.4.1: table A.6 allows at most 20 tile columns for any level.
> >>       MAX_TILE_COLS          = 20,
> >> +    MAX_ASYNC_DEPTH        = 64,
> >>   };
> >>
> >>   extern const AVCodecHWConfigInternal *const
> >> ff_vaapi_encode_hw_configs[];
> >> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
> >>       // Timestamp handling.
> >>       int64_t         first_pts;
> >>       int64_t         dts_pts_diff;
> >> -    int64_t         ts_ring[MAX_REORDER_DELAY * 3];
> >> +    int64_t         ts_ring[MAX_REORDER_DELAY * 3 +
> >> +                            MAX_ASYNC_DEPTH];
> >>
> >>       // Slice structure.
> >>       int slice_block_rows;
> >> @@ -348,6 +350,8 @@ typedef struct VAAPIEncodeContext {
> >>       AVFrame         *frame;
> >>
> >>       AVFifoBuffer *encode_fifo;
> >> +
> >> +    int async_depth;
> >>   } VAAPIEncodeContext;
> >>
> >>   enum {
> >> @@ -458,7 +462,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
> >>       { "b_depth", \
> >>         "Maximum B-frame reference depth", \
> >>         OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> >> -      { .i64 = 1 }, 1, INT_MAX, FLAGS }
> >> +      { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> >> +    { "async_depth", "Maximum processing parallelism. " \
> >> +      "Increase this to improve single channel performance", \
> >> +      OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> >> +      { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
> >>
> >>   #define VAAPI_ENCODE_RC_MODE(name, desc) \
> >>       { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name },
> \
> >> --
> >> 2.25.1
> > ping
>
> I tested this patchset and I can confirm that it solves my bug, which I
> had thought was a Mesa bug
> (https://gitlab.freedesktop.org/mesa/mesa/-/issues/1235)
>
>
> I would love it if this feature were incorporated into FFmpeg.

>
>
> Indeed, this is the only patch that makes AMD GPUs usable with VAAPI.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".