* [FFmpeg-devel] [PATCH V3 2/3] libavcodec/vaapi_encode: Change the way to call async to increase performance
2022-02-08 3:05 [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Wenbin Chen
@ 2022-02-08 3:05 ` Wenbin Chen
2022-02-11 4:24 ` Xiang, Haihao
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder " Wenbin Chen
2022-02-11 4:07 ` [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Xiang, Haihao
2 siblings, 1 reply; 8+ messages in thread
From: Wenbin Chen @ 2022-02-08 3:05 UTC (permalink / raw)
To: ffmpeg-devel
Fix: #7706. After commit 5fdcf85bbffe7451c2, the vaapi encoder's performance
decreased. The reason is that vaRenderPicture() and vaSyncBuffer() are
called at the same time (vaRenderPicture() is always followed by a
vaSyncBuffer()). When we encode a stream with B frames, we need a buffer to
reorder frames, so we can send several frames to HW at once to increase
performance. Now I changed them to be called in an asynchronous way, which
will make better use of hardware. 1080p transcoding increases about 17%
fps on my environment.
This change relies on vaSyncBuffer(), so if the driver does not support
vaSyncBuffer(), the encoder keeps its previous behaviour.
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
libavcodec/vaapi_encode.c | 64 ++++++++++++++++++++++++++++++++-------
libavcodec/vaapi_encode.h | 5 +++
2 files changed, 58 insertions(+), 11 deletions(-)
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index b87b58a42b..15ddbbaa4a 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -984,8 +984,10 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx,
if (!pic && ctx->end_of_stream) {
--b_counter;
pic = ctx->pic_end;
- if (pic->encode_issued)
+ if (pic->encode_complete)
return AVERROR_EOF;
+ else if (pic->encode_issued)
+ return AVERROR(EAGAIN);
}
if (!pic) {
@@ -1210,18 +1212,44 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
return AVERROR(EAGAIN);
}
- pic = NULL;
- err = vaapi_encode_pick_next(avctx, &pic);
- if (err < 0)
- return err;
- av_assert0(pic);
+#if VA_CHECK_VERSION(1, 9, 0)
+ if (ctx->has_sync_buffer_func) {
+ while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES) {
+ pic = NULL;
+ err = vaapi_encode_pick_next(avctx, &pic);
+ if (err < 0)
+ break;
+
+ av_assert0(pic);
+ pic->encode_order = ctx->encode_order +
+ av_fifo_can_read(ctx->encode_fifo);
+ err = vaapi_encode_issue(avctx, pic);
+ if (err < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+ return err;
+ }
+ av_fifo_write(ctx->encode_fifo, &pic, 1);
+ }
+ if (!av_fifo_can_read(ctx->encode_fifo))
+ return err;
+ av_fifo_read(ctx->encode_fifo, &pic, 1);
+ ctx->encode_order = pic->encode_order + 1;
+ } else
+#endif
+ {
+ pic = NULL;
+ err = vaapi_encode_pick_next(avctx, &pic);
+ if (err < 0)
+ return err;
+ av_assert0(pic);
- pic->encode_order = ctx->encode_order++;
+ pic->encode_order = ctx->encode_order++;
- err = vaapi_encode_issue(avctx, pic);
- if (err < 0) {
- av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
- return err;
+ err = vaapi_encode_issue(avctx, pic);
+ if (err < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
+ return err;
+ }
}
err = vaapi_encode_output(avctx, pic, pkt);
@@ -2555,6 +2583,19 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
}
}
+#if VA_CHECK_VERSION(1, 9, 0)
+ //check vaSyncBuffer function
+ vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
+ if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
+ ctx->has_sync_buffer_func = 1;
+ ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
+ sizeof(VAAPIEncodePicture *),
+ 0);
+ if (!ctx->encode_fifo)
+ return AVERROR(ENOMEM);
+ }
+#endif
+
return 0;
fail:
@@ -2592,6 +2633,7 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
av_freep(&ctx->codec_sequence_params);
av_freep(&ctx->codec_picture_params);
+ av_fifo_freep2(&ctx->encode_fifo);
av_buffer_unref(&ctx->recon_frames_ref);
av_buffer_unref(&ctx->input_frames_ref);
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index b41604a883..d33a486cb8 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -29,6 +29,7 @@
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_vaapi.h"
+#include "libavutil/fifo.h"
#include "avcodec.h"
#include "hwconfig.h"
@@ -345,6 +346,10 @@ typedef struct VAAPIEncodeContext {
int roi_warned;
AVFrame *frame;
+ //Store buffered pic
+ AVFifo *encode_fifo;
+ //Whether the driver support vaSyncBuffer
+ int has_sync_buffer_func;
} VAAPIEncodeContext;
enum {
--
2.32.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH V3 2/3] libavcodec/vaapi_encode: Change the way to call async to increase performance
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 2/3] libavcodec/vaapi_encode: Change the way to call async to increase performance Wenbin Chen
@ 2022-02-11 4:24 ` Xiang, Haihao
0 siblings, 0 replies; 8+ messages in thread
From: Xiang, Haihao @ 2022-02-11 4:24 UTC (permalink / raw)
To: ffmpeg-devel
On Tue, 2022-02-08 at 11:05 +0800, Wenbin Chen wrote:
> Fix: #7706. After commit 5fdcf85bbffe7451c2, vaapi encoder's performance
> decrease. The reason is that vaRenderPicture() and vaSyncBuffer() are
> called at the same time (vaRenderPicture() always followed by a
> vaSyncBuffer()). When we encode stream with B frames, we need buffer to
> reorder frames, so we can send serveral frames to HW at once to increase
> performance. Now I changed them to be called in a asynchronous way, which
> will make better use of hardware. 1080p transcoding increases about 17%
> fps on my environment.
>
> This change fits vaSyncBuffer(), so if driver does not support
> vaSyncBuffer, it will keep previous operation.
>
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
> libavcodec/vaapi_encode.c | 64 ++++++++++++++++++++++++++++++++-------
> libavcodec/vaapi_encode.h | 5 +++
> 2 files changed, 58 insertions(+), 11 deletions(-)
>
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index b87b58a42b..15ddbbaa4a 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -984,8 +984,10 @@ static int vaapi_encode_pick_next(AVCodecContext *avctx,
> if (!pic && ctx->end_of_stream) {
> --b_counter;
> pic = ctx->pic_end;
> - if (pic->encode_issued)
> + if (pic->encode_complete)
> return AVERROR_EOF;
> + else if (pic->encode_issued)
> + return AVERROR(EAGAIN);
> }
>
> if (!pic) {
> @@ -1210,18 +1212,44 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
> return AVERROR(EAGAIN);
> }
>
> - pic = NULL;
> - err = vaapi_encode_pick_next(avctx, &pic);
> - if (err < 0)
> - return err;
> - av_assert0(pic);
> +#if VA_CHECK_VERSION(1, 9, 0)
There is no need to check the version at compile time here, because
vaSyncBuffer() is not called directly in the code below.
> + if (ctx->has_sync_buffer_func) {
> + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES)
> {
> + pic = NULL;
> + err = vaapi_encode_pick_next(avctx, &pic);
> + if (err < 0)
> + break;
> +
> + av_assert0(pic);
> + pic->encode_order = ctx->encode_order +
> + av_fifo_can_read(ctx->encode_fifo);
> + err = vaapi_encode_issue(avctx, pic);
> + if (err < 0) {
> + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> + return err;
> + }
> + av_fifo_write(ctx->encode_fifo, &pic, 1);
> + }
> + if (!av_fifo_can_read(ctx->encode_fifo))
> + return err;
> + av_fifo_read(ctx->encode_fifo, &pic, 1);
> + ctx->encode_order = pic->encode_order + 1;
> + } else
> +#endif
> + {
> + pic = NULL;
> + err = vaapi_encode_pick_next(avctx, &pic);
> + if (err < 0)
> + return err;
> + av_assert0(pic);
>
> - pic->encode_order = ctx->encode_order++;
> + pic->encode_order = ctx->encode_order++;
>
> - err = vaapi_encode_issue(avctx, pic);
> - if (err < 0) {
> - av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> - return err;
> + err = vaapi_encode_issue(avctx, pic);
> + if (err < 0) {
> + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> + return err;
> + }
> }
>
> err = vaapi_encode_output(avctx, pic, pkt);
> @@ -2555,6 +2583,19 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
> }
> }
>
> +#if VA_CHECK_VERSION(1, 9, 0)
> + //check vaSyncBuffer function
> + vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
Buffer ID 0 (the 2nd parameter) might be a valid buffer. However, we don't need
to synchronize with a real buffer here, so we may use VA_INVALID_ID instead.
> + if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
> + ctx->has_sync_buffer_func = 1;
> + ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
> + sizeof(VAAPIEncodePicture *),
> + 0);
> + if (!ctx->encode_fifo)
> + return AVERROR(ENOMEM);
> + }
> +#endif
> +
> return 0;
>
> fail:
> @@ -2592,6 +2633,7 @@ av_cold int ff_vaapi_encode_close(AVCodecContext *avctx)
>
> av_freep(&ctx->codec_sequence_params);
> av_freep(&ctx->codec_picture_params);
> + av_fifo_freep2(&ctx->encode_fifo);
>
> av_buffer_unref(&ctx->recon_frames_ref);
> av_buffer_unref(&ctx->input_frames_ref);
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index b41604a883..d33a486cb8 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -29,6 +29,7 @@
>
> #include "libavutil/hwcontext.h"
> #include "libavutil/hwcontext_vaapi.h"
> +#include "libavutil/fifo.h"
>
> #include "avcodec.h"
> #include "hwconfig.h"
> @@ -345,6 +346,10 @@ typedef struct VAAPIEncodeContext {
> int roi_warned;
>
> AVFrame *frame;
> + //Store buffered pic
It is better to keep the coding style consistent: always put one space between
// and your comment.
> + AVFifo *encode_fifo;
> + //Whether the driver support vaSyncBuffer
> + int has_sync_buffer_func;
How about checking whether the driver supports vaSyncBuffer in the first patch?
Thanks
Haihao
> } VAAPIEncodeContext;
>
> enum {
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance
2022-02-08 3:05 [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Wenbin Chen
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 2/3] libavcodec/vaapi_encode: Change the way to call async to increase performance Wenbin Chen
@ 2022-02-08 3:05 ` Wenbin Chen
2022-02-09 6:22 ` Chen, Wenbin
2022-02-11 4:43 ` Xiang, Haihao
2022-02-11 4:07 ` [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Xiang, Haihao
2 siblings, 2 replies; 8+ messages in thread
From: Wenbin Chen @ 2022-02-08 3:05 UTC (permalink / raw)
To: ffmpeg-devel
Add async_depth to increase the encoder's performance. Reuse encode_fifo as
the async buffer. The encoder submits all reordered frames to HW and then
checks the fifo size. If the fifo holds fewer than async_depth frames and the
frame at the head of the fifo is not ready, it returns AVERROR(EAGAIN) to
request more frames.
1080p transcoding (no B frames) with -async_depth=4 can increase performance
by 20% on my environment.
The async mode increases performance but also introduces frame delay.
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
---
libavcodec/vaapi_encode.c | 16 ++++++++++++----
libavcodec/vaapi_encode.h | 12 ++++++++++--
2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 15ddbbaa4a..432abf31f7 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
if (ctx->input_order == ctx->decode_delay)
ctx->dts_pts_diff = pic->pts - ctx->first_pts;
if (ctx->output_delay > 0)
- ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
+ ctx->ts_ring[ctx->input_order %
+ (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
pic->display_order = ctx->input_order;
++ctx->input_order;
@@ -1214,7 +1215,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
#if VA_CHECK_VERSION(1, 9, 0)
if (ctx->has_sync_buffer_func) {
- while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES) {
+ while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {
pic = NULL;
err = vaapi_encode_pick_next(avctx, &pic);
if (err < 0)
@@ -1232,6 +1233,13 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
}
if (!av_fifo_can_read(ctx->encode_fifo))
return err;
+ if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
+ !ctx->end_of_stream) {
+ av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
+ err = vaapi_encode_wait(avctx, pic, 0);
+ if (err < 0)
+ return err;
+ }
av_fifo_read(ctx->encode_fifo, &pic, 1);
ctx->encode_order = pic->encode_order + 1;
} else
@@ -1267,7 +1275,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
} else {
pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
- (3 * ctx->output_delay)];
+ (3 * ctx->output_delay + ctx->async_depth)];
}
av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts %"PRId64".\n",
pkt->pts, pkt->dts);
@@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
ctx->has_sync_buffer_func = 1;
- ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
+ ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
sizeof(VAAPIEncodePicture *),
0);
if (!ctx->encode_fifo)
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index d33a486cb8..691521387d 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -48,6 +48,7 @@ enum {
MAX_TILE_ROWS = 22,
// A.4.1: table A.6 allows at most 20 tile columns for any level.
MAX_TILE_COLS = 20,
+ MAX_ASYNC_DEPTH = 64,
};
extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
@@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
// Timestamp handling.
int64_t first_pts;
int64_t dts_pts_diff;
- int64_t ts_ring[MAX_REORDER_DELAY * 3];
+ int64_t ts_ring[MAX_REORDER_DELAY * 3 +
+ MAX_ASYNC_DEPTH];
// Slice structure.
int slice_block_rows;
@@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext {
AVFifo *encode_fifo;
//Whether the driver support vaSyncBuffer
int has_sync_buffer_func;
+ //Max number of frame buffered in encoder.
+ int async_depth;
} VAAPIEncodeContext;
enum {
@@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
{ "b_depth", \
"Maximum B-frame reference depth", \
OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
- { .i64 = 1 }, 1, INT_MAX, FLAGS }
+ { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
+ { "async_depth", "Maximum processing parallelism. " \
+ "Increase this to improve single channel performance", \
+ OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
+ { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
#define VAAPI_ENCODE_RC_MODE(name, desc) \
{ #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
--
2.32.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder " Wenbin Chen
@ 2022-02-09 6:22 ` Chen, Wenbin
2022-02-11 4:43 ` Xiang, Haihao
1 sibling, 0 replies; 8+ messages in thread
From: Chen, Wenbin @ 2022-02-09 6:22 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> async buffer. Encoder puts all reordered frame to HW and then check
> fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to require more frames.
>
> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> performance on my environment.
> The async increases performance but also introduces frame delay.
>
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
> libavcodec/vaapi_encode.c | 16 ++++++++++++----
> libavcodec/vaapi_encode.h | 12 ++++++++++--
> 2 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 15ddbbaa4a..432abf31f7 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int
> vaapi_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
> if (ctx->input_order == ctx->decode_delay)
> ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> if (ctx->output_delay > 0)
> - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic->pts;
> + ctx->ts_ring[ctx->input_order %
> + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
>
> pic->display_order = ctx->input_order;
> ++ctx->input_order;
> @@ -1214,7 +1215,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
>
> #if VA_CHECK_VERSION(1, 9, 0)
> if (ctx->has_sync_buffer_func) {
> - while (av_fifo_can_read(ctx->encode_fifo) <=
> MAX_PICTURE_REFERENCES) {
> + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {
This is a mistake: I should use "<" instead of "<=", or I can use
av_fifo_can_write() instead. I will update it.
> pic = NULL;
> err = vaapi_encode_pick_next(avctx, &pic);
> if (err < 0)
> @@ -1232,6 +1233,13 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> }
> if (!av_fifo_can_read(ctx->encode_fifo))
> return err;
> + if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
> + !ctx->end_of_stream) {
> + av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
> + err = vaapi_encode_wait(avctx, pic, 0);
> + if (err < 0)
> + return err;
> + }
> av_fifo_read(ctx->encode_fifo, &pic, 1);
> ctx->encode_order = pic->encode_order + 1;
> } else
> @@ -1267,7 +1275,7 @@ int
> ff_vaapi_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
> } else {
> pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> - (3 * ctx->output_delay)];
> + (3 * ctx->output_delay + ctx->async_depth)];
> }
> av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64"
> dts %"PRId64".\n",
> pkt->pts, pkt->dts);
> @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext
> *avctx)
> vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
> if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
> ctx->has_sync_buffer_func = 1;
> - ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
> + ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
> sizeof(VAAPIEncodePicture *),
> 0);
> if (!ctx->encode_fifo)
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index d33a486cb8..691521387d 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
> MAX_TILE_ROWS = 22,
> // A.4.1: table A.6 allows at most 20 tile columns for any level.
> MAX_TILE_COLS = 20,
> + MAX_ASYNC_DEPTH = 64,
> };
>
> extern const AVCodecHWConfigInternal *const
> ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
> // Timestamp handling.
> int64_t first_pts;
> int64_t dts_pts_diff;
> - int64_t ts_ring[MAX_REORDER_DELAY * 3];
> + int64_t ts_ring[MAX_REORDER_DELAY * 3 +
> + MAX_ASYNC_DEPTH];
>
> // Slice structure.
> int slice_block_rows;
> @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext {
> AVFifo *encode_fifo;
> //Whether the driver support vaSyncBuffer
> int has_sync_buffer_func;
> + //Max number of frame buffered in encoder.
> + int async_depth;
> } VAAPIEncodeContext;
>
> enum {
> @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
> { "b_depth", \
> "Maximum B-frame reference depth", \
> OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> - { .i64 = 1 }, 1, INT_MAX, FLAGS }
> + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> + { "async_depth", "Maximum processing parallelism. " \
> + "Increase this to improve single channel performance", \
> + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
>
> #define VAAPI_ENCODE_RC_MODE(name, desc) \
> { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
> --
> 2.32.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder to increase performance
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder " Wenbin Chen
2022-02-09 6:22 ` Chen, Wenbin
@ 2022-02-11 4:43 ` Xiang, Haihao
1 sibling, 0 replies; 8+ messages in thread
From: Xiang, Haihao @ 2022-02-11 4:43 UTC (permalink / raw)
To: ffmpeg-devel
> Add async_depth to increase encoder's performance. Reuse encode_fifo as
> async buffer. Encoder puts all reordered frame to HW and then check
> fifo size. If fifo < async_depth and the top frame is not ready, it will
> return AVERROR(EAGAIN) to require more frames.
>
> 1080p transcoding (no B frames) with -async_depth=4 can increase 20%
> performance on my environment.
> The async increases performance but also introduces frame delay.
>
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
> libavcodec/vaapi_encode.c | 16 ++++++++++++----
> libavcodec/vaapi_encode.h | 12 ++++++++++--
> 2 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 15ddbbaa4a..432abf31f7 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -1158,7 +1158,8 @@ static int vaapi_encode_send_frame(AVCodecContext
> *avctx, AVFrame *frame)
> if (ctx->input_order == ctx->decode_delay)
> ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> if (ctx->output_delay > 0)
> - ctx->ts_ring[ctx->input_order % (3 * ctx->output_delay)] = pic-
> >pts;
> + ctx->ts_ring[ctx->input_order %
> + (3 * ctx->output_delay + ctx->async_depth)] = pic-
> >pts;
>
> pic->display_order = ctx->input_order;
> ++ctx->input_order;
> @@ -1214,7 +1215,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
>
> #if VA_CHECK_VERSION(1, 9, 0)
> if (ctx->has_sync_buffer_func) {
> - while (av_fifo_can_read(ctx->encode_fifo) <= MAX_PICTURE_REFERENCES)
> {
> + while (av_fifo_can_read(ctx->encode_fifo) <= MAX_ASYNC_DEPTH) {
> pic = NULL;
> err = vaapi_encode_pick_next(avctx, &pic);
> if (err < 0)
> @@ -1232,6 +1233,13 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
> }
> if (!av_fifo_can_read(ctx->encode_fifo))
> return err;
> + if (av_fifo_can_read(ctx->encode_fifo) < ctx->async_depth &&
> + !ctx->end_of_stream) {
> + av_fifo_peek(ctx->encode_fifo, &pic, 1, 0);
> + err = vaapi_encode_wait(avctx, pic, 0);
> + if (err < 0)
> + return err;
> + }
> av_fifo_read(ctx->encode_fifo, &pic, 1);
> ctx->encode_order = pic->encode_order + 1;
> } else
> @@ -1267,7 +1275,7 @@ int ff_vaapi_encode_receive_packet(AVCodecContext
> *avctx, AVPacket *pkt)
> pkt->dts = ctx->ts_ring[pic->encode_order] - ctx->dts_pts_diff;
> } else {
> pkt->dts = ctx->ts_ring[(pic->encode_order - ctx->decode_delay) %
> - (3 * ctx->output_delay)];
> + (3 * ctx->output_delay + ctx->async_depth)];
> }
> av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64" dts
> %"PRId64".\n",
> pkt->pts, pkt->dts);
> @@ -2588,7 +2596,7 @@ av_cold int ff_vaapi_encode_init(AVCodecContext *avctx)
> vas = vaSyncBuffer(ctx->hwctx->display, 0, 0);
> if (vas != VA_STATUS_ERROR_UNIMPLEMENTED) {
> ctx->has_sync_buffer_func = 1;
> - ctx->encode_fifo = av_fifo_alloc2(MAX_PICTURE_REFERENCES + 1,
> + ctx->encode_fifo = av_fifo_alloc2(MAX_ASYNC_DEPTH,
> sizeof(VAAPIEncodePicture *),
> 0);
> if (!ctx->encode_fifo)
> diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
> index d33a486cb8..691521387d 100644
> --- a/libavcodec/vaapi_encode.h
> +++ b/libavcodec/vaapi_encode.h
> @@ -48,6 +48,7 @@ enum {
> MAX_TILE_ROWS = 22,
> // A.4.1: table A.6 allows at most 20 tile columns for any level.
> MAX_TILE_COLS = 20,
> + MAX_ASYNC_DEPTH = 64,
> };
>
> extern const AVCodecHWConfigInternal *const ff_vaapi_encode_hw_configs[];
> @@ -298,7 +299,8 @@ typedef struct VAAPIEncodeContext {
> // Timestamp handling.
> int64_t first_pts;
> int64_t dts_pts_diff;
> - int64_t ts_ring[MAX_REORDER_DELAY * 3];
> + int64_t ts_ring[MAX_REORDER_DELAY * 3 +
> + MAX_ASYNC_DEPTH];
>
> // Slice structure.
> int slice_block_rows;
> @@ -350,6 +352,8 @@ typedef struct VAAPIEncodeContext {
> AVFifo *encode_fifo;
> //Whether the driver support vaSyncBuffer
> int has_sync_buffer_func;
> + //Max number of frame buffered in encoder.
> + int async_depth;
> } VAAPIEncodeContext;
>
> enum {
> @@ -460,7 +464,11 @@ int ff_vaapi_encode_close(AVCodecContext *avctx);
> { "b_depth", \
> "Maximum B-frame reference depth", \
> OFFSET(common.desired_b_depth), AV_OPT_TYPE_INT, \
> - { .i64 = 1 }, 1, INT_MAX, FLAGS }
> + { .i64 = 1 }, 1, INT_MAX, FLAGS }, \
> + { "async_depth", "Maximum processing parallelism. " \
> + "Increase this to improve single channel performance", \
async_depth is not available if vaSyncBuffer is not implemented, so it would be
better to mention that in the help string.
Thanks
Haihao
> + OFFSET(common.async_depth), AV_OPT_TYPE_INT, \
> + { .i64 = 4 }, 0, MAX_ASYNC_DEPTH, FLAGS }
>
> #define VAAPI_ENCODE_RC_MODE(name, desc) \
> { #name, desc, 0, AV_OPT_TYPE_CONST, { .i64 = RC_MODE_ ## name }, \
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode
2022-02-08 3:05 [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Wenbin Chen
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 2/3] libavcodec/vaapi_encode: Change the way to call async to increase performance Wenbin Chen
2022-02-08 3:05 ` [FFmpeg-devel] [PATCH V3 3/3] libavcodec/vaapi_encode: Add async_depth to vaapi_encoder " Wenbin Chen
@ 2022-02-11 4:07 ` Xiang, Haihao
2022-02-11 5:16 ` Chen, Wenbin
2 siblings, 1 reply; 8+ messages in thread
From: Xiang, Haihao @ 2022-02-11 4:07 UTC (permalink / raw)
To: ffmpeg-devel
On Tue, 2022-02-08 at 11:05 +0800, Wenbin Chen wrote:
> Add vaSyncBuffer to VAAPI encoder. Old version API vaSyncSurface wait
> surface to complete. When surface is used for multiple operation, it
> waits all operations to finish. vaSyncBuffer only wait one channel to
> finish.
>
> Add wait param to vaapi_encode_wait() to prepare for the async_depth
> option. "wait=1" means wait until operation ready. "wait=0" means
> query operation's status. If it is ready return 0, if it is still
> in progress return EAGAIN.
>
> Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> ---
> libavcodec/vaapi_encode.c | 47 +++++++++++++++++++++++++++++++++------
> 1 file changed, 40 insertions(+), 7 deletions(-)
>
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 3bf379b1a0..b87b58a42b 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -134,7 +134,8 @@ static int
> vaapi_encode_make_misc_param_buffer(AVCodecContext *avctx,
> }
>
> static int vaapi_encode_wait(AVCodecContext *avctx,
> - VAAPIEncodePicture *pic)
> + VAAPIEncodePicture *pic,
> + uint8_t wait)
> {
> VAAPIEncodeContext *ctx = avctx->priv_data;
> VAStatus vas;
> @@ -150,11 +151,43 @@ static int vaapi_encode_wait(AVCodecContext *avctx,
> "(input surface %#x).\n", pic->display_order,
> pic->encode_order, pic->input_surface);
>
> - vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> - if (vas != VA_STATUS_SUCCESS) {
> - av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture completion: "
> - "%d (%s).\n", vas, vaErrorStr(vas));
> +#if VA_CHECK_VERSION(1, 9, 0)
> + // Try vaSyncBuffer.
> + vas = vaSyncBuffer(ctx->hwctx->display,
> + pic->output_buffer,
> + wait ? VA_TIMEOUT_INFINITE : 0);
> + if (vas == VA_STATUS_ERROR_TIMEDOUT) {
> + return AVERROR(EAGAIN);
> + } else if (vas != VA_STATUS_SUCCESS && vas !=
> VA_STATUS_ERROR_UNIMPLEMENTED) {
> + av_log(avctx, AV_LOG_ERROR, "Failed to sync to output buffer
> completion: "
> + "%d (%s).\n", vas, vaErrorStr(vas));
> return AVERROR(EIO);
We could add the has_sync_buffer_func flag in this patch and run the above code
only when ctx->has_sync_buffer_func is true. Then we wouldn't need to check
again whether vaSyncBuffer is implemented.
Thanks
Haihao
> + } else if (vas == VA_STATUS_ERROR_UNIMPLEMENTED)
> + // If vaSyncBuffer is not implemented, try old version API.
> +#endif
> + {
> + if (!wait) {
> + VASurfaceStatus surface_status;
> + vas = vaQuerySurfaceStatus(ctx->hwctx->display,
> + pic->input_surface,
> + &surface_status);
> + if (vas == VA_STATUS_SUCCESS &&
> + surface_status != VASurfaceReady &&
> + surface_status != VASurfaceSkipped) {
> + return AVERROR(EAGAIN);
> + } else if (vas != VA_STATUS_SUCCESS) {
> + av_log(avctx, AV_LOG_ERROR, "Failed to query surface status:
> "
> + "%d (%s).\n", vas, vaErrorStr(vas));
> + return AVERROR(EIO);
> + }
> + } else {
> + vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> + if (vas != VA_STATUS_SUCCESS) {
> + av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture
> completion: "
> + "%d (%s).\n", vas, vaErrorStr(vas));
> + return AVERROR(EIO);
> + }
> + }
> }
>
> // Input is definitely finished with now.
> @@ -633,7 +666,7 @@ static int vaapi_encode_output(AVCodecContext *avctx,
> uint8_t *ptr;
> int err;
>
> - err = vaapi_encode_wait(avctx, pic);
> + err = vaapi_encode_wait(avctx, pic, 1);
> if (err < 0)
> return err;
>
> @@ -695,7 +728,7 @@ fail:
> static int vaapi_encode_discard(AVCodecContext *avctx,
> VAAPIEncodePicture *pic)
> {
> - vaapi_encode_wait(avctx, pic);
> + vaapi_encode_wait(avctx, pic, 1);
>
> if (pic->output_buffer_ref) {
> av_log(avctx, AV_LOG_DEBUG, "Discard output for pic "
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode
2022-02-11 4:07 ` [FFmpeg-devel] [PATCH V3 1/3] libavcodec/vaapi_encode: Add new API adaption to vaapi_encode Xiang, Haihao
@ 2022-02-11 5:16 ` Chen, Wenbin
0 siblings, 0 replies; 8+ messages in thread
From: Chen, Wenbin @ 2022-02-11 5:16 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Tue, 2022-02-08 at 11:05 +0800, Wenbin Chen wrote:
> > Add vaSyncBuffer to the VAAPI encoder. The old API, vaSyncSurface, waits
> > for the surface to complete. When a surface is used for multiple
> > operations, it waits for all of them to finish. vaSyncBuffer only waits
> > for one channel to finish.
> >
> > Add a wait parameter to vaapi_encode_wait() to prepare for the
> > async_depth option. "wait=1" means wait until the operation is ready.
> > "wait=0" means only query the operation's status: return 0 if it is
> > ready, or EAGAIN if it is still in progress.
> >
> > Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
> > ---
> > libavcodec/vaapi_encode.c | 47 +++++++++++++++++++++++++++++++++---
> ---
> > 1 file changed, 40 insertions(+), 7 deletions(-)
> >
> > diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> > index 3bf379b1a0..b87b58a42b 100644
> > --- a/libavcodec/vaapi_encode.c
> > +++ b/libavcodec/vaapi_encode.c
> > @@ -134,7 +134,8 @@ static int
> > vaapi_encode_make_misc_param_buffer(AVCodecContext *avctx,
> > }
> >
> > static int vaapi_encode_wait(AVCodecContext *avctx,
> > - VAAPIEncodePicture *pic)
> > + VAAPIEncodePicture *pic,
> > + uint8_t wait)
> > {
> > VAAPIEncodeContext *ctx = avctx->priv_data;
> > VAStatus vas;
> > @@ -150,11 +151,43 @@ static int vaapi_encode_wait(AVCodecContext
> *avctx,
> > "(input surface %#x).\n", pic->display_order,
> > pic->encode_order, pic->input_surface);
> >
> > - vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> > - if (vas != VA_STATUS_SUCCESS) {
> > - av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture completion: "
> > - "%d (%s).\n", vas, vaErrorStr(vas));
> > +#if VA_CHECK_VERSION(1, 9, 0)
> > + // Try vaSyncBuffer.
> > + vas = vaSyncBuffer(ctx->hwctx->display,
> > + pic->output_buffer,
> > + wait ? VA_TIMEOUT_INFINITE : 0);
> > + if (vas == VA_STATUS_ERROR_TIMEDOUT) {
> > + return AVERROR(EAGAIN);
> > + } else if (vas != VA_STATUS_SUCCESS && vas !=
> > VA_STATUS_ERROR_UNIMPLEMENTED) {
> > + av_log(avctx, AV_LOG_ERROR, "Failed to sync to output buffer
> > completion: "
> > + "%d (%s).\n", vas, vaErrorStr(vas));
> > return AVERROR(EIO);
>
> We could add a has_sync_buffer_func flag in this patch and run the above
> code only when ctx->has_sync_buffer_func is true. Then we wouldn't need to
> check again whether vaSyncBuffer is implemented.
>
> Thanks
> Haihao
Thanks for your review. I will update the patchset.
Thanks
Wenbin
>
>
> > + } else if (vas == VA_STATUS_ERROR_UNIMPLEMENTED)
> > + // If vaSyncBuffer is not implemented, try old version API.
> > +#endif
> > + {
> > + if (!wait) {
> > + VASurfaceStatus surface_status;
> > + vas = vaQuerySurfaceStatus(ctx->hwctx->display,
> > + pic->input_surface,
> > + &surface_status);
> > + if (vas == VA_STATUS_SUCCESS &&
> > + surface_status != VASurfaceReady &&
> > + surface_status != VASurfaceSkipped) {
> > + return AVERROR(EAGAIN);
> > + } else if (vas != VA_STATUS_SUCCESS) {
> > + av_log(avctx, AV_LOG_ERROR, "Failed to query surface status:
> > "
> > + "%d (%s).\n", vas, vaErrorStr(vas));
> > + return AVERROR(EIO);
> > + }
> > + } else {
> > + vas = vaSyncSurface(ctx->hwctx->display, pic->input_surface);
> > + if (vas != VA_STATUS_SUCCESS) {
> > + av_log(avctx, AV_LOG_ERROR, "Failed to sync to picture
> > completion: "
> > + "%d (%s).\n", vas, vaErrorStr(vas));
> > + return AVERROR(EIO);
> > + }
> > + }
> > }
> >
> > // Input is definitely finished with now.
> > @@ -633,7 +666,7 @@ static int vaapi_encode_output(AVCodecContext
> *avctx,
> > uint8_t *ptr;
> > int err;
> >
> > - err = vaapi_encode_wait(avctx, pic);
> > + err = vaapi_encode_wait(avctx, pic, 1);
> > if (err < 0)
> > return err;
> >
> > @@ -695,7 +728,7 @@ fail:
> > static int vaapi_encode_discard(AVCodecContext *avctx,
> > VAAPIEncodePicture *pic)
> > {
> > - vaapi_encode_wait(avctx, pic);
> > + vaapi_encode_wait(avctx, pic, 1);
> >
> > if (pic->output_buffer_ref) {
> > av_log(avctx, AV_LOG_DEBUG, "Discard output for pic "
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 8+ messages in thread