I added some instrumentation via the attached patch. You can see the benefits here: Before=1683378057.243350 After 1683378057.264239 Before=1683378083.335424 After 1683378083.356440 Before=1683378089.675400 After 1683378089.696512 Before=1683378151.792324 After 1683378151.813579 21 ms per run After patch: Before=1683378222.167796 After 1683378222.175760 Before=1683378233.131416 After 1683378233.139326 Before=1683378243.591895 After 1683378243.599840 8 ms per run Note: this is a different platform from the one I did the original development on, and apparently the improvement on this particular box is only 2.5x rather than 4x. Devin On Sat, May 6, 2023 at 7:53 AM Paul B Mahol wrote: > > On Sat, May 6, 2023 at 1:32 PM Lance Wang wrote: > > > On Sat, May 6, 2023 at 4:58 AM Devin Heitmueller < > > devin.heitmueller@ltnglobal.com> wrote: > > > > > Rework the code a bit to speed up the 10-bit bitpacked decoding > > > routine. This is probably about as fast as I can get it without > > > switching to assembly language. > > > > > > Demonstrable with: > > > > > > ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -f image2 > > > -frames:v 1 source.yuv > > > ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked -i > > > source.yuv -pix_fmt yuv422p10le out.yuv > > > > > > On my development system, it went from 80ms for a 2160p frame > > > down to 20ms (i.e. a 4X speedup). Good enough for now, I hope... 
> > > > > > > > FYI, on my development system, I ran the original and modified > > versions twice each and saw no obvious difference: > > ./ffmpeg -f lavfi -i "smptehdbars=size=3840x2160" -c bitpacked -frames:v 25 > > source.yuv > > time ./ffmpeg -f bitpacked -pix_fmt yuv422p10le -s 3840x2160 -c:v bitpacked > > -i source.yuv -pix_fmt yuv422p10le out.yuv > > frame= 25 fps=0.0 q=-0.0 Lsize= 810000kB time=00:00:00.96 > > bitrate=6912000.0kbits/s speed=1.13x > > > > real 0m0.961s > > user 0m1.086s > > sys 0m1.360s > > > > frame= 25 fps=0.0 q=-0.0 Lsize= 810000kB time=00:00:00.96 > > bitrate=6912000.0kbits/s speed=1.16x > > > > real 0m0.936s > > user 0m1.358s > > sys 0m1.350s > > > > After applying the patch: > > frame= 25 fps=0.0 q=-0.0 Lsize= 810000kB time=00:00:00.96 > > bitrate=6912000.0kbits/s speed=1.14x > > > > real 0m0.953s > > user 0m0.906s > > sys 0m1.438s > > > > frame= 25 fps=0.0 q=-0.0 Lsize= 810000kB time=00:00:00.96 > > bitrate=6912000.0kbits/s speed=1.17x > > > > real 0m0.922s > > user 0m0.926s > > sys 0m1.066s > > > > Only 25 frames? > This is flawed. 
> > > > > > > > > > > Signed-off-by: Devin Heitmueller > > > --- > > > libavcodec/bitpacked_dec.c | 17 +++++++---------- > > > 1 file changed, 7 insertions(+), 10 deletions(-) > > > > > > diff --git a/libavcodec/bitpacked_dec.c b/libavcodec/bitpacked_dec.c > > > index a1ffef1..96aba27 100644 > > > --- a/libavcodec/bitpacked_dec.c > > > +++ b/libavcodec/bitpacked_dec.c > > > @@ -28,7 +28,6 @@ > > > > > > #include "avcodec.h" > > > #include "codec_internal.h" > > > -#include "get_bits.h" > > > #include "libavutil/imgutils.h" > > > #include "thread.h" > > > > > > @@ -65,7 +64,7 @@ static int bitpacked_decode_yuv422p10(AVCodecContext > > > *avctx, AVFrame *frame, > > > { > > > uint64_t frame_size = (uint64_t)avctx->width * > > > (uint64_t)avctx->height * 20; > > > uint64_t packet_size = (uint64_t)avpkt->size * 8; > > > - GetBitContext bc; > > > + uint8_t *src; > > > uint16_t *y, *u, *v; > > > int ret, i, j; > > > > > > @@ -79,20 +78,18 @@ static int bitpacked_decode_yuv422p10(AVCodecContext > > > *avctx, AVFrame *frame, > > > if (avctx->width % 2) > > > return AVERROR_PATCHWELCOME; > > > > > > - ret = init_get_bits(&bc, avpkt->data, avctx->width * avctx->height * > > > 20); > > > - if (ret) > > > - return ret; > > > - > > > + src = avpkt->data; > > > for (i = 0; i < avctx->height; i++) { > > > y = (uint16_t*)(frame->data[0] + i * frame->linesize[0]); > > > u = (uint16_t*)(frame->data[1] + i * frame->linesize[1]); > > > v = (uint16_t*)(frame->data[2] + i * frame->linesize[2]); > > > > > > for (j = 0; j < avctx->width; j += 2) { > > > - *u++ = get_bits(&bc, 10); > > > - *y++ = get_bits(&bc, 10); > > > - *v++ = get_bits(&bc, 10); > > > - *y++ = get_bits(&bc, 10); > > > + *u++ = (src[0] << 2) | (src[1] >> 6); > > > + *y++ = ((src[1] << 4) | (src[2] >> 4)) & 0x3ff; > > > + *v++ = ((src[2] << 6) | (src[3] >> 2)) & 0x3ff; > > > + *y++ = ((src[3] << 8) | (src[4])) & 0x3ff; > > > + src += 5; > > > } > > > } > > > > > > -- > > > 1.8.3.1 > > > > > > 
_______________________________________________ > > > ffmpeg-devel mailing list > > > ffmpeg-devel@ffmpeg.org > > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > > > To unsubscribe, visit link above, or email > > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > To unsubscribe, visit link above, or email > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". -- Devin Heitmueller, Senior Software Engineer LTN Global Communications o: +1 (301) 363-1001 w: https://ltnglobal.com e: devin.heitmueller@ltnglobal.com