Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS
@ 2024-05-13 15:42 Tomas Härdin
  2024-05-13 15:43 ` [FFmpeg-devel] [PATCH 2/2] lavc/speedhqdec: Reindent Tomas Härdin
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Tomas Härdin @ 2024-05-13 15:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1432 bytes --]

On a 36 core Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
command: /usr/bin/time ./ffmpeg -t 30 -thread_type slice -threads
$THREADS -i $INPUT.mov -vcodec rawvideo -f null -

before:
frame= 1500 fps=160 q=-0.0 Lsize=N/A time=00:00:30.00 bitrate=N/A
speed= 3.2x    
10.54user 0.37system 0:09.40elapsed 116%CPU (0avgtext+0avgdata
175300maxresident)k

-thread_type slice -threads 1
frame= 1500 fps=161 q=-0.0 Lsize=N/A time=00:00:30.00 bitrate=N/A
speed=3.22x    
10.57user 0.29system 0:09.34elapsed 116%CPU (0avgtext+0avgdata
175580maxresident)k

-thread_type slice -threads 2
frame= 1500 fps=318 q=-0.0 Lsize=N/A time=00:00:30.00 bitrate=N/A
speed=6.36x    
10.53user 0.39system 0:04.73elapsed 230%CPU (0avgtext+0avgdata
175632maxresident)k

-thread_type slice -threads 4
frame= 1500 fps=615 q=-0.0 Lsize=N/A time=00:00:30.00 bitrate=N/A
speed=12.3x    
10.58user 0.34system 0:02.46elapsed 444%CPU (0avgtext+0avgdata
175452maxresident)k

-thread_type slice -threads 8
frame= 1500 fps=613 q=-0.0 Lsize=N/A time=00:00:30.00 bitrate=N/A
speed=12.3x    
10.60user 0.33system 0:02.46elapsed 443%CPU (0avgtext+0avgdata
180532maxresident)k
^ same as -threads 4, as we'd expect for progressive essence (only four slice jobs per frame)

I don't have any interlaced samples at the moment, and speedhqenc can't
make any. I also noticed speedhqenc produces broken output when width %
16 == 8. Will file a ticket on that tomorrow.

/Tomas

[-- Attachment #2: 0001-lavc-speedhqdec-Add-AV_CODEC_CAP_SLICE_THREADS.patch --]
[-- Type: text/x-patch, Size: 6394 bytes --]

From 29a0380a1537ba205ec91399512f676301d5e930 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Mon, 13 May 2024 16:36:31 +0200
Subject: [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS

Each field slice is assigned to one thread.
Serial performance is unaffected.
---
 libavcodec/speedhqdec.c | 59 ++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/libavcodec/speedhqdec.c b/libavcodec/speedhqdec.c
index d6b1fff7a5..77a159f7e5 100644
--- a/libavcodec/speedhqdec.c
+++ b/libavcodec/speedhqdec.c
@@ -58,6 +58,8 @@ typedef struct SHQContext {
     enum { SHQ_SUBSAMPLING_420, SHQ_SUBSAMPLING_422, SHQ_SUBSAMPLING_444 }
         subsampling;
     enum { SHQ_NO_ALPHA, SHQ_RLE_ALPHA, SHQ_DCT_ALPHA } alpha_type;
+    AVPacket *avpkt;
+    uint32_t second_field_offset;
 } SHQContext;
 
 /* NOTE: The first element is always 16, unscaled. */
@@ -266,9 +268,10 @@ static int decode_speedhq_border(const SHQContext *s, GetBitContext *gb, AVFrame
     return 0;
 }
 
-static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf_size, AVFrame *frame, int field_number, int start, int end, int line_stride)
+static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf_size, AVFrame *frame, int field_number, int start, int end, int line_stride, int slice_number)
 {
-    int ret, slice_number, slice_offsets[5];
+    int ret, x, y, slice_offsets[5];
+    uint32_t slice_begin, slice_end;
     int linesize_y  = frame->linesize[0] * line_stride;
     int linesize_cb = frame->linesize[1] * line_stride;
     int linesize_cr = frame->linesize[2] * line_stride;
@@ -283,21 +286,17 @@ static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf
 
     slice_offsets[0] = start;
     slice_offsets[4] = end;
-    for (slice_number = 1; slice_number < 4; slice_number++) {
+    for (x = 1; x < 4; x++) {
         uint32_t last_offset, slice_len;
 
-        last_offset = slice_offsets[slice_number - 1];
+        last_offset = slice_offsets[x - 1];
         slice_len = AV_RL24(buf + last_offset);
-        slice_offsets[slice_number] = last_offset + slice_len;
+        slice_offsets[x] = last_offset + slice_len;
 
-        if (slice_len < 3 || slice_offsets[slice_number] > end - 3)
+        if (slice_len < 3 || slice_offsets[x] > end - 3)
             return AVERROR_INVALIDDATA;
     }
 
-    for (slice_number = 0; slice_number < 4; slice_number++) {
-        uint32_t slice_begin, slice_end;
-        int x, y;
-
         slice_begin = slice_offsets[slice_number];
         slice_end = slice_offsets[slice_number + 1];
 
@@ -390,14 +389,34 @@ static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf
                 }
             }
         }
-    }
 
-    if (s->subsampling != SHQ_SUBSAMPLING_444 && (frame->width & 15))
+    if (s->subsampling != SHQ_SUBSAMPLING_444 && (frame->width & 15) && slice_number == 3)
         return decode_speedhq_border(s, &gb, frame, field_number, line_stride);
 
     return 0;
 }
 
+static int decode_slice_progressive(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    SHQContext *s = avctx->priv_data;
+    (void)threadnr;
+
+    return decode_speedhq_field(avctx->priv_data, s->avpkt->data, s->avpkt->size, arg, 0, 4, s->avpkt->size, 1, jobnr);
+}
+
+static int decode_slice_interlaced(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    SHQContext *s = avctx->priv_data;
+    int field_number = jobnr / 4;
+    int slice_number = jobnr % 4;
+    (void)threadnr;
+
+    if (field_number == 0)
+        return decode_speedhq_field(avctx->priv_data, s->avpkt->data, s->avpkt->size, arg, 0, 4, s->second_field_offset, 2, slice_number);
+    else
+        return decode_speedhq_field(avctx->priv_data, s->avpkt->data, s->avpkt->size, arg, 1, s->second_field_offset, s->avpkt->size, 2, slice_number);
+}
+
 static void compute_quant_matrix(int *output, int qscale)
 {
     int i;
@@ -411,7 +430,6 @@ static int speedhq_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     const uint8_t *buf   = avpkt->data;
     int buf_size         = avpkt->size;
     uint8_t quality;
-    uint32_t second_field_offset;
     int ret;
 
     if (buf_size < 4 || avctx->width < 8 || avctx->width % 8 != 0)
@@ -429,8 +447,8 @@ static int speedhq_decode_frame(AVCodecContext *avctx, AVFrame *frame,
 
     compute_quant_matrix(s->quant_matrix, 100 - quality);
 
-    second_field_offset = AV_RL24(buf + 1);
-    if (second_field_offset >= buf_size - 3) {
+    s->second_field_offset = AV_RL24(buf + 1);
+    if (s->second_field_offset >= buf_size - 3) {
         return AVERROR_INVALIDDATA;
     }
 
@@ -441,8 +459,9 @@ static int speedhq_decode_frame(AVCodecContext *avctx, AVFrame *frame,
         return ret;
     }
     frame->flags |= AV_FRAME_FLAG_KEY;
+    s->avpkt = avpkt;
 
-    if (second_field_offset == 4 || second_field_offset == (buf_size-4)) {
+    if (s->second_field_offset == 4 || s->second_field_offset == (buf_size-4)) {
         /*
          * Overlapping first and second fields is used to signal
          * encoding only a single field. In this case, "height"
@@ -452,12 +471,10 @@ static int speedhq_decode_frame(AVCodecContext *avctx, AVFrame *frame,
          * but this matches the convention used in NDI, which is
          * the primary user of this trick.
          */
-        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4, buf_size, 1)) < 0)
+        if ((ret = avctx->execute2(avctx, decode_slice_progressive, frame, NULL, 4)) < 0)
             return ret;
     } else {
-        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 0, 4, second_field_offset, 2)) < 0)
-            return ret;
-        if ((ret = decode_speedhq_field(s, buf, buf_size, frame, 1, second_field_offset, buf_size, 2)) < 0)
+        if ((ret = avctx->execute2(avctx, decode_slice_interlaced, frame, NULL, 8)) < 0)
             return ret;
     }
 
@@ -653,5 +670,5 @@ const FFCodec ff_speedhq_decoder = {
     .priv_data_size = sizeof(SHQContext),
     .init           = speedhq_decode_init,
     FF_CODEC_DECODE_CB(speedhq_decode_frame),
-    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
 };
-- 
2.39.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] lavc/speedhqdec: Reindent
  2024-05-13 15:42 [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
@ 2024-05-13 15:43 ` Tomas Härdin
  2024-05-14  8:20 ` [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
  2024-05-30 14:23 ` Tomas Härdin
  2 siblings, 0 replies; 5+ messages in thread
From: Tomas Härdin @ 2024-05-13 15:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0002-lavc-speedhqdec-Reindent.patch --]
[-- Type: text/x-patch, Size: 10289 bytes --]

From 17aceef1c1a1bb25d651610cd52bc94dbdf20e0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Mon, 13 May 2024 17:01:28 +0200
Subject: [PATCH 2/2] lavc/speedhqdec: Reindent

---
 libavcodec/speedhqdec.c | 152 ++++++++++++++++++++--------------------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/libavcodec/speedhqdec.c b/libavcodec/speedhqdec.c
index 77a159f7e5..06ae0a7a85 100644
--- a/libavcodec/speedhqdec.c
+++ b/libavcodec/speedhqdec.c
@@ -297,98 +297,98 @@ static int decode_speedhq_field(const SHQContext *s, const uint8_t *buf, int buf
             return AVERROR_INVALIDDATA;
     }
 
-        slice_begin = slice_offsets[slice_number];
-        slice_end = slice_offsets[slice_number + 1];
+    slice_begin = slice_offsets[slice_number];
+    slice_end = slice_offsets[slice_number + 1];
 
-        if ((ret = init_get_bits8(&gb, buf + slice_begin + 3, slice_end - slice_begin - 3)) < 0)
-            return ret;
+    if ((ret = init_get_bits8(&gb, buf + slice_begin + 3, slice_end - slice_begin - 3)) < 0)
+        return ret;
 
-        for (y = slice_number * 16 * line_stride; y < frame->height; y += line_stride * 64) {
-            uint8_t *dest_y, *dest_cb, *dest_cr, *dest_a;
-            int last_dc[4] = { 1024, 1024, 1024, 1024 };
-            uint8_t last_alpha[16];
+    for (y = slice_number * 16 * line_stride; y < frame->height; y += line_stride * 64) {
+        uint8_t *dest_y, *dest_cb, *dest_cr, *dest_a;
+        int last_dc[4] = { 1024, 1024, 1024, 1024 };
+        uint8_t last_alpha[16];
 
-            memset(last_alpha, 255, sizeof(last_alpha));
+        memset(last_alpha, 255, sizeof(last_alpha));
 
-            dest_y = frame->data[0] + frame->linesize[0] * (y + field_number);
-            if (s->subsampling == SHQ_SUBSAMPLING_420) {
-                dest_cb = frame->data[1] + frame->linesize[1] * (y/2 + field_number);
-                dest_cr = frame->data[2] + frame->linesize[2] * (y/2 + field_number);
-            } else {
-                dest_cb = frame->data[1] + frame->linesize[1] * (y + field_number);
-                dest_cr = frame->data[2] + frame->linesize[2] * (y + field_number);
-            }
-            if (s->alpha_type != SHQ_NO_ALPHA) {
-                dest_a = frame->data[3] + frame->linesize[3] * (y + field_number);
-            }
+        dest_y = frame->data[0] + frame->linesize[0] * (y + field_number);
+        if (s->subsampling == SHQ_SUBSAMPLING_420) {
+            dest_cb = frame->data[1] + frame->linesize[1] * (y/2 + field_number);
+            dest_cr = frame->data[2] + frame->linesize[2] * (y/2 + field_number);
+        } else {
+            dest_cb = frame->data[1] + frame->linesize[1] * (y + field_number);
+            dest_cr = frame->data[2] + frame->linesize[2] * (y + field_number);
+        }
+        if (s->alpha_type != SHQ_NO_ALPHA) {
+            dest_a = frame->data[3] + frame->linesize[3] * (y + field_number);
+        }
 
-            for (x = 0; x < frame->width - 8 * (s->subsampling != SHQ_SUBSAMPLING_444); x += 16) {
-                /* Decode the four luma blocks. */
-                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y, linesize_y)) < 0)
-                    return ret;
-                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8, linesize_y)) < 0)
-                    return ret;
-                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y, linesize_y)) < 0)
-                    return ret;
-                if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y + 8, linesize_y)) < 0)
-                    return ret;
+        for (x = 0; x < frame->width - 8 * (s->subsampling != SHQ_SUBSAMPLING_444); x += 16) {
+            /* Decode the four luma blocks. */
+            if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y, linesize_y)) < 0)
+                return ret;
+            if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8, linesize_y)) < 0)
+                return ret;
+            if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y, linesize_y)) < 0)
+                return ret;
+            if ((ret = decode_dct_block(s, &gb, last_dc, 0, dest_y + 8 * linesize_y + 8, linesize_y)) < 0)
+                return ret;
+
+            /*
+                * Decode the first chroma block. For 4:2:0, this is the only one;
+                * for 4:2:2, it's the top block; for 4:4:4, it's the top-left block.
+                */
+            if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb, linesize_cb)) < 0)
+                return ret;
+            if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr, linesize_cr)) < 0)
+                return ret;
 
-                /*
-                 * Decode the first chroma block. For 4:2:0, this is the only one;
-                 * for 4:2:2, it's the top block; for 4:4:4, it's the top-left block.
-                 */
-                if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb, linesize_cb)) < 0)
+            if (s->subsampling != SHQ_SUBSAMPLING_420) {
+                /* For 4:2:2, this is the bottom block; for 4:4:4, it's the bottom-left block. */
+                if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb, linesize_cb)) < 0)
                     return ret;
-                if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr, linesize_cr)) < 0)
+                if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr, linesize_cr)) < 0)
                     return ret;
 
-                if (s->subsampling != SHQ_SUBSAMPLING_420) {
-                    /* For 4:2:2, this is the bottom block; for 4:4:4, it's the bottom-left block. */
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb, linesize_cb)) < 0)
-                        return ret;
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr, linesize_cr)) < 0)
-                        return ret;
-
-                    if (s->subsampling == SHQ_SUBSAMPLING_444) {
-                        /* Top-right and bottom-right blocks. */
-                        if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8, linesize_cb)) < 0)
-                            return ret;
-                        if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8, linesize_cr)) < 0)
-                            return ret;
-                        if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb + 8, linesize_cb)) < 0)
-                            return ret;
-                        if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr + 8, linesize_cr)) < 0)
-                            return ret;
-
-                        dest_cb += 8;
-                        dest_cr += 8;
-                    }
-                }
-                dest_y += 16;
-                dest_cb += 8;
-                dest_cr += 8;
-
-                if (s->alpha_type == SHQ_RLE_ALPHA) {
-                    /* Alpha coded using 16x8 RLE blocks. */
-                    if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a, linesize_a)) < 0)
-                        return ret;
-                    if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a + 8 * linesize_a, linesize_a)) < 0)
-                        return ret;
-                    dest_a += 16;
-                } else if (s->alpha_type == SHQ_DCT_ALPHA) {
-                    /* Alpha encoded exactly like luma. */
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a, linesize_a)) < 0)
+                if (s->subsampling == SHQ_SUBSAMPLING_444) {
+                    /* Top-right and bottom-right blocks. */
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8, linesize_cb)) < 0)
                         return ret;
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8, linesize_a)) < 0)
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8, linesize_cr)) < 0)
                         return ret;
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a, linesize_a)) < 0)
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 1, dest_cb + 8 * linesize_cb + 8, linesize_cb)) < 0)
                         return ret;
-                    if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a + 8, linesize_a)) < 0)
+                    if ((ret = decode_dct_block(s, &gb, last_dc, 2, dest_cr + 8 * linesize_cr + 8, linesize_cr)) < 0)
                         return ret;
-                    dest_a += 16;
+
+                    dest_cb += 8;
+                    dest_cr += 8;
                 }
             }
+            dest_y += 16;
+            dest_cb += 8;
+            dest_cr += 8;
+
+            if (s->alpha_type == SHQ_RLE_ALPHA) {
+                /* Alpha coded using 16x8 RLE blocks. */
+                if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a, linesize_a)) < 0)
+                    return ret;
+                if ((ret = decode_alpha_block(s, &gb, last_alpha, dest_a + 8 * linesize_a, linesize_a)) < 0)
+                    return ret;
+                dest_a += 16;
+            } else if (s->alpha_type == SHQ_DCT_ALPHA) {
+                /* Alpha encoded exactly like luma. */
+                if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a, linesize_a)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8, linesize_a)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a, linesize_a)) < 0)
+                    return ret;
+                if ((ret = decode_dct_block(s, &gb, last_dc, 3, dest_a + 8 * linesize_a + 8, linesize_a)) < 0)
+                    return ret;
+                dest_a += 16;
+            }
         }
+    }
 
     if (s->subsampling != SHQ_SUBSAMPLING_444 && (frame->width & 15) && slice_number == 3)
         return decode_speedhq_border(s, &gb, frame, field_number, line_stride);
-- 
2.39.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS
  2024-05-13 15:42 [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
  2024-05-13 15:43 ` [FFmpeg-devel] [PATCH 2/2] lavc/speedhqdec: Reindent Tomas Härdin
@ 2024-05-14  8:20 ` Tomas Härdin
  2024-05-30 14:23 ` Tomas Härdin
  2 siblings, 0 replies; 5+ messages in thread
From: Tomas Härdin @ 2024-05-14  8:20 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

I forgot to mention it is possible to go even further with parallelizing
the decoder by separating VLC decode from IDCT. This would affect serial
performance however, since all coefficients would likely not fit in the
innermost cache. For 4k yuv444p this is 51 MiB. Even 1080p yuv420p is
still 6 MiB. For comparison, at the moment only a single DCT block is
kept, which is just 128 bytes.

Another possibility could be to have two threads per slice,
interleaving VLC decode and IDCT, using double buffering on more
modestly sized buffers. Two 16-line buffers per pair of threads or 768
KiB for 4k, 3 MiB across all four thread pairs. But, this would require
mutexes between pairs of threads, which execute2() isn't designed for.

A final possibility is to do just enough upfront work to find the bit
boundary between 16-line blocks within each slice. That is, doing VLC
decode but not writing coefficients anywhere. Then each group of 16
lines could be made into its own job since we know the exact start and
end bit for all of them. This would degrade serial performance and
performance with 4/8 threads, but would probably be a win for 4k and up
with large numbers of threads.

/Tomas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS
  2024-05-13 15:42 [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
  2024-05-13 15:43 ` [FFmpeg-devel] [PATCH 2/2] lavc/speedhqdec: Reindent Tomas Härdin
  2024-05-14  8:20 ` [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
@ 2024-05-30 14:23 ` Tomas Härdin
  2024-06-03 12:54   ` Tomas Härdin
  2 siblings, 1 reply; 5+ messages in thread
From: Tomas Härdin @ 2024-05-30 14:23 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Ping

Will push in a couple of days

/Tomas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS
  2024-05-30 14:23 ` Tomas Härdin
@ 2024-06-03 12:54   ` Tomas Härdin
  0 siblings, 0 replies; 5+ messages in thread
From: Tomas Härdin @ 2024-06-03 12:54 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

tor 2024-05-30 klockan 16:23 +0200 skrev Tomas Härdin:
> Ping
> 
> Will push in a couple of days

Passes FATE -> pushed

/Tomas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-03 12:54 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-13 15:42 [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
2024-05-13 15:43 ` [FFmpeg-devel] [PATCH 2/2] lavc/speedhqdec: Reindent Tomas Härdin
2024-05-14  8:20 ` [FFmpeg-devel] [PATCH 1/2] lavc/speedhqdec: Add AV_CODEC_CAP_SLICE_THREADS Tomas Härdin
2024-05-30 14:23 ` Tomas Härdin
2024-06-03 12:54   ` Tomas Härdin

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git