Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading
@ 2022-06-14 14:39 Tomas Härdin
  2022-06-14 14:39 ` [FFmpeg-devel] [PATCH 02/13] lavc/jpeg2000dec: Reindent Tomas Härdin
                   ` (12 more replies)
  0 siblings, 13 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:39 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 106 bytes --]

Patch 12 in this series is optional, since its speedup is something I
have only measured on one specific machine.

/Tomas

[-- Attachment #2: 0001-lavc-jpeg2000dec-Finer-granularity-threading.patch --]
[-- Type: text/x-patch, Size: 11603 bytes --]

From 115aa26c343419e81c1b5ba0bfdb1615cbec27e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Fri, 10 Jun 2022 14:10:02 +0200
Subject: [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading

Decoding and dequantization are now threaded at the codeblock level.
The IDWT is threaded at the component level.
MCT and write_frame() remain threaded at the tile level.

This brings lossless 4K J2K with -lowres 2 -thread_type slice -threads 96 on an AMD EPYC 7R32 from 4.8 fps (177% CPU) to 31 fps (1284% CPU).
---
 libavcodec/jpeg2000dec.c | 196 ++++++++++++++++++++++++++++-----------
 1 file changed, 142 insertions(+), 54 deletions(-)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 92966b11f5..d9754fc50e 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -92,6 +92,15 @@ typedef struct Jpeg2000Tile {
     int coord[2][2];                    // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Tile;
 
+typedef struct Jpeg2000IdwtThread {
+    int cb_start, cb_end;
+} Jpeg2000IdwtThread;
+
+typedef struct Jpeg2000CodeblockThread {
+    int tileno, compno, reslevelno, bandno, precno, cblkno;
+    int coded;
+} Jpeg2000CodeblockThread;
+
 typedef struct Jpeg2000DecoderContext {
     AVClass         *class;
     AVCodecContext  *avctx;
@@ -136,6 +145,11 @@ typedef struct Jpeg2000DecoderContext {
 
     /*options parameters*/
     int             reduction_factor;
+
+    Jpeg2000IdwtThread *idwt;
+    unsigned int idwt_size;
+    Jpeg2000CodeblockThread *cb;
+    unsigned int cb_size;
 } Jpeg2000DecoderContext;
 
 /* get_bits functions for JPEG2000 packet bitstream
@@ -1937,54 +1951,33 @@ static inline void roi_scale_cblk(Jpeg2000Cblk *cblk,
     }
 }
 
-static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+static int jpeg2000_decode_cb(AVCodecContext *avctx, void *td,
+                              int jobnr, int threadnr)
 {
     Jpeg2000T1Context t1;
-
-    int compno, reslevelno, bandno;
-
-    /* Loop on tile components */
-    for (compno = 0; compno < s->ncomponents; compno++) {
-        Jpeg2000Component *comp     = tile->comp + compno;
-        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-        int coded = 0;
-
-        t1.stride = (1<<codsty->log2_cblk_width) + 2;
-
-        /* Loop on resolution levels */
-        for (reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
-            Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
-            /* Loop on bands */
-            for (bandno = 0; bandno < rlevel->nbands; bandno++) {
-                int nb_precincts, precno;
-                Jpeg2000Band *band = rlevel->band + bandno;
-                int cblkno = 0, bandpos;
-
-                bandpos = bandno + (reslevelno > 0);
-
-                if (band->coord[0][0] == band->coord[0][1] ||
-                    band->coord[1][0] == band->coord[1][1])
-                    continue;
-
-                nb_precincts = rlevel->num_precincts_x * rlevel->num_precincts_y;
-                /* Loop on precincts */
-                for (precno = 0; precno < nb_precincts; precno++) {
-                    Jpeg2000Prec *prec = band->prec + precno;
-
-                    /* Loop on codeblocks */
-                    for (cblkno = 0;
-                         cblkno < prec->nb_codeblocks_width * prec->nb_codeblocks_height;
-                         cblkno++) {
-                        int x, y;
-                        Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-                        int ret = decode_cblk(s, codsty, &t1, cblk,
+    Jpeg2000DecoderContext *s   = avctx->priv_data;
+    Jpeg2000CodeblockThread *cb = s->cb + jobnr;
+    Jpeg2000Tile *tile          = s->tile + cb->tileno;
+    Jpeg2000Component *comp     = tile->comp + cb->compno;
+    Jpeg2000CodingStyle *codsty = tile->codsty + cb->compno;
+    Jpeg2000ResLevel *rlevel    = comp->reslevel + cb->reslevelno;
+    Jpeg2000Band *band          = rlevel->band + cb->bandno;
+    Jpeg2000Prec *prec          = band->prec + cb->precno;
+    Jpeg2000Cblk *cblk          = prec->cblk + cb->cblkno;
+    int ret, x, y, bandpos      = cb->bandno + (cb->reslevelno > 0);
+
+    t1.stride = (1<<codsty->log2_cblk_width) + 2;
+    cb->coded = 0;
+
+                        ret = decode_cblk(s, codsty, &t1, cblk,
                                     cblk->coord[0][1] - cblk->coord[0][0],
                                     cblk->coord[1][1] - cblk->coord[1][0],
                                     bandpos, comp->roi_shift);
                         if (ret)
-                            coded = 1;
+                            cb->coded = 1;
                         else
-                            continue;
+                            return 0;
+
                         x = cblk->coord[0][0] - band->coord[0][0];
                         y = cblk->coord[1][0] - band->coord[1][0];
 
@@ -1996,16 +1989,28 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
                             dequantization_int_97(x, y, cblk, comp, &t1, band);
                         else
                             dequantization_int(x, y, cblk, comp, &t1, band);
-                   } /* end cblk */
-                } /*end prec */
-            } /* end band */
-        } /* end reslevel */
 
-        /* inverse DWT */
-        if (coded)
+    return 0;
+}
+
+static int jpeg2000_idwt(AVCodecContext *avctx, void *td,
+                         int jobnr, int threadnr)
+{
+    Jpeg2000DecoderContext *s   = avctx->priv_data;
+    Jpeg2000IdwtThread *idwt    = s->idwt + jobnr;
+    Jpeg2000Tile *tile          = s->tile + jobnr / s->ncomponents;
+    int compno                  = jobnr % s->ncomponents;
+    Jpeg2000Component *comp     = tile->comp + compno;
+    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+    for (int i = idwt->cb_start; i < idwt->cb_end; i++) {
+        if (s->cb[i].coded) {
             ff_dwt_decode(&comp->dwt, codsty->transform == FF_DWT97 ? (void*)comp->f_data : (void*)comp->i_data);
+            break;
+        }
+    }
 
-    } /*end comp */
+    return 0;
 }
 
 #define WRITE_FRAME(D, PIXEL)                                                                     \
@@ -2075,15 +2080,13 @@ WRITE_FRAME(16, uint16_t)
 
 #undef WRITE_FRAME
 
-static int jpeg2000_decode_tile(AVCodecContext *avctx, void *td,
-                                int jobnr, int threadnr)
+static int jpeg2000_mct_write_frame(AVCodecContext *avctx, void *td,
+                                    int jobnr, int threadnr)
 {
     Jpeg2000DecoderContext *s = avctx->priv_data;
     AVFrame *picture = td;
     Jpeg2000Tile *tile = s->tile + jobnr;
 
-    tile_codeblocks(s, tile);
-
     /* inverse MCT transformation */
     if (tile->codsty[0].mct)
         mct_decode(s, tile);
@@ -2473,11 +2476,80 @@ static av_cold int jpeg2000_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out)
+{
+    if (s->numXtiles * s->numYtiles > INT_MAX/sizeof(*s->idwt)/s->ncomponents)
+        return AVERROR(ENOMEM);
+
+    av_fast_malloc(&s->idwt, &s->idwt_size, s->numXtiles * s->numYtiles * s->ncomponents * sizeof(*s->idwt));
+    if (!s->idwt)
+        return AVERROR(ENOMEM);
+
+    for (int pass = 0; pass < 2; pass++) {
+        int cbs = 0;
+        for (int tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
+            for (int compno = 0; compno < s->ncomponents; compno++) {
+                Jpeg2000Tile *tile          = s->tile + tileno;
+                Jpeg2000Component *comp     = tile->comp + compno;
+                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                Jpeg2000IdwtThread *idwt    = s->idwt + compno + tileno * s->ncomponents;
+
+                idwt->cb_start = cbs;
+
+                for (int reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
+                    Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                    for (int bandno = 0; bandno < rlevel->nbands; bandno++) {
+                        int nb_precincts = rlevel->num_precincts_x * rlevel->num_precincts_y;
+                        Jpeg2000Band *band = rlevel->band + bandno;
+
+                        if (band->coord[0][0] == band->coord[0][1] ||
+                            band->coord[1][0] == band->coord[1][1])
+                            continue;
+
+                        for (int precno = 0; precno < nb_precincts; precno++) {
+                            Jpeg2000Prec *prec = band->prec + precno;
+                            int prec_cbs = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
+
+                            if (cbs > INT_MAX - prec_cbs)
+                                return AVERROR(ENOMEM);
+
+                            for (int cblkno = 0; cblkno < prec_cbs; cblkno++, cbs++) {
+                                if (pass == 1) {
+                                    Jpeg2000CodeblockThread *cb = s->cb + cbs;
+                                    cb->tileno = tileno;
+                                    cb->compno = compno;
+                                    cb->reslevelno = reslevelno;
+                                    cb->bandno = bandno;
+                                    cb->precno = precno;
+                                    cb->cblkno = cblkno;
+                                }
+                            }
+                        }
+                    }
+                }
+
+                idwt->cb_end = cbs;
+            }
+        }
+
+        if (pass == 0) {
+            if (cbs > INT_MAX/sizeof(*s->cb))
+                return AVERROR(ENOMEM);
+            av_fast_malloc(&s->cb, &s->cb_size, cbs*sizeof(*s->cb));
+            if (!s->cb)
+                return AVERROR(ENOMEM);
+        }
+
+        *cbs_out = cbs;
+    }
+    return 0;
+}
+
 static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
                                  int *got_frame, AVPacket *avpkt)
 {
     Jpeg2000DecoderContext *s = avctx->priv_data;
-    int ret;
+    int ret, cbs;
 
     s->avctx     = avctx;
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
@@ -2535,7 +2607,12 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
         }
     }
 
-    avctx->execute2(avctx, jpeg2000_decode_tile, picture, NULL, s->numXtiles * s->numYtiles);
+    if ((ret = jpeg2000_setup_cbs(s, &cbs)))
+        goto end;
+
+    avctx->execute2(avctx, jpeg2000_decode_cb, NULL, NULL, cbs);
+    avctx->execute2(avctx, jpeg2000_idwt, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents);
+    avctx->execute2(avctx, jpeg2000_mct_write_frame, picture, NULL, s->numXtiles * s->numYtiles);
 
     jpeg2000_dec_cleanup(s);
 
@@ -2554,6 +2631,16 @@ end:
     return ret;
 }
 
+static av_cold int jpeg2000_decode_close(AVCodecContext *avctx)
+{
+    Jpeg2000DecoderContext *s = avctx->priv_data;
+
+    av_freep(&s->idwt);
+    av_freep(&s->cb);
+
+    return 0;
+}
+
 #define OFFSET(x) offsetof(Jpeg2000DecoderContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
@@ -2579,6 +2666,7 @@ const FFCodec ff_jpeg2000_decoder = {
     .priv_data_size   = sizeof(Jpeg2000DecoderContext),
     .init             = jpeg2000_decode_init,
     FF_CODEC_DECODE_CB(jpeg2000_decode_frame),
+    .close            = jpeg2000_decode_close,
     .p.priv_class     = &jpeg2000_class,
     .p.max_lowres     = 5,
     .p.profiles       = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles),
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 02/13] lavc/jpeg2000dec: Reindent
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
@ 2022-06-14 14:39 ` Tomas Härdin
  2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 03/13] lavc/jpeg2000dwt: Implement sliced transforms Tomas Härdin
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:39 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0002-lavc-jpeg2000dec-Reindent.patch --]
[-- Type: text/x-patch, Size: 2478 bytes --]

From 86c30e327e1eb8ba913d74d5394ea90a87b55a69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Fri, 10 Jun 2022 14:12:11 +0200
Subject: [PATCH 02/13] lavc/jpeg2000dec: Reindent

---
 libavcodec/jpeg2000dec.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index d9754fc50e..9d3d406870 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -1969,26 +1969,26 @@ static int jpeg2000_decode_cb(AVCodecContext *avctx, void *td,
     t1.stride = (1<<codsty->log2_cblk_width) + 2;
     cb->coded = 0;
 
-                        ret = decode_cblk(s, codsty, &t1, cblk,
-                                    cblk->coord[0][1] - cblk->coord[0][0],
-                                    cblk->coord[1][1] - cblk->coord[1][0],
-                                    bandpos, comp->roi_shift);
-                        if (ret)
-                            cb->coded = 1;
-                        else
-                            return 0;
+    ret = decode_cblk(s, codsty, &t1, cblk,
+                cblk->coord[0][1] - cblk->coord[0][0],
+                cblk->coord[1][1] - cblk->coord[1][0],
+                bandpos, comp->roi_shift);
+    if (ret)
+        cb->coded = 1;
+    else
+        return 0;
 
-                        x = cblk->coord[0][0] - band->coord[0][0];
-                        y = cblk->coord[1][0] - band->coord[1][0];
+    x = cblk->coord[0][0] - band->coord[0][0];
+    y = cblk->coord[1][0] - band->coord[1][0];
 
-                        if (comp->roi_shift)
-                            roi_scale_cblk(cblk, comp, &t1);
-                        if (codsty->transform == FF_DWT97)
-                            dequantization_float(x, y, cblk, comp, &t1, band);
-                        else if (codsty->transform == FF_DWT97_INT)
-                            dequantization_int_97(x, y, cblk, comp, &t1, band);
-                        else
-                            dequantization_int(x, y, cblk, comp, &t1, band);
+    if (comp->roi_shift)
+        roi_scale_cblk(cblk, comp, &t1);
+    if (codsty->transform == FF_DWT97)
+        dequantization_float(x, y, cblk, comp, &t1, band);
+    else if (codsty->transform == FF_DWT97_INT)
+        dequantization_int_97(x, y, cblk, comp, &t1, band);
+    else
+        dequantization_int(x, y, cblk, comp, &t1, band);
 
     return 0;
 }
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 03/13] lavc/jpeg2000dwt: Implement sliced transforms
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
  2022-06-14 14:39 ` [FFmpeg-devel] [PATCH 02/13] lavc/jpeg2000dec: Reindent Tomas Härdin
@ 2022-06-14 14:40 ` Tomas Härdin
  2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 04/13] lavc/jpeg2000dec: Implement IDWT slicing Tomas Härdin
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:40 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0003-lavc-jpeg2000dwt-Implement-sliced-transforms.patch --]
[-- Type: text/x-patch, Size: 16258 bytes --]

From 6ab67531c946ca320e49bc93f4f086835ffd2c1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Fri, 10 Jun 2022 17:18:14 +0200
Subject: [PATCH 03/13] lavc/jpeg2000dwt: Implement sliced transforms

lavc/tests/jpeg2000dwt tests this.
---
 libavcodec/j2kenc.c            |   3 +-
 libavcodec/jpeg2000.c          |   5 +-
 libavcodec/jpeg2000.h          |   2 +-
 libavcodec/jpeg2000dec.c       |   2 +-
 libavcodec/jpeg2000dwt.c       | 131 +++++++++++++++++++--------------
 libavcodec/jpeg2000dwt.h       |   5 +-
 libavcodec/tests/jpeg2000dwt.c |  15 ++--
 7 files changed, 94 insertions(+), 69 deletions(-)

diff --git a/libavcodec/j2kenc.c b/libavcodec/j2kenc.c
index 0b761d0b00..4de596ffa9 100644
--- a/libavcodec/j2kenc.c
+++ b/libavcodec/j2kenc.c
@@ -496,7 +496,8 @@ static int init_tiles(Jpeg2000EncoderContext *s)
                                                 s->cbps[compno],
                                                 compno?1<<s->chroma_shift[0]:1,
                                                 compno?1<<s->chroma_shift[1]:1,
-                                                s->avctx
+                                                s->avctx,
+                                                1
                                                )) < 0)
                     return ret;
             }
diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index 0aa984bc53..945b787565 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -467,7 +467,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                Jpeg2000CodingStyle *codsty,
                                Jpeg2000QuantStyle *qntsty,
                                int cbps, int dx, int dy,
-                               AVCodecContext *avctx)
+                               AVCodecContext *avctx, int max_slices)
 {
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
@@ -479,7 +479,8 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
 
     if (ret = ff_jpeg2000_dwt_init(&comp->dwt, comp->coord,
                                    codsty->nreslevels2decode - 1,
-                                   codsty->transform))
+                                   codsty->transform,
+                                   max_slices))
         return ret;
 
     if (av_image_check_size(comp->coord[0][1] - comp->coord[0][0],
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index d06313425e..cbb8e0d951 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -278,7 +278,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                Jpeg2000CodingStyle *codsty,
                                Jpeg2000QuantStyle *qntsty,
                                int cbps, int dx, int dy,
-                               AVCodecContext *ctx);
+                               AVCodecContext *ctx, int max_slices);
 
 void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
 
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 9d3d406870..8999974a56 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -1052,7 +1052,7 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
             return AVERROR_INVALIDDATA;
         if (ret = ff_jpeg2000_init_component(comp, codsty, qntsty,
                                              s->cbps[compno], s->cdx[compno],
-                                             s->cdy[compno], s->avctx))
+                                             s->cdy[compno], s->avctx, 1))
             return ret;
     }
     return 0;
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index f2da7307c4..42a92b6c64 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -322,24 +322,24 @@ static void sr_1d53(unsigned *p, int i0, int i1)
         p[2 * i + 1] += (int)(p[2 * i] + p[2 * i + 2]) >> 1;
 }
 
-static void dwt_decode53(DWTContext *s, int *t)
+static void dwt_decode53(DWTContext *s, int *t, int lev, int dir, int slice, int slices)
 {
-    int lev;
     int w     = s->linelen[s->ndeclevels - 1][0];
-    int32_t *line = s->i_linebuf;
-    line += 3;
+    int32_t *line = s->i_linebuf + slice * s->linesize + 3;
 
-    for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
             mh = s->mod[lev][0],
             mv = s->mod[lev][1],
+            sh = (lh + slices - 1)/slices,
+            sv = (lv + slices - 1)/slices,
             lp;
         int *l;
 
+    if (dir == 0) {
         // HOR_SD
         l = line + mh;
-        for (lp = 0; lp < lv; lp++) {
+        for (lp = slice*sv; lp < lv && lp - sv < slice*sv; lp++) {
             int i, j = 0;
             // copy with interleaving
             for (i = mh; i < lh; i += 2, j++)
@@ -352,10 +352,10 @@ static void dwt_decode53(DWTContext *s, int *t)
             for (i = 0; i < lh; i++)
                 t[w * lp + i] = l[i];
         }
-
+    } else {
         // VER_SD
         l = line + mv;
-        for (lp = 0; lp < lh; lp++) {
+        for (lp = slice*sh; lp < lh && lp - sh < slice*sh; lp++) {
             int i, j = 0;
             // copy with interleaving
             for (i = mv; i < lv; i += 2, j++)
@@ -398,25 +398,26 @@ static void sr_1d97_float(float *p, int i0, int i1)
         p[2 * i + 1] += F_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]);
 }
 
-static void dwt_decode97_float(DWTContext *s, float *t)
+static void dwt_decode97_float(DWTContext *s, float *t, int lev, int dir, int slice, int slices)
 {
-    int lev;
     int w       = s->linelen[s->ndeclevels - 1][0];
-    float *line = s->f_linebuf;
-    float *data = t;
     /* position at index O of line range [0-5,w+5] cf. extend function */
-    line += 5;
+    float *line = s->f_linebuf + slice * s->linesize + 5;
+    float *data = t;
 
-    for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
             mh = s->mod[lev][0],
             mv = s->mod[lev][1],
+            sh = (lh + slices - 1)/slices,
+            sv = (lv + slices - 1)/slices,
             lp;
         float *l;
+
+    if (dir == 0) {
         // HOR_SD
         l = line + mh;
-        for (lp = 0; lp < lv; lp++) {
+        for (lp = slice*sv; lp < lv && lp - sv < slice*sv; lp++) {
             int i, j = 0;
             // copy with interleaving
             for (i = mh; i < lh; i += 2, j++)
@@ -429,10 +430,10 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             for (i = 0; i < lh; i++)
                 data[w * lp + i] = l[i];
         }
-
+    } else {
         // VER_SD
         l = line + mv;
-        for (lp = 0; lp < lh; lp++) {
+        for (lp = slice*sh; lp < lh && lp - sh < slice*sh; lp++) {
             int i, j = 0;
             // copy with interleaving
             for (i = mv; i < lv; i += 2, j++)
@@ -475,30 +476,26 @@ static void sr_1d97_int(int32_t *p, int i0, int i1)
         p[2 * i + 1] += (I_LFTG_ALPHA * (p[2 * i]     + (int64_t)p[2 * i + 2]) + (1 << 15)) >> 16;
 }
 
-static void dwt_decode97_int(DWTContext *s, int32_t *t)
+static void dwt_decode97_int(DWTContext *s, int32_t *t, int lev, int dir, int slice, int slices)
 {
-    int lev;
     int w       = s->linelen[s->ndeclevels - 1][0];
-    int h       = s->linelen[s->ndeclevels - 1][1];
-    int i;
-    int32_t *line = s->i_linebuf;
-    int32_t *data = t;
     /* position at index O of line range [0-5,w+5] cf. extend function */
-    line += 5;
-
-    for (i = 0; i < w * h; i++)
-        data[i] *= 1LL << I_PRESHIFT;
+    int32_t *line = s->i_linebuf + slice * s->linesize + 5;
+    int32_t *data = t;
 
-    for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
             mh = s->mod[lev][0],
             mv = s->mod[lev][1],
+            sh = (lh + slices - 1)/slices,
+            sv = (lv + slices - 1)/slices,
             lp;
         int32_t *l;
+
+    if (dir == 0) {
         // HOR_SD
         l = line + mh;
-        for (lp = 0; lp < lv; lp++) {
+        for (lp = slice*sv; lp < lv && lp - sv < slice*sv; lp++) {
             int i, j = 0;
             // rescale with interleaving
             for (i = mh; i < lh; i += 2, j++)
@@ -511,10 +508,10 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = 0; i < lh; i++)
                 data[w * lp + i] = l[i];
         }
-
+    } else {
         // VER_SD
         l = line + mv;
-        for (lp = 0; lp < lh; lp++) {
+        for (lp = slice*sh; lp < lh && lp - sh < slice*sh; lp++) {
             int i, j = 0;
             // rescale with interleaving
             for (i = mv; i < lv; i += 2, j++)
@@ -528,26 +525,29 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
                 data[w * i + lp] = l[i];
         }
     }
-
-    for (i = 0; i < w * h; i++)
-        data[i] = (data[i] + ((1LL<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
 }
 
 int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
-                         int decomp_levels, int type)
+                         int decomp_levels, int type, int max_slices)
 {
-    int i, j, lev = decomp_levels, maxlen,
+    int i, j, lev = decomp_levels,
         b[2][2];
 
     s->ndeclevels = decomp_levels;
     s->type       = type;
+    s->max_slices = max_slices;
+
+    if (s->max_slices > INT_MAX/FFMAX(sizeof(*s->f_linebuf),sizeof(*s->i_linebuf)))
+        return AVERROR(ENOMEM);
 
     for (i = 0; i < 2; i++)
         for (j = 0; j < 2; j++)
             b[i][j] = border[i][j];
 
-    maxlen = FFMAX(b[0][1] - b[0][0],
-                   b[1][1] - b[1][0]);
+    s->linesize   = FFMAX(b[0][1] - b[0][0],
+                          b[1][1] - b[1][0]) +
+                    (type == FF_DWT53 ? 6 : 12);
+
     while (--lev >= 0)
         for (i = 0; i < 2; i++) {
             s->linelen[lev][i] = b[i][1] - b[i][0];
@@ -555,24 +555,15 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
             for (j = 0; j < 2; j++)
                 b[i][j] = (b[i][j] + 1) >> 1;
         }
-    switch (type) {
-    case FF_DWT97:
-        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
+
+    if (type == FF_DWT97) {
+        s->f_linebuf = av_malloc_array(s->linesize, s->max_slices*sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
-        break;
-     case FF_DWT97_INT:
-        s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
-        if (!s->i_linebuf)
-            return AVERROR(ENOMEM);
-        break;
-    case FF_DWT53:
-        s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
+    } else {
+        s->i_linebuf = av_malloc_array(s->linesize, s->max_slices*sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
-        break;
-    default:
-        return -1;
     }
     return 0;
 }
@@ -597,18 +588,46 @@ int ff_dwt_encode(DWTContext *s, void *t)
 
 int ff_dwt_decode(DWTContext *s, void *t)
 {
-    if (s->ndeclevels == 0)
+    int w = s->linelen[s->ndeclevels - 1][0];
+    int h = s->linelen[s->ndeclevels - 1][1];
+    int32_t *data = t;
+
+    if (s->type == FF_DWT97_INT)
+        for (int i = 0; i < w * h; i++)
+            data[i] *= 1LL << I_PRESHIFT;
+
+    for (int lev = 0; lev < s->ndeclevels; lev++)
+        for (int dir = 0; dir < 2; dir++)
+            for (int slice = 0; slice < s->max_slices; slice++) {
+                int ret = ff_dwt_decode_thread(s, t, lev, dir, slice, s->max_slices);
+                if (ret)
+                    return ret;
+            }
+
+    if (s->type == FF_DWT97_INT)
+        for (int i = 0; i < w * h; i++)
+            data[i] = (data[i] + ((1LL<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
+
+    return 0;
+}
+
+int ff_dwt_decode_thread(DWTContext *s, void *t, int lev, int dir, int slice, int slices)
+{
+    slices = FFMIN(s->max_slices, slices);
+
+    // lev can be >= s->ndeclevels in files with mixed reslevels in tiles/components
+    if (s->ndeclevels == 0 || lev >= s->ndeclevels || slice >= slices)
         return 0;
 
     switch (s->type) {
     case FF_DWT97:
-        dwt_decode97_float(s, t);
+        dwt_decode97_float(s, t, lev, dir, slice, slices);
         break;
     case FF_DWT97_INT:
-        dwt_decode97_int(s, t);
+        dwt_decode97_int(s, t, lev, dir, slice, slices);
         break;
     case FF_DWT53:
-        dwt_decode53(s, t);
+        dwt_decode53(s, t, lev, dir, slice, slices);
         break;
     default:
         return -1;
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..0589c8355c 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -48,6 +48,8 @@ typedef struct DWTContext {
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
     int32_t *i_linebuf;                  ///< int buffer used by transform
     float   *f_linebuf;                  ///< float buffer used by transform
+    int max_slices;
+    int linesize;
 } DWTContext;
 
 /**
@@ -58,10 +60,11 @@ typedef struct DWTContext {
  * @param type              0 for DWT 9/7; 1 for DWT 5/3
  */
 int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
-                         int decomp_levels, int type);
+                         int decomp_levels, int type, int max_slices);
 
 int ff_dwt_encode(DWTContext *s, void *t);
 int ff_dwt_decode(DWTContext *s, void *t);
+int ff_dwt_decode_thread(DWTContext *s, void *t, int lev, int dir, int slice, int slices);
 
 void ff_dwt_destroy(DWTContext *s);
 
diff --git a/libavcodec/tests/jpeg2000dwt.c b/libavcodec/tests/jpeg2000dwt.c
index 0e5a6ed947..d4d9e6d224 100644
--- a/libavcodec/tests/jpeg2000dwt.c
+++ b/libavcodec/tests/jpeg2000dwt.c
@@ -31,12 +31,12 @@
 
 #define MAX_W 256
 
-static int test_dwt(int *array, int *ref, int border[2][2], int decomp_levels, int type, int max_diff) {
+static int test_dwt(int *array, int *ref, int border[2][2], int decomp_levels, int type, int max_diff, int slices) {
     int ret, j;
     DWTContext s1={{{0}}}, *s= &s1;
     int64_t err2 = 0;
 
-    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, type);
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, type, slices);
     if (ret < 0) {
         fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
         return 1;
@@ -70,12 +70,12 @@ static int test_dwt(int *array, int *ref, int border[2][2], int decomp_levels, i
     return 0;
 }
 
-static int test_dwtf(float *array, float *ref, int border[2][2], int decomp_levels, float max_diff) {
+static int test_dwtf(float *array, float *ref, int border[2][2], int decomp_levels, float max_diff, int slices) {
     int ret, j;
     DWTContext s1={{{0}}}, *s= &s1;
     double err2 = 0;
 
-    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, FF_DWT97);
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, FF_DWT97, slices);
     if (ret < 0) {
         fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
         return 1;
@@ -125,19 +125,20 @@ int main(void) {
         arrayf[i] = reff[i] = array[i] = ref[i] =  av_lfg_get(&prng) % 2048;
 
     for (i = 0; i < 100; i++) {
+        int slices = 1 + (i % 10);
         for (j=0; j<4; j++)
             border[j>>1][j&1] = av_lfg_get(&prng) % MAX_W;
         if (border[0][0] >= border[0][1] || border[1][0] >= border[1][1])
             continue;
         decomp_levels = av_lfg_get(&prng) % FF_DWT_MAX_DECLVLS;
 
-        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT53, 0);
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT53, 0, slices);
         if (ret)
             return ret;
-        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT97_INT, FFMIN(7+5*decomp_levels, 15+3*decomp_levels));
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT97_INT, FFMIN(7+5*decomp_levels, 15+3*decomp_levels), slices);
         if (ret)
             return ret;
-        ret = test_dwtf(arrayf, reff, border, decomp_levels, 0.05);
+        ret = test_dwtf(arrayf, reff, border, decomp_levels, 0.05, slices);
         if (ret)
             return ret;
     }
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 04/13] lavc/jpeg2000dec: Implement IDWT slicing
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
  2022-06-14 14:39 ` [FFmpeg-devel] [PATCH 02/13] lavc/jpeg2000dec: Reindent Tomas Härdin
  2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 03/13] lavc/jpeg2000dwt: Implement sliced transforms Tomas Härdin
@ 2022-06-14 14:40 ` Tomas Härdin
  2022-06-14 14:41 ` [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile() Tomas Härdin
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:40 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0004-lavc-jpeg2000dec-Implement-IDWT-slicing.patch --]
[-- Type: text/x-patch, Size: 9640 bytes --]

From d0ec602b0f61dd7f8d53efccc2c4859058a5d55d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Mon, 13 Jun 2022 14:45:07 +0200
Subject: [PATCH 04/13] lavc/jpeg2000dec: Implement IDWT slicing

---
 libavcodec/jpeg2000dec.c | 99 +++++++++++++++++++++++++++++++++++-----
 libavcodec/jpeg2000dwt.c |  1 -
 libavcodec/jpeg2000dwt.h |  1 +
 3 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 8999974a56..9344630c6f 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -150,6 +150,10 @@ typedef struct Jpeg2000DecoderContext {
     unsigned int idwt_size;
     Jpeg2000CodeblockThread *cb;
     unsigned int cb_size;
+
+    // used for idwt slicing
+    int reslevel, dir, slices;
+    int have_dwt97_int; // 1 if any coding style is FF_DWT97_INT
 } Jpeg2000DecoderContext;
 
 /* get_bits functions for JPEG2000 packet bitstream
@@ -541,9 +545,10 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
     }
     c->transform = bytestream2_get_byteu(&s->g); // DWT transformation type
     /* set integer 9/7 DWT in case of BITEXACT flag */
-    if ((s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && (c->transform == FF_DWT97))
+    if ((s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && (c->transform == FF_DWT97)) {
         c->transform = FF_DWT97_INT;
-    else if (c->transform == FF_DWT53) {
+        s->have_dwt97_int = 1;
+    } else if (c->transform == FF_DWT53) {
         s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
     }
 
@@ -1052,7 +1057,7 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
             return AVERROR_INVALIDDATA;
         if (ret = ff_jpeg2000_init_component(comp, codsty, qntsty,
                                              s->cbps[compno], s->cdx[compno],
-                                             s->cdy[compno], s->avctx, 1))
+                                             s->cdy[compno], s->avctx, s->slices))
             return ret;
     }
     return 0;
@@ -1993,19 +1998,74 @@ static int jpeg2000_decode_cb(AVCodecContext *avctx, void *td,
     return 0;
 }
 
+static int jpeg2000_dwt97_int_preshift(AVCodecContext *avctx, void *td,
+                                       int jobnr, int threadnr)
+{
+    Jpeg2000DecoderContext *s   = avctx->priv_data;
+    Jpeg2000IdwtThread *idwt    = s->idwt + jobnr / s->slices;
+    Jpeg2000Tile *tile          = s->tile + jobnr / s->slices / s->ncomponents;
+    int compno                  = (jobnr / s->slices) % s->ncomponents;
+    int slice                   = jobnr % s->slices;
+    Jpeg2000Component *comp     = tile->comp + compno;
+    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+    int a = comp->dwt.linelen[comp->dwt.ndeclevels - 1][0] *
+            comp->dwt.linelen[comp->dwt.ndeclevels - 1][1];
+    int as = (a + s->slices - 1)/s->slices;
+
+    for (int i = idwt->cb_start; i < idwt->cb_end; i++) {
+        if (s->cb[i].coded) {
+            if (codsty->transform == FF_DWT97_INT) {
+                for (int i = as*slice; i - as < as*slice; i++)
+                    comp->i_data[i] *= 1LL << I_PRESHIFT;
+            }
+            break;
+        }
+    }
+
+    return 0;
+}
+
 static int jpeg2000_idwt(AVCodecContext *avctx, void *td,
                          int jobnr, int threadnr)
 {
     Jpeg2000DecoderContext *s   = avctx->priv_data;
-    Jpeg2000IdwtThread *idwt    = s->idwt + jobnr;
-    Jpeg2000Tile *tile          = s->tile + jobnr / s->ncomponents;
-    int compno                  = jobnr % s->ncomponents;
+    Jpeg2000IdwtThread *idwt    = s->idwt + jobnr / s->slices;
+    Jpeg2000Tile *tile          = s->tile + jobnr / s->slices / s->ncomponents;
+    int compno                  = (jobnr / s->slices) % s->ncomponents;
+    int slice                   = jobnr % s->slices;
     Jpeg2000Component *comp     = tile->comp + compno;
     Jpeg2000CodingStyle *codsty = tile->codsty + compno;
 
     for (int i = idwt->cb_start; i < idwt->cb_end; i++) {
         if (s->cb[i].coded) {
-            ff_dwt_decode(&comp->dwt, codsty->transform == FF_DWT97 ? (void*)comp->f_data : (void*)comp->i_data);
+            ff_dwt_decode_thread(&comp->dwt, codsty->transform == FF_DWT97 ? (void*)comp->f_data : (void*)comp->i_data, s->reslevel, s->dir, slice, s->slices);
+            break;
+        }
+    }
+
+    return 0;
+}
+
+static int jpeg2000_dwt97_int_postshift(AVCodecContext *avctx, void *td,
+                                        int jobnr, int threadnr)
+{
+    Jpeg2000DecoderContext *s   = avctx->priv_data;
+    Jpeg2000IdwtThread *idwt    = s->idwt + jobnr / s->slices;
+    Jpeg2000Tile *tile          = s->tile + jobnr / s->slices / s->ncomponents;
+    int compno                  = (jobnr / s->slices) % s->ncomponents;
+    int slice                   = jobnr % s->slices;
+    Jpeg2000Component *comp     = tile->comp + compno;
+    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+    int a = comp->dwt.linelen[comp->dwt.ndeclevels - 1][0] *
+            comp->dwt.linelen[comp->dwt.ndeclevels - 1][1];
+    int as = (a + s->slices - 1)/s->slices;
+
+    for (int i = idwt->cb_start; i < idwt->cb_end; i++) {
+        if (s->cb[i].coded) {
+            if (codsty->transform == FF_DWT97_INT) {
+                for (int i = as*slice; i - as < as*slice; i++)
+                    comp->i_data[i] = (comp->i_data[i] + ((1LL<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
+            }
             break;
         }
     }
@@ -2476,7 +2536,7 @@ static av_cold int jpeg2000_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out)
+static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out, int *maxreslevels_out)
 {
     if (s->numXtiles * s->numYtiles > INT_MAX/sizeof(*s->idwt)/s->ncomponents)
         return AVERROR(ENOMEM);
@@ -2486,7 +2546,7 @@ static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out)
         return AVERROR(ENOMEM);
 
     for (int pass = 0; pass < 2; pass++) {
-        int cbs = 0;
+        int cbs = 0, maxreslevels = 0;
         for (int tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
             for (int compno = 0; compno < s->ncomponents; compno++) {
                 Jpeg2000Tile *tile          = s->tile + tileno;
@@ -2495,6 +2555,7 @@ static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out)
                 Jpeg2000IdwtThread *idwt    = s->idwt + compno + tileno * s->ncomponents;
 
                 idwt->cb_start = cbs;
+                maxreslevels = FFMAX(maxreslevels, codsty->nreslevels2decode);
 
                 for (int reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
                     Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
@@ -2541,6 +2602,7 @@ static int jpeg2000_setup_cbs(Jpeg2000DecoderContext *s, int *cbs_out)
         }
 
         *cbs_out = cbs;
+        *maxreslevels_out = maxreslevels;
     }
     return 0;
 }
@@ -2549,7 +2611,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
                                  int *got_frame, AVPacket *avpkt)
 {
     Jpeg2000DecoderContext *s = avctx->priv_data;
-    int ret, cbs;
+    int ret, cbs, maxreslevels;
 
     s->avctx     = avctx;
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
@@ -2592,6 +2654,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
         goto end;
     picture->pict_type = AV_PICTURE_TYPE_I;
     picture->key_frame = 1;
+    s->slices = avctx->active_thread_type == FF_THREAD_SLICE ? avctx->thread_count : 1;
 
     if (ret = jpeg2000_read_bitstream_packets(s))
         goto end;
@@ -2607,11 +2670,23 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
         }
     }
 
-    if ((ret = jpeg2000_setup_cbs(s, &cbs)))
+    if ((ret = jpeg2000_setup_cbs(s, &cbs, &maxreslevels)))
         goto end;
 
     avctx->execute2(avctx, jpeg2000_decode_cb, NULL, NULL, cbs);
-    avctx->execute2(avctx, jpeg2000_idwt, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents);
+
+    if (s->have_dwt97_int)
+        avctx->execute2(avctx, jpeg2000_dwt97_int_preshift, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents * s->slices);
+
+    for (s->reslevel = 0; s->reslevel < maxreslevels; s->reslevel++) {
+        for (s->dir = 0; s->dir < 2; s->dir++) {
+            avctx->execute2(avctx, jpeg2000_idwt, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents * s->slices);
+        }
+    }
+
+    if (s->have_dwt97_int)
+        avctx->execute2(avctx, jpeg2000_dwt97_int_postshift, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents * s->slices);
+
     avctx->execute2(avctx, jpeg2000_mct_write_frame, picture, NULL, s->numXtiles * s->numYtiles);
 
     jpeg2000_dec_cleanup(s);
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 42a92b6c64..921461b6d7 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -45,7 +45,6 @@
 #define I_LFTG_DELTA   29066ll
 #define I_LFTG_K       80621ll
 #define I_LFTG_X       53274ll
-#define I_PRESHIFT 8
 
 static inline void extend53(int *p, int i0, int i1)
 {
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 0589c8355c..d5e94c9916 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -32,6 +32,7 @@
 #define FF_DWT_MAX_DECLVLS 32 ///< max number of decomposition levels
 #define F_LFTG_K      1.230174104914001f
 #define F_LFTG_X      0.812893066115961f
+#define I_PRESHIFT 8
 
 enum DWTType {
     FF_DWT97,
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (2 preceding siblings ...)
  2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 04/13] lavc/jpeg2000dec: Implement IDWT slicing Tomas Härdin
@ 2022-06-14 14:41 ` Tomas Härdin
  2022-06-14 21:11   ` Michael Niedermayer
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc() Tomas Härdin
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:41 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0005-lavc-jpeg2000dec-Thread-init_tile.patch --]
[-- Type: text/x-patch, Size: 3343 bytes --]

From 080ebdc9bad130098bff575f9ce690b8a522c9f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Mon, 13 Jun 2022 15:09:17 +0200
Subject: [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()

---
 libavcodec/jpeg2000dec.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 9344630c6f..ef5167c29e 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -1015,12 +1015,19 @@ static int get_ppt(Jpeg2000DecoderContext *s, int n)
     return 0;
 }
 
-static int init_tile(Jpeg2000DecoderContext *s, int tileno)
+static int init_tile(AVCodecContext *avctx, void *td,
+                     int jobnr, int threadnr)
 {
-    int compno;
-    int tilex = tileno % s->numXtiles;
-    int tiley = tileno / s->numXtiles;
-    Jpeg2000Tile *tile = s->tile + tileno;
+    Jpeg2000DecoderContext *s   = avctx->priv_data;
+    int tileno                  = jobnr / s->ncomponents;
+    int tilex                   = tileno % s->numXtiles;
+    int tiley                   = tileno / s->numXtiles;
+    int compno                  = jobnr % s->ncomponents;
+    Jpeg2000Tile *tile          = s->tile + tileno;
+    Jpeg2000Component *comp     = tile->comp + compno;
+    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+    Jpeg2000QuantStyle  *qntsty = tile->qntsty + compno;
+    int ret; // global bandno
 
     if (!tile->comp)
         return AVERROR(ENOMEM);
@@ -1030,12 +1037,6 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
     tile->coord[1][0] = av_clip(tiley       * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
     tile->coord[1][1] = av_clip((tiley + 1) * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
 
-    for (compno = 0; compno < s->ncomponents; compno++) {
-        Jpeg2000Component *comp = tile->comp + compno;
-        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-        Jpeg2000QuantStyle  *qntsty = tile->qntsty + compno;
-        int ret; // global bandno
-
         comp->coord_o[0][0] = tile->coord[0][0];
         comp->coord_o[0][1] = tile->coord[0][1];
         comp->coord_o[1][0] = tile->coord[1][0];
@@ -1059,7 +1060,7 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
                                              s->cbps[compno], s->cdx[compno],
                                              s->cdy[compno], s->avctx, s->slices))
             return ret;
-    }
+
     return 0;
 }
 
@@ -2367,9 +2368,6 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
         Jpeg2000Tile *tile = s->tile + tileno;
 
-        if ((ret = init_tile(s, tileno)) < 0)
-            return ret;
-
         if ((ret = jpeg2000_decode_packets(s, tile)) < 0)
             return ret;
     }
@@ -2656,6 +2654,8 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
     picture->key_frame = 1;
     s->slices = avctx->active_thread_type == FF_THREAD_SLICE ? avctx->thread_count : 1;
 
+    avctx->execute2(avctx, init_tile, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents);
+
     if (ret = jpeg2000_read_bitstream_packets(s))
         goto end;
 
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (3 preceding siblings ...)
  2022-06-14 14:41 ` [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile() Tomas Härdin
@ 2022-06-14 14:42 ` Tomas Härdin
  2022-06-14 20:26   ` Michael Niedermayer
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations Tomas Härdin
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 81 bytes --]

Left this as an ff_ function for now since it's only used by the j2k
code

/Tomas

[-- Attachment #2: 0006-lavu-mem-Add-ff_fast_recalloc.patch --]
[-- Type: text/x-patch, Size: 3918 bytes --]

From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 14 Jun 2022 13:35:18 +0200
Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()

---
 libavutil/mem.c | 24 +++++++++++++++++++++
 libavutil/mem.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/libavutil/mem.c b/libavutil/mem.c
index a0c9a42849..7781b715a0 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -530,6 +530,30 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size)
     return ptr;
 }
 
+int ff_fast_recalloc(void *ptr, unsigned int *size, size_t nelem, size_t elsize)
+{
+    void *val;
+    void *new_ptr;
+    unsigned int new_size = *size;
+    size_t product;
+    int ret;
+    memcpy(&val, ptr, sizeof(val));
+
+    if ((ret = av_size_mult(nelem, elsize, &product)) < 0)
+        return ret;
+
+    if (!(new_ptr = av_fast_realloc(val, &new_size, product)))
+        return AVERROR(ENOMEM);
+
+    if (new_size > *size) {
+        memset((uint8_t*)new_ptr + *size, 0, new_size - *size);
+        *size = new_size;
+        memcpy(ptr, &new_ptr, sizeof(new_ptr));
+    }
+
+    return 0;
+}
+
 static inline void fast_malloc(void *ptr, unsigned int *size, size_t min_size, int zero_realloc)
 {
     size_t max_size;
diff --git a/libavutil/mem.h b/libavutil/mem.h
index d91174196c..74abf3dce2 100644
--- a/libavutil/mem.h
+++ b/libavutil/mem.h
@@ -380,6 +380,61 @@ int av_reallocp_array(void *ptr, size_t nmemb, size_t size);
  */
 void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size);
 
+/**
+ * Reallocate the pointed-to buffer if it is not large enough, otherwise do
+ * nothing. Old data is memcpy()'d to the start of the new buffer. The newly
+ * allocated space at the end of the buffer is zero-initialized. In other
+ * words the buffer is expanded with zeroes when necessary.
+ *
+ * If the pointed-to buffer is `NULL`, then a new zero-initialized buffer is
+ * allocated.
+ *
+ * If the pointed-to buffer is not large enough, and reallocation fails,
+ * `AVERROR(ENOMEM)` is returned.
+ *
+ * If nelem*elsize is too large then `AVERROR(EINVAL)` is returned.
+ *
+ * Contrary to av_fast_malloc(), *ptr and *size are not touched in case of
+ * error, to allow for proper cleanup.
+ *
+ * *size is not guaranteed to be an exact multiple of elsize bytes.
+ *
+ * This function is intended for use with arrays of structures that contain
+ * pointers that are allowed to grow and typically don't shrink.
+ *
+ * A typical use pattern follows:
+ *
+ * @code{.c}
+ * int foo_work(SomeContext *s) {
+ *     if (ff_fast_recalloc(&s->foo, &s->foo_size, s->nfoo, sizeof(Foo)))
+ *         return AVERROR(ENOMEM);
+ *     for (x = 0; x < s->nfoo; x++)
+ *         do stuff with s->foo[x]
+ *     return 0;
+ * }
+ *
+ * void foo_close(SomeContext *s) {
+ *     // note the use of s->foo_size, not s->nfoo
+ *     for (x = 0; x < s->foo_size/sizeof(Foo); x++)
+ *         av_freep(&s->foo[x].bar);
+ *     av_freep(&s->foo);
+ * }
+ * @endcode
+ *
+ * @param[in,out] ptr      Pointer to pointer to an already allocated buffer.
+ *                         `*ptr` will be overwritten with pointer to new
+ *                         buffer on success and will be left alone on failure
+ * @param[in,out] size     Pointer to the size of buffer `*ptr`. `*size` is
+ *                         updated to the new allocated size and will be left
+ *                         alone on failure.
+ * @param[in]     nelem    Number of desired elements in *ptr
+ * @param[in]     elsize   Size of each element in *ptr
+ * @return Zero on success, <0 on error.
+ * @see av_fast_realloc()
+ * @see av_fast_malloc()
+ */
+int ff_fast_recalloc(void *ptr, unsigned int *size, size_t nelem, size_t elsize);
+
 /**
  * Allocate a buffer, reusing the given one if large enough.
  *
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (4 preceding siblings ...)
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc() Tomas Härdin
@ 2022-06-14 14:42 ` Tomas Härdin
  2022-06-14 15:23   ` Andreas Rheinhardt
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 08/13] lavc/jpeg2000: Switch Jpeg2000TgtNode to int32_t parent Tomas Härdin
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0007-lavc-jpeg2000-Use-ff_fast_recalloc-to-eliminate-lots.patch --]
[-- Type: text/x-patch, Size: 16413 bytes --]

From 72a5f47503338a4fff816440ad64bc62cc23a738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Mon, 13 Jun 2022 17:04:10 +0200
Subject: [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate
 lots of allocations

---
 libavcodec/jpeg2000.c    | 72 +++++++++++++++++++++-------------------
 libavcodec/jpeg2000.h    |  9 +++++
 libavcodec/jpeg2000dec.c | 28 ++++++++--------
 libavcodec/jpeg2000dwt.c |  9 +++--
 libavcodec/jpeg2000dwt.h |  2 ++
 5 files changed, 70 insertions(+), 50 deletions(-)

diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index 945b787565..7ec5986875 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -52,17 +52,23 @@ static int32_t tag_tree_size(int w, int h)
 }
 
 /* allocate the memory for tag tree */
-static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
+static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size, int w, int h)
 {
     int pw = w, ph = h;
-    Jpeg2000TgtNode *res, *t, *t2;
+    Jpeg2000TgtNode *t, *t2;
     int32_t tt_size;
+    size_t prod;
 
     tt_size = tag_tree_size(w, h);
 
-    t = res = av_calloc(tt_size, sizeof(*t));
-    if (!res)
-        return NULL;
+    if (av_size_mult(tt_size, sizeof(*t), &prod))
+        return AVERROR(ENOMEM);
+
+    av_fast_malloc(old, size, prod);
+    if (!*old)
+        return AVERROR(ENOMEM);
+    t = *old;
+    memset(*old, 0, prod);
 
     while (w > 1 || h > 1) {
         int i, j;
@@ -80,7 +86,7 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
         t = t2;
     }
     t[0].parent = NULL;
-    return res;
+    return 0;
 }
 
 void ff_tag_tree_zero(Jpeg2000TgtNode *t, int w, int h, int val)
@@ -316,16 +322,14 @@ static int init_prec(AVCodecContext *avctx,
 
 
     /* Tag trees initialization */
-    prec->cblkincl =
-        ff_jpeg2000_tag_tree_init(prec->nb_codeblocks_width,
-                                  prec->nb_codeblocks_height);
-    if (!prec->cblkincl)
-        return AVERROR(ENOMEM);
-
-    prec->zerobits =
-        ff_jpeg2000_tag_tree_init(prec->nb_codeblocks_width,
-                                  prec->nb_codeblocks_height);
-    if (!prec->zerobits)
+    if (ff_jpeg2000_tag_tree_init(&prec->cblkincl,
+                                  &prec->cblkincl_size,
+                                  prec->nb_codeblocks_width,
+                                  prec->nb_codeblocks_height) ||
+        ff_jpeg2000_tag_tree_init(&prec->zerobits,
+                                  &prec->zerobits_size,
+                                  prec->nb_codeblocks_width,
+                                  prec->nb_codeblocks_height))
         return AVERROR(ENOMEM);
 
     if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT_MAX) {
@@ -333,8 +337,7 @@ static int init_prec(AVCodecContext *avctx,
         return AVERROR(ENOMEM);
     }
     nb_codeblocks = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
-    prec->cblk = av_calloc(nb_codeblocks, sizeof(*prec->cblk));
-    if (!prec->cblk)
+    if (ff_fast_recalloc(&prec->cblk, &prec->cblk_size, nb_codeblocks, sizeof(*prec->cblk)))
         return AVERROR(ENOMEM);
     for (cblkno = 0; cblkno < nb_codeblocks; cblkno++) {
         Jpeg2000Cblk *cblk = prec->cblk + cblkno;
@@ -376,6 +379,7 @@ static int init_prec(AVCodecContext *avctx,
         cblk->length    = 0;
         cblk->npasses   = 0;
         if (av_codec_is_encoder(avctx->codec)) {
+            av_freep(&cblk->layers);
             cblk->layers = av_calloc(codsty->nlayers, sizeof(*cblk->layers));
             if (!cblk->layers)
                 return AVERROR(ENOMEM);
@@ -448,8 +452,7 @@ static int init_band(AVCodecContext *avctx,
         return AVERROR(ENOMEM);
     }
     nb_precincts = reslevel->num_precincts_x * reslevel->num_precincts_y;
-    band->prec = av_calloc(nb_precincts, sizeof(*band->prec));
-    if (!band->prec)
+    if (ff_fast_recalloc(&band->prec, &band->prec_size, nb_precincts, sizeof(*band->prec)))
         return AVERROR(ENOMEM);
 
     for (precno = 0; precno < nb_precincts; precno++) {
@@ -471,6 +474,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
 {
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
+    size_t prod;
 
     if (codsty->nreslevels2decode <= 0) {
         av_log(avctx, AV_LOG_ERROR, "nreslevels2decode %d invalid or uninitialized\n", codsty->nreslevels2decode);
@@ -496,19 +500,22 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
 
     if (codsty->transform == FF_DWT97) {
         csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->f_data);
-        comp->i_data = NULL;
-        comp->f_data = av_calloc(csize, sizeof(*comp->f_data));
+        if (av_size_mult(csize, sizeof(*comp->f_data), &prod))
+            return AVERROR(ENOMEM);
+        av_fast_malloc(&comp->f_data, &comp->f_data_size, prod);
         if (!comp->f_data)
             return AVERROR(ENOMEM);
+        memset(comp->f_data, 0, prod);
     } else {
         csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->i_data);
-        comp->f_data = NULL;
-        comp->i_data = av_calloc(csize, sizeof(*comp->i_data));
+        if (av_size_mult(csize, sizeof(*comp->i_data), &prod))
+            return AVERROR(ENOMEM);
+        av_fast_malloc(&comp->i_data, &comp->i_data_size, prod);
         if (!comp->i_data)
             return AVERROR(ENOMEM);
+        memset(comp->i_data, 0, prod);
     }
-    comp->reslevel = av_calloc(codsty->nreslevels, sizeof(*comp->reslevel));
-    if (!comp->reslevel)
+    if (ff_fast_recalloc(&comp->reslevel, &comp->reslevel_size, codsty->nreslevels, sizeof(*comp->reslevel)))
         return AVERROR(ENOMEM);
     /* LOOP on resolution levels */
     for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
@@ -555,8 +562,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                         reslevel->log2_prec_height) -
                 (reslevel->coord[1][0] >> reslevel->log2_prec_height);
 
-        reslevel->band = av_calloc(reslevel->nbands, sizeof(*reslevel->band));
-        if (!reslevel->band)
+        if (ff_fast_recalloc(&reslevel->band, &reslevel->band_size, reslevel->nbands, sizeof(*reslevel->band)))
             return AVERROR(ENOMEM);
 
         if (reslevel->num_precincts_x * (uint64_t)reslevel->num_precincts_y * reslevel->nbands > avctx->max_pixels / sizeof(*reslevel->band->prec))
@@ -599,7 +605,7 @@ void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 {
     int reslevelno, bandno, precno;
     for (reslevelno = 0;
-         comp->reslevel && reslevelno < codsty->nreslevels;
+         comp->reslevel && reslevelno < comp->reslevel_size/sizeof(*comp->reslevel);
          reslevelno++) {
         Jpeg2000ResLevel *reslevel;
 
@@ -607,23 +613,21 @@ void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
             continue;
 
         reslevel = comp->reslevel + reslevelno;
-        for (bandno = 0; bandno < reslevel->nbands; bandno++) {
+        for (bandno = 0; bandno < reslevel->band_size/sizeof(*reslevel->band); bandno++) {
             Jpeg2000Band *band;
 
             if (!reslevel->band)
                 continue;
 
             band = reslevel->band + bandno;
-            for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++) {
+            for (precno = 0; precno < band->prec_size/sizeof(*band->prec); precno++) {
                 if (band->prec) {
                     Jpeg2000Prec *prec = band->prec + precno;
-                    int nb_code_blocks = prec->nb_codeblocks_height * prec->nb_codeblocks_width;
-
                     av_freep(&prec->zerobits);
                     av_freep(&prec->cblkincl);
                     if (prec->cblk) {
                         int cblkno;
-                        for (cblkno = 0; cblkno < nb_code_blocks; cblkno ++) {
+                        for (cblkno = 0; cblkno < prec->cblk_size/sizeof(*prec->cblk); cblkno ++) {
                             Jpeg2000Cblk *cblk = &prec->cblk[cblkno];
                             av_freep(&cblk->data);
                             av_freep(&cblk->passes);
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index cbb8e0d951..3bf85a6669 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -177,6 +177,7 @@ typedef struct Jpeg2000Cblk {
     uint8_t incl;
     uint16_t length;
     uint16_t *lengthinc;
+    unsigned int lengthinc_size;
     uint8_t nb_lengthinc;
     uint8_t lblock;
     uint8_t *data;
@@ -193,8 +194,11 @@ typedef struct Jpeg2000Prec {
     int nb_codeblocks_width;
     int nb_codeblocks_height;
     Jpeg2000TgtNode *zerobits;
+    unsigned int zerobits_size;
     Jpeg2000TgtNode *cblkincl;
+    unsigned int cblkincl_size;
     Jpeg2000Cblk *cblk;
+    unsigned int cblk_size;
     int decoded_layers;
     int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Prec; // precinct
@@ -205,6 +209,7 @@ typedef struct Jpeg2000Band {
     int i_stepsize; // quantization stepsize
     float f_stepsize; // quantization stepsize
     Jpeg2000Prec *prec;
+    unsigned int prec_size;
 } Jpeg2000Band; // subband
 
 typedef struct Jpeg2000ResLevel {
@@ -213,13 +218,17 @@ typedef struct Jpeg2000ResLevel {
     int num_precincts_x, num_precincts_y; // number of precincts in x/y direction
     uint8_t log2_prec_width, log2_prec_height; // exponent of precinct size
     Jpeg2000Band *band;
+    unsigned int band_size;
 } Jpeg2000ResLevel; // resolution level
 
 typedef struct Jpeg2000Component {
     Jpeg2000ResLevel *reslevel;
+    unsigned int reslevel_size;
     DWTContext dwt;
     float *f_data;
+    unsigned int f_data_size;
     int *i_data;
+    unsigned int i_data_size;
     int coord[2][2];   // border coordinates {{x0, x1}, {y0, y1}} -- can be reduced with lowres option
     int coord_o[2][2]; // border coordinates {{x0, x1}, {y0, y1}} -- original values from jpeg2000 headers
     uint8_t roi_shift; // ROI scaling value for the component
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index ef5167c29e..a3fc05ea97 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -79,6 +79,7 @@ typedef struct Jpeg2000TilePart {
  * one per component, so tile_part elements have a size of 3 */
 typedef struct Jpeg2000Tile {
     Jpeg2000Component   *comp;
+    unsigned int        comp_size;
     uint8_t             properties[4];
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
@@ -141,6 +142,7 @@ typedef struct Jpeg2000DecoderContext {
     int             curtileno;
 
     Jpeg2000Tile    *tile;
+    unsigned int    tile_size;
     Jpeg2000DSPContext dsp;
 
     /*options parameters*/
@@ -380,8 +382,7 @@ static int get_siz(Jpeg2000DecoderContext *s)
         return AVERROR(EINVAL);
     }
 
-    s->tile = av_calloc(s->numXtiles * s->numYtiles, sizeof(*s->tile));
-    if (!s->tile) {
+    if (ff_fast_recalloc(&s->tile, &s->tile_size, s->numXtiles * s->numYtiles, sizeof(*s->tile))) {
         s->numXtiles = s->numYtiles = 0;
         return AVERROR(ENOMEM);
     }
@@ -389,8 +390,7 @@ static int get_siz(Jpeg2000DecoderContext *s)
     for (i = 0; i < s->numXtiles * s->numYtiles; i++) {
         Jpeg2000Tile *tile = s->tile + i;
 
-        tile->comp = av_mallocz(s->ncomponents * sizeof(*tile->comp));
-        if (!tile->comp)
+        if (ff_fast_recalloc(&tile->comp, &tile->comp_size, s->ncomponents, sizeof(*tile->comp)))
             return AVERROR(ENOMEM);
     }
 
@@ -1196,9 +1196,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
 
             cblk->nb_lengthinc = 0;
             cblk->nb_terminationsinc = 0;
-            av_free(cblk->lengthinc);
-            cblk->lengthinc = av_calloc(newpasses, sizeof(*cblk->lengthinc));
-            if (!cblk->lengthinc)
+            if (ff_fast_recalloc(&cblk->lengthinc, &cblk->lengthinc_size, newpasses, sizeof(*cblk->lengthinc)))
                 return AVERROR(ENOMEM);
             tmp = av_realloc_array(cblk->data_start, cblk->nb_terminations + newpasses + 1, sizeof(*cblk->data_start));
             if (!tmp)
@@ -1292,7 +1290,6 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
                     cblk->data_start[cblk->nb_terminations] = cblk->length;
                 }
             }
-            av_freep(&cblk->lengthinc);
         }
     }
     // Save state of stream
@@ -2166,12 +2163,13 @@ static int jpeg2000_mct_write_frame(AVCodecContext *avctx, void *td,
     return 0;
 }
 
-static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
+static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s, int close)
 {
     int tileno, compno;
-    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
+    if (close) {
+    for (tileno = 0; tileno < s->tile_size/sizeof(*s->tile); tileno++) {
         if (s->tile[tileno].comp) {
-            for (compno = 0; compno < s->ncomponents; compno++) {
+            for (compno = 0; compno < s->tile[tileno].comp_size/sizeof(*s->tile[tileno].comp); compno++) {
                 Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
                 Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
 
@@ -2182,10 +2180,11 @@ static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
             s->tile[tileno].packed_headers_size = 0;
         }
     }
+    av_freep(&s->tile);
+    }
     av_freep(&s->packed_headers);
     s->packed_headers_size = 0;
     memset(&s->packed_headers_stream, 0, sizeof(s->packed_headers_stream));
-    av_freep(&s->tile);
     memset(s->codsty, 0, sizeof(s->codsty));
     memset(s->qntsty, 0, sizeof(s->qntsty));
     memset(s->properties, 0, sizeof(s->properties));
@@ -2689,7 +2688,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
 
     avctx->execute2(avctx, jpeg2000_mct_write_frame, picture, NULL, s->numXtiles * s->numYtiles);
 
-    jpeg2000_dec_cleanup(s);
+    jpeg2000_dec_cleanup(s, 0);
 
     *got_frame = 1;
 
@@ -2702,7 +2701,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
     return bytestream2_tell(&s->g);
 
 end:
-    jpeg2000_dec_cleanup(s);
+    jpeg2000_dec_cleanup(s, 0);
     return ret;
 }
 
@@ -2712,6 +2711,7 @@ static av_cold int jpeg2000_decode_close(AVCodecContext *avctx)
 
     av_freep(&s->idwt);
     av_freep(&s->cb);
+    jpeg2000_dec_cleanup(s, 1);
 
     return 0;
 }
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 921461b6d7..f3ddefe48f 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -531,6 +531,7 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
 {
     int i, j, lev = decomp_levels,
         b[2][2];
+    size_t prod;
 
     s->ndeclevels = decomp_levels;
     s->type       = type;
@@ -556,11 +557,15 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
         }
 
     if (type == FF_DWT97) {
-        s->f_linebuf = av_malloc_array(s->linesize, s->max_slices*sizeof(*s->f_linebuf));
+        if (av_size_mult(s->linesize, s->max_slices*sizeof(*s->f_linebuf), &prod))
+            return AVERROR(ENOMEM);
+        av_fast_malloc(&s->f_linebuf, &s->f_linebuf_size, prod);
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
     } else {
-        s->i_linebuf = av_malloc_array(s->linesize, s->max_slices*sizeof(*s->i_linebuf));
+        if (av_size_mult(s->linesize, s->max_slices*sizeof(*s->i_linebuf), &prod))
+            return AVERROR(ENOMEM);
+        av_fast_malloc(&s->i_linebuf, &s->i_linebuf_size, prod);
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
     }
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index d5e94c9916..fb6fc8f121 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -48,7 +48,9 @@ typedef struct DWTContext {
     uint8_t ndeclevels;                  ///< number of decomposition levels
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
     int32_t *i_linebuf;                  ///< int buffer used by transform
+    unsigned int i_linebuf_size;
     float   *f_linebuf;                  ///< float buffer used by transform
+    unsigned int f_linebuf_size;
     int max_slices;
     int linesize;
 } DWTContext;
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 08/13] lavc/jpeg2000: Switch Jpeg2000TgtNode to int32_t parent
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (5 preceding siblings ...)
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations Tomas Härdin
@ 2022-06-14 14:43 ` Tomas Härdin
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4 Tomas Härdin
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0008-lavc-jpeg2000-Switch-Jpeg2000TgtNode-to-int32_t-pare.patch --]
[-- Type: text/x-patch, Size: 9164 bytes --]

From c0e00cf03f5a1fcffc90395d4b26607e1681690c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 7 Jun 2022 16:43:40 +0200
Subject: [PATCH 08/13] lavc/jpeg2000: Switch Jpeg2000TgtNode to int32_t parent

---
 libavcodec/j2kenc.c      | 44 ++++++++++++++++++++--------------------
 libavcodec/jpeg2000.c    | 20 +++++++++---------
 libavcodec/jpeg2000.h    |  2 +-
 libavcodec/jpeg2000dec.c | 18 ++++++++--------
 4 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/libavcodec/j2kenc.c b/libavcodec/j2kenc.c
index 4de596ffa9..a3d8144acc 100644
--- a/libavcodec/j2kenc.c
+++ b/libavcodec/j2kenc.c
@@ -249,36 +249,36 @@ static void j2k_flush(Jpeg2000EncoderContext *s)
 /* tag tree routines */
 
 /** code the value stored in node */
-static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *node, int threshold)
+static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *nodes, int32_t node, int threshold)
 {
-    Jpeg2000TgtNode *stack[30];
+    int32_t stack[30];
     int sp = -1, curval = 0;
 
-    while(node->parent){
+    while(nodes[node].parent >= 0){
         stack[++sp] = node;
-        node = node->parent;
+        node = nodes[node].parent;
     }
 
     while (1) {
-        if (curval > node->temp_val)
-            node->temp_val = curval;
+        if (curval > nodes[node].temp_val)
+            nodes[node].temp_val = curval;
         else {
-            curval = node->temp_val;
+            curval = nodes[node].temp_val;
         }
 
-        if (node->val >= threshold) {
+        if (nodes[node].val >= threshold) {
             put_bits(s, 0, threshold - curval);
             curval = threshold;
         } else {
-            put_bits(s, 0, node->val - curval);
-            curval = node->val;
-            if (!node->vis) {
+            put_bits(s, 0, nodes[node].val - curval);
+            curval = nodes[node].val;
+            if (!nodes[node].vis) {
                 put_bits(s, 1, 1);
-                node->vis = 1;
+                nodes[node].vis = 1;
             }
         }
 
-        node->temp_val = curval;
+        nodes[node].temp_val = curval;
         if (sp < 0)
             break;
         node = stack[sp--];
@@ -286,14 +286,14 @@ static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *node, int
 }
 
 /** update the value in node */
-static void tag_tree_update(Jpeg2000TgtNode *node)
+static void tag_tree_update(Jpeg2000TgtNode *nodes, int node)
 {
     int lev = 0;
-    while (node->parent){
-        if (node->parent->val <= node->val)
+    while (nodes[node].parent >= 0){
+        if (nodes[nodes[node].parent].val <= nodes[node].val)
             break;
-        node->parent->val = node->val;
-        node = node->parent;
+        nodes[nodes[node].parent].val = nodes[node].val;
+        node = nodes[node].parent;
         lev++;
     }
 }
@@ -814,7 +814,7 @@ static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, in
                     prec->zerobits[pos].val = expn[bandno] + numgbits - 1 - cblk->nonzerobits;
                     cblk->incl = 0;
                     cblk->lblock = 3;
-                    tag_tree_update(prec->zerobits + pos);
+                    tag_tree_update(prec->zerobits, pos);
                     for (i = 0; i < nlayers; i++) {
                         if (cblk->layers[i].npasses > 0) {
                             prec->cblkincl[pos].val = i;
@@ -823,7 +823,7 @@ static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, in
                     }
                     if (i == nlayers)
                         prec->cblkincl[pos].val = i;
-                    tag_tree_update(prec->cblkincl + pos);
+                    tag_tree_update(prec->cblkincl, pos);
                 }
             }
         }
@@ -877,7 +877,7 @@ static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, in
 
                 // inclusion information
                 if (!cblk->incl)
-                    tag_tree_code(s, prec->cblkincl + pos, layno + 1);
+                    tag_tree_code(s, prec->cblkincl, pos, layno + 1);
                 else {
                     put_bits(s, cblk->layers[layno].npasses > 0, 1);
                 }
@@ -887,7 +887,7 @@ static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, in
 
                 // zerobits information
                 if (!cblk->incl) {
-                    tag_tree_code(s, prec->zerobits + pos, 100);
+                    tag_tree_code(s, prec->zerobits, pos, 100);
                     cblk->incl = 1;
                 }
 
diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index 7ec5986875..0bec2e187d 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -55,8 +55,8 @@ static int32_t tag_tree_size(int w, int h)
 static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size, int w, int h)
 {
     int pw = w, ph = h;
-    Jpeg2000TgtNode *t, *t2;
-    int32_t tt_size;
+    Jpeg2000TgtNode *t;
+    int32_t tt_size, ofs = 0;
     size_t prod;
 
     tt_size = tag_tree_size(w, h);
@@ -77,15 +77,15 @@ static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size,
 
         w  = (w + 1) >> 1;
         h  = (h + 1) >> 1;
-        t2 = t + pw * ph;
+        ofs += pw * ph;
 
         for (i = 0; i < ph; i++)
             for (j = 0; j < pw; j++)
-                t[i * pw + j].parent = &t2[(i >> 1) * w + (j >> 1)];
+                t[i * pw + j].parent = (i >> 1) * w + (j >> 1) + ofs;
 
-        t = t2;
+        t += pw * ph;
     }
-    t[0].parent = NULL;
+    t[0].parent = -1;
     return 0;
 }
 
@@ -320,6 +320,10 @@ static int init_prec(AVCodecContext *avctx,
                                 band->log2_cblk_height)
         - (prec->coord[1][0] >> band->log2_cblk_height);
 
+    /* \sum_{i=0}^{\infty} 4^{-i} = 4/3 */
+    if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT32_MAX / 4 * 3) {
+        return AVERROR(ENOMEM);
+    }
 
     /* Tag trees initialization */
     if (ff_jpeg2000_tag_tree_init(&prec->cblkincl,
@@ -332,10 +336,6 @@ static int init_prec(AVCodecContext *avctx,
                                   prec->nb_codeblocks_height))
         return AVERROR(ENOMEM);
 
-    if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT_MAX) {
-        prec->cblk = NULL;
-        return AVERROR(ENOMEM);
-    }
     nb_codeblocks = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
     if (ff_fast_recalloc(&prec->cblk, &prec->cblk_size, nb_codeblocks, sizeof(*prec->cblk)))
         return AVERROR(ENOMEM);
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index 3bf85a6669..1fd9d193e7 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -126,10 +126,10 @@ typedef struct Jpeg2000T1Context {
 } Jpeg2000T1Context;
 
 typedef struct Jpeg2000TgtNode {
+    int32_t parent;
     uint8_t val;
     uint8_t temp_val;
     uint8_t vis;
-    struct Jpeg2000TgtNode *parent;
 } Jpeg2000TgtNode;
 
 typedef struct Jpeg2000CodingStyle {
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index a3fc05ea97..a2b9f0166b 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -185,24 +185,24 @@ static void jpeg2000_flush(Jpeg2000DecoderContext *s)
 }
 
 /* decode the value stored in node */
-static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
+static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *nodes, int32_t node,
                            int threshold)
 {
     Jpeg2000TgtNode *stack[30];
     int sp = -1, curval = 0;
 
-    if (!node) {
+    if (node < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "missing node\n");
         return AVERROR_INVALIDDATA;
     }
 
-    while (node && !node->vis) {
-        stack[++sp] = node;
-        node        = node->parent;
+    while (node >= 0 && !nodes[node].vis) {
+        stack[++sp] = &nodes[node];
+        node        = nodes[node].parent;
     }
 
-    if (node)
-        curval = node->val;
+    if (node >= 0)
+        curval = nodes[node].val;
     else
         curval = stack[sp]->val;
 
@@ -1161,7 +1161,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
             if (cblk->npasses)
                 incl = get_bits(s, 1);
             else
-                incl = tag_tree_decode(s, prec->cblkincl + cblkno, layno + 1) == layno;
+                incl = tag_tree_decode(s, prec->cblkincl, cblkno, layno + 1) == layno;
             if (!incl)
                 continue;
             else if (incl < 0)
@@ -1169,7 +1169,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
 
             if (!cblk->npasses) {
                 int v = expn[bandno] + numgbits - 1 -
-                        tag_tree_decode(s, prec->zerobits + cblkno, 100);
+                        tag_tree_decode(s, prec->zerobits, cblkno, 100);
                 if (v < 0 || v > 30) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "nonzerobits %d invalid or unsupported\n", v);
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (6 preceding siblings ...)
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 08/13] lavc/jpeg2000: Switch Jpeg2000TgtNode to int32_t parent Tomas Härdin
@ 2022-06-14 14:43 ` Tomas Härdin
  2022-06-18 15:00   ` Anton Khirnov
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 10/13] lavc/jpeg2000: Reindent Tomas Härdin
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0009-lavc-jpeg2000-Speed-up-ff_jpeg2000_tag_tree_init-usi.patch --]
[-- Type: text/x-patch, Size: 2551 bytes --]

From 03b806f89453571310dcb14edbd9f51e059b7476 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Wed, 8 Jun 2022 10:08:15 +0200
Subject: [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init()
 using stereotypes for sizes <= 4x4

---
 libavcodec/jpeg2000.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index 0bec2e187d..b80e68bcba 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -51,6 +51,31 @@ static int32_t tag_tree_size(int w, int h)
     return (int32_t)(res + 1);
 }
 
+#define T(x) (x*sizeof(Jpeg2000TgtNode))
+
+static const size_t tt_sizes[16] = {
+    T(1),T(3),T(6),T(7),T(3),T(5),T(9),T(11),T(6),T(9),T(14),T(17),T(7),T(11),T(17),T(21),
+};
+
+static const Jpeg2000TgtNode tt_stereotypes[16][21] = {
+    {{-1},},
+    {{2},{2},{-1},},
+    {{3},{3},{4},{5},{5},{-1},},
+    {{4},{4},{5},{5},{6},{6},{-1},},
+    {{2},{2},{-1},},
+    {{4},{4},{4},{4},{-1},},
+    {{6},{6},{7},{6},{6},{7},{8},{8},{-1},},
+    {{8},{8},{9},{9},{8},{8},{9},{9},{10},{10},{-1},},
+    {{3},{3},{4},{5},{5},{-1},},
+    {{6},{6},{6},{6},{7},{7},{8},{8},{-1},},
+    {{9},{9},{10},{9},{9},{10},{11},{11},{12},{13},{13},{13},{13},{-1},},
+    {{12},{12},{13},{13},{12},{12},{13},{13},{14},{14},{15},{15},{16},{16},{16},{16},{-1},},
+    {{4},{4},{5},{5},{6},{6},{-1},},
+    {{8},{8},{8},{8},{9},{9},{9},{9},{10},{10},{-1},},
+    {{12},{12},{13},{12},{12},{13},{14},{14},{15},{14},{14},{15},{16},{16},{16},{16},{-1},},
+    {{16},{16},{17},{17},{16},{16},{17},{17},{18},{18},{19},{19},{18},{18},{19},{19},{20},{20},{20},{20},{-1},},
+};
+
 /* allocate the memory for tag tree */
 static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size, int w, int h)
 {
@@ -59,6 +84,15 @@ static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size,
     int32_t tt_size, ofs = 0;
     size_t prod;
 
+    if (w <= 4 && h <= 4) {
+        int idx = w-1 + (h-1)*4;
+        size_t sz = tt_sizes[idx];
+        av_fast_malloc(old, size, sz);
+        if (*old) {
+            memcpy(*old, tt_stereotypes[idx], sz);
+        }
+        return 0;
+    } else {
     tt_size = tag_tree_size(w, h);
 
     if (av_size_mult(tt_size, sizeof(*t), &prod))
@@ -87,6 +121,7 @@ static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size,
     }
     t[0].parent = -1;
     return 0;
+    }
 }
 
 void ff_tag_tree_zero(Jpeg2000TgtNode *t, int w, int h, int val)
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 10/13] lavc/jpeg2000: Reindent
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (7 preceding siblings ...)
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4 Tomas Härdin
@ 2022-06-14 14:43 ` Tomas Härdin
  2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder() Tomas Härdin
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0010-lavc-jpeg2000-Reindent.patch --]
[-- Type: text/x-patch, Size: 2009 bytes --]

From d3aaf24ca4778e6ba280f99f9ce90cb15738699b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 14 Jun 2022 11:23:08 +0200
Subject: [PATCH 10/13] lavc/jpeg2000: Reindent

---
 libavcodec/jpeg2000.c | 44 +++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index b80e68bcba..8ee50b77c5 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -93,34 +93,34 @@ static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size,
         }
         return 0;
     } else {
-    tt_size = tag_tree_size(w, h);
+        tt_size = tag_tree_size(w, h);
 
-    if (av_size_mult(tt_size, sizeof(*t), &prod))
-        return AVERROR(ENOMEM);
+        if (av_size_mult(tt_size, sizeof(*t), &prod))
+            return AVERROR(ENOMEM);
 
-    av_fast_malloc(old, size, prod);
-    if (!*old)
-        return AVERROR(ENOMEM);
-    t = *old;
-    memset(*old, 0, prod);
+        av_fast_malloc(old, size, prod);
+        if (!*old)
+            return AVERROR(ENOMEM);
+        t = *old;
+        memset(*old, 0, prod);
 
-    while (w > 1 || h > 1) {
-        int i, j;
-        pw = w;
-        ph = h;
+        while (w > 1 || h > 1) {
+            int i, j;
+            pw = w;
+            ph = h;
 
-        w  = (w + 1) >> 1;
-        h  = (h + 1) >> 1;
-        ofs += pw * ph;
+            w  = (w + 1) >> 1;
+            h  = (h + 1) >> 1;
+            ofs += pw * ph;
 
-        for (i = 0; i < ph; i++)
-            for (j = 0; j < pw; j++)
-                t[i * pw + j].parent = (i >> 1) * w + (j >> 1) + ofs;
+            for (i = 0; i < ph; i++)
+                for (j = 0; j < pw; j++)
+                    t[i * pw + j].parent = (i >> 1) * w + (j >> 1) + ofs;
 
-        t += pw * ph;
-    }
-    t[0].parent = -1;
-    return 0;
+            t += pw * ph;
+        }
+        t[0].parent = -1;
+        return 0;
     }
 }
 
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder()
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (8 preceding siblings ...)
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 10/13] lavc/jpeg2000: Reindent Tomas Härdin
@ 2022-06-14 14:44 ` Tomas Härdin
  2022-06-14 15:04   ` Andreas Rheinhardt
  2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 12/13] lavc/jpeg2000dec: Use coarser slicing for initial reslevels Tomas Härdin
                   ` (2 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:44 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0011-lavc-jpeg2000-Minimize-calls-to-av_codec_is_encoder.patch --]
[-- Type: text/x-patch, Size: 4206 bytes --]

From 5b492d4e92a11946fd7425497205b1842fa1912c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 14 Jun 2022 10:57:45 +0200
Subject: [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder()

---
 libavcodec/jpeg2000.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index 8ee50b77c5..2e3c33303b 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -247,7 +247,7 @@ static void init_band_stepsize(AVCodecContext *avctx,
                                Jpeg2000CodingStyle *codsty,
                                Jpeg2000QuantStyle *qntsty,
                                int bandno, int gbandno, int reslevelno,
-                               int cbps)
+                               int cbps, int is_enc)
 {
     /* TODO: Implementation of quantization step not finished,
      * see ISO/IEC 15444-1:2002 E.1 and A.6.4. */
@@ -305,7 +305,7 @@ static void init_band_stepsize(AVCodecContext *avctx,
 
     /* FIXME: In OpenJPEG code stepsize = stepsize * 0.5. Why?
      * If not set output of entropic decoder is not correct. */
-    if (!av_codec_is_encoder(avctx->codec))
+    if (!is_enc)
         band->f_stepsize *= 0.5;
 }
 
@@ -316,7 +316,8 @@ static int init_prec(AVCodecContext *avctx,
                      Jpeg2000CodingStyle *codsty,
                      int precno, int bandno, int reslevelno,
                      int log2_band_prec_width,
-                     int log2_band_prec_height)
+                     int log2_band_prec_height,
+                     int is_enc)
 {
     Jpeg2000Prec *prec = band->prec + precno;
     int nb_codeblocks, cblkno;
@@ -413,7 +414,7 @@ static int init_prec(AVCodecContext *avctx,
         cblk->lblock    = 3;
         cblk->length    = 0;
         cblk->npasses   = 0;
-        if (av_codec_is_encoder(avctx->codec)) {
+        if (is_enc) {
             av_freep(&cblk->layers);
             cblk->layers = av_calloc(codsty->nlayers, sizeof(*cblk->layers));
             if (!cblk->layers)
@@ -430,7 +431,7 @@ static int init_band(AVCodecContext *avctx,
                      Jpeg2000CodingStyle *codsty,
                      Jpeg2000QuantStyle *qntsty,
                      int bandno, int gbandno, int reslevelno,
-                     int cbps, int dx, int dy)
+                     int cbps, int dx, int dy, int is_enc)
 {
     Jpeg2000Band *band = reslevel->band + bandno;
     uint8_t log2_band_prec_width, log2_band_prec_height;
@@ -439,7 +440,7 @@ static int init_band(AVCodecContext *avctx,
     int nb_precincts;
     int i, j, ret;
 
-    init_band_stepsize(avctx, band, codsty, qntsty, bandno, gbandno, reslevelno, cbps);
+    init_band_stepsize(avctx, band, codsty, qntsty, bandno, gbandno, reslevelno, cbps, is_enc);
 
     /* computation of tbx_0, tbx_1, tby_0, tby_1
      * see ISO/IEC 15444-1:2002 B.5 eq. B-15 and tbl B.1
@@ -493,7 +494,8 @@ static int init_band(AVCodecContext *avctx,
     for (precno = 0; precno < nb_precincts; precno++) {
         ret = init_prec(avctx, band, reslevel, comp, codsty,
                         precno, bandno, reslevelno,
-                        log2_band_prec_width, log2_band_prec_height);
+                        log2_band_prec_width, log2_band_prec_height,
+                        is_enc);
         if (ret < 0)
             return ret;
     }
@@ -510,6 +512,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
     size_t prod;
+    int is_enc = av_codec_is_encoder(avctx->codec);
 
     if (codsty->nreslevels2decode <= 0) {
         av_log(avctx, AV_LOG_ERROR, "nreslevels2decode %d invalid or uninitialized\n", codsty->nreslevels2decode);
@@ -607,7 +610,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
             ret = init_band(avctx, reslevel,
                             comp, codsty, qntsty,
                             bandno, gbandno, reslevelno,
-                            cbps, dx, dy);
+                            cbps, dx, dy, is_enc);
             if (ret < 0)
                 return ret;
         }
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 12/13] lavc/jpeg2000dec: Use coarser slicing for initial reslevels
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (9 preceding siblings ...)
  2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder() Tomas Härdin
@ 2022-06-14 14:44 ` Tomas Härdin
  2022-06-14 14:47 ` [FFmpeg-devel] [PATCH 13/13] lavc/jpeg2000dec: Component-level threading of write_frame() Tomas Härdin
  2022-06-18 14:50 ` [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Anton Khirnov
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:44 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0012-lavc-jpeg2000dec-Use-coarser-slicing-for-initial-res.patch --]
[-- Type: text/x-patch, Size: 1269 bytes --]

From 15761070d1cdc622ffbc5d6aeb0a50e063361012 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 14 Jun 2022 11:19:06 +0200
Subject: [PATCH 12/13] lavc/jpeg2000dec: Use coarser slicing for initial
 reslevels

This brings -lowres 2 lossless 4K J2K on an AMD EPYC 7R32 to 52 fps (2080% CPU).
---
 libavcodec/jpeg2000dec.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index a2b9f0166b..18ebe5219d 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -2679,7 +2679,16 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
 
     for (s->reslevel = 0; s->reslevel < maxreslevels; s->reslevel++) {
         for (s->dir = 0; s->dir < 2; s->dir++) {
+            int before = s->slices;
+            int div = s->slices >= 96 ? 7 : 5;
+
+            if (s->reslevel < div) {
+                int halve = 1<<(div - s->reslevel + (s->slices >= 96 ? 0 : 1 - s->dir));
+                s->slices = (s->slices + halve-1)/halve;
+            }
+
             avctx->execute2(avctx, jpeg2000_idwt, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents * s->slices);
+            s->slices = before;
         }
     }
 
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [FFmpeg-devel] [PATCH 13/13] lavc/jpeg2000dec: Component-level threading of write_frame()
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (10 preceding siblings ...)
  2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 12/13] lavc/jpeg2000dec: Use coarser slicing for initial reslevels Tomas Härdin
@ 2022-06-14 14:47 ` Tomas Härdin
  2022-06-18 14:50 ` [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Anton Khirnov
  12 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-14 14:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 212 bytes --]

Don't have access to the full machine to test this with 96 threads. On
2/3rds of an AMD EPYC 7R32 (-threads 64) it runs at 50 fps.
Specifically the decoder uses 59.2 seconds to decode a 60.0 second
clip.

/Tomas

[-- Attachment #2: 0013-lavc-jpeg2000dec-Component-level-threading-of-write_.patch --]
[-- Type: text/x-patch, Size: 5575 bytes --]

From 19fc2413dc2bafff577c68830cde48e08138771e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Tue, 14 Jun 2022 15:45:32 +0200
Subject: [PATCH 13/13] lavc/jpeg2000dec: Component-level threading of
 write_frame()

Split off MCT and don't bother with it unless the picture actually uses MCT.
---
 libavcodec/jpeg2000dec.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 18ebe5219d..8eaeda1c66 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -156,6 +156,7 @@ typedef struct Jpeg2000DecoderContext {
     // used for idwt slicing
     int reslevel, dir, slices;
     int have_dwt97_int; // 1 if any coding style is FF_DWT97_INT
+    int have_mct;
 } Jpeg2000DecoderContext;
 
 /* get_bits functions for JPEG2000 packet bitstream
@@ -600,6 +601,9 @@ static int get_cod(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
         return AVERROR_INVALIDDATA;
     }
 
+    if (tmp.mct)
+        s->have_mct = 1;
+
     if ((ret = get_cox(s, &tmp)) < 0)
         return ret;
     tmp.init = 1;
@@ -2073,16 +2077,14 @@ static int jpeg2000_dwt97_int_postshift(AVCodecContext *avctx, void *td,
 
 #define WRITE_FRAME(D, PIXEL)                                                                     \
     static inline void write_frame_ ## D(Jpeg2000DecoderContext * s, Jpeg2000Tile * tile,         \
-                                         AVFrame * picture, int precision)                        \
+                                         AVFrame * picture, int precision, int compno)            \
     {                                                                                             \
         const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->avctx->pix_fmt);               \
         int planar    = !!(pixdesc->flags & AV_PIX_FMT_FLAG_PLANAR);                              \
         int pixelsize = planar ? 1 : pixdesc->nb_components;                                      \
                                                                                                   \
-        int compno;                                                                               \
         int x, y;                                                                                 \
                                                                                                   \
-        for (compno = 0; compno < s->ncomponents; compno++) {                                     \
             Jpeg2000Component *comp     = tile->comp + compno;                                    \
             Jpeg2000CodingStyle *codsty = tile->codsty + compno;                                  \
             PIXEL *line;                                                                          \
@@ -2129,8 +2131,6 @@ static int jpeg2000_dwt97_int_postshift(AVCodecContext *avctx, void *td,
                 }                                                                                 \
                 line += picture->linesize[plane] / sizeof(PIXEL);                                 \
             }                                                                                     \
-        }                                                                                         \
-                                                                                                  \
     }
 
 WRITE_FRAME(8, uint8_t)
@@ -2138,26 +2138,36 @@ WRITE_FRAME(16, uint16_t)
 
 #undef WRITE_FRAME
 
-static int jpeg2000_mct_write_frame(AVCodecContext *avctx, void *td,
-                                    int jobnr, int threadnr)
+static int jpeg2000_mct(AVCodecContext *avctx, void *td,
+                        int jobnr, int threadnr)
 {
     Jpeg2000DecoderContext *s = avctx->priv_data;
-    AVFrame *picture = td;
     Jpeg2000Tile *tile = s->tile + jobnr;
 
     /* inverse MCT transformation */
     if (tile->codsty[0].mct)
         mct_decode(s, tile);
 
+    return 0;
+}
+
+static int jpeg2000_write_frame(AVCodecContext *avctx, void *td,
+                                int jobnr, int threadnr)
+{
+    Jpeg2000DecoderContext *s = avctx->priv_data;
+    AVFrame *picture = td;
+    Jpeg2000Tile *tile = s->tile + jobnr / s->ncomponents;
+    int compno = jobnr % s->ncomponents;
+
     if (s->precision <= 8) {
-        write_frame_8(s, tile, picture, 8);
+        write_frame_8(s, tile, picture, 8, compno);
     } else {
         int precision = picture->format == AV_PIX_FMT_XYZ12 ||
                         picture->format == AV_PIX_FMT_RGB48 ||
                         picture->format == AV_PIX_FMT_RGBA64 ||
                         picture->format == AV_PIX_FMT_GRAY16 ? 16 : s->precision;
 
-        write_frame_16(s, tile, picture, precision);
+        write_frame_16(s, tile, picture, precision, compno);
     }
 
     return 0;
@@ -2695,7 +2705,10 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
     if (s->have_dwt97_int)
         avctx->execute2(avctx, jpeg2000_dwt97_int_postshift, NULL, NULL, s->numXtiles * s->numYtiles * s->ncomponents * s->slices);
 
-    avctx->execute2(avctx, jpeg2000_mct_write_frame, picture, NULL, s->numXtiles * s->numYtiles);
+    if (s->have_mct)
+        avctx->execute2(avctx, jpeg2000_mct, NULL, NULL, s->numXtiles * s->numYtiles);
+
+    avctx->execute2(avctx, jpeg2000_write_frame, picture, NULL, s->numXtiles * s->numYtiles * s->ncomponents);
 
     jpeg2000_dec_cleanup(s, 0);
 
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder()
  2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder() Tomas Härdin
@ 2022-06-14 15:04   ` Andreas Rheinhardt
  2022-06-15 10:20     ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Andreas Rheinhardt @ 2022-06-14 15:04 UTC (permalink / raw)
  To: ffmpeg-devel

Tomas Härdin:
> 
> 

Why call it at all? Why not just add a new parameter to
ff_jpeg2000_init_component that is always set to 1 when called from the
encoder and 0 when called from the decoder?
(And is this really a bottleneck?)

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations Tomas Härdin
@ 2022-06-14 15:23   ` Andreas Rheinhardt
  2022-06-15 10:03     ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Andreas Rheinhardt @ 2022-06-14 15:23 UTC (permalink / raw)
  To: ffmpeg-devel

Tomas Härdin:
> 
> 
> @@ -2166,12 +2163,13 @@ static int jpeg2000_mct_write_frame(AVCodecContext *avctx, void *td,
>      return 0;
>  }
>  
> -static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
> +static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s, int close)
>  {
>      int tileno, compno;
> -    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
> +    if (close) {
> +    for (tileno = 0; tileno < s->tile_size/sizeof(*s->tile); tileno++) {
>          if (s->tile[tileno].comp) {
> -            for (compno = 0; compno < s->ncomponents; compno++) {
> +            for (compno = 0; compno < s->tile[tileno].comp_size/sizeof(*s->tile[tileno].comp); compno++) {
>                  Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
>                  Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
>  
> @@ -2182,10 +2180,11 @@ static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
>              s->tile[tileno].packed_headers_size = 0;
>          }
>      }
> +    av_freep(&s->tile);
> +    }
>      av_freep(&s->packed_headers);
>      s->packed_headers_size = 0;
>      memset(&s->packed_headers_stream, 0, sizeof(s->packed_headers_stream));
> -    av_freep(&s->tile);
>      memset(s->codsty, 0, sizeof(s->codsty));
>      memset(s->qntsty, 0, sizeof(s->qntsty));
>      memset(s->properties, 0, sizeof(s->properties));
> @@ -2689,7 +2688,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
>  
>      avctx->execute2(avctx, jpeg2000_mct_write_frame, picture, NULL, s->numXtiles * s->numYtiles);
>  
> -    jpeg2000_dec_cleanup(s);
> +    jpeg2000_dec_cleanup(s, 0);
>  
>      *got_frame = 1;
>  
> @@ -2702,7 +2701,7 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
>      return bytestream2_tell(&s->g);
>  
>  end:
> -    jpeg2000_dec_cleanup(s);
> +    jpeg2000_dec_cleanup(s, 0);
>      return ret;
>  }
>  
> @@ -2712,6 +2711,7 @@ static av_cold int jpeg2000_decode_close(AVCodecContext *avctx)
>  
>      av_freep(&s->idwt);
>      av_freep(&s->cb);
> +    jpeg2000_dec_cleanup(s, 1);
>  
>      return 0;
>  }

Why don't you just move the part of jpeg2000_dec_cleanup() that you
intend to be only executed in jpeg2000_decode_close() to
jpeg2000_decode_close()?

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc() Tomas Härdin
@ 2022-06-14 20:26   ` Michael Niedermayer
  2022-06-15  9:59     ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Michael Niedermayer @ 2022-06-14 20:26 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 945 bytes --]

On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
> Left this as an ff_ function for now since it's only used by the j2k
> code
> 
> /Tomas

>  mem.c |   24 ++++++++++++++++++++++++
>  mem.h |   55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 79 insertions(+)
> 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-Add-ff_fast_recalloc.patch
> From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> Date: Tue, 14 Jun 2022 13:35:18 +0200
> Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()

You cannot call an ff_* function that's in libavutil from outside libavutil;
this will fail with shared libs as the ff_* stuff is not exported.

thx

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Observe your enemies, for they first find out your faults. -- Antisthenes

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
  2022-06-14 14:41 ` [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile() Tomas Härdin
@ 2022-06-14 21:11   ` Michael Niedermayer
  2022-06-15 13:11     ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Michael Niedermayer @ 2022-06-14 21:11 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 1281 bytes --]

On Tue, Jun 14, 2022 at 04:41:14PM +0200, Tomas Härdin wrote:
> 

>  jpeg2000dec.c |   30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 6fa2fbf99afee36ee73459863df0527a72663f43  0005-lavc-jpeg2000dec-Thread-init_tile.patch
> From 080ebdc9bad130098bff575f9ce690b8a522c9f7 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> Date: Mon, 13 Jun 2022 15:09:17 +0200
> Subject: [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()

Causes segfaults

[jpeg2000 @ 0x2cf53380] End mismatch 149
[jpeg2000 @ 0x2cf53380] ==1439== Thread 6:
==1439== Invalid read of size 4
==1439==    at 0x9771F0: jpeg2000_mct_write_frame (in ffmpeg_g)
==1439==    by 0x78BA6F: avcodec_default_execute2 (in ffmpeg_g)
==1439==    by 0x97C0BB: jpeg2000_decode_frame (in ffmpeg_g)
==1439==    by 0xA90F72: frame_worker_thread (in ffmpeg_g)
==1439==    by 0x54046DA: start_thread (pthread_create.c:463)
==1439==    by 0xF8F261E: clone (clone.S:95)

I will send you the sample privately

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

"Nothing to hide" only works if the folks in power share the values of
you and everyone you know entirely and always will -- Tom Scott


[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-14 20:26   ` Michael Niedermayer
@ 2022-06-15  9:59     ` Tomas Härdin
  2022-06-15 12:15       ` James Almer
  0 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-15  9:59 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

tis 2022-06-14 klockan 22:26 +0200 skrev Michael Niedermayer:
> On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
> > Left this as an ff_ function for now since it's only used by the j2k
> > code
> > 
> > /Tomas
> 
> >  mem.c |   24 ++++++++++++++++++++++++
> >  mem.h |   55
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 79 insertions(+)
> > 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-Add-
> > ff_fast_recalloc.patch
> > From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17 00:00:00
> > 2001
> > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > Date: Tue, 14 Jun 2022 13:35:18 +0200
> > Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
> 
> You cannot call an ff_* function that's in libavutil from outside
> libavutil;
> this will fail with shared libs as the ff_* stuff is not exported

Ah, I suspected as much. Would there be much opposition to a public
function like this in lavu? I could just keep it local to the j2k code

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations
  2022-06-14 15:23   ` Andreas Rheinhardt
@ 2022-06-15 10:03     ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-15 10:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

tis 2022-06-14 klockan 17:23 +0200 skrev Andreas Rheinhardt:
> Tomas Härdin:
> > 
> > 
> > @@ -2166,12 +2163,13 @@ static int
> > jpeg2000_mct_write_frame(AVCodecContext *avctx, void *td,
> >      return 0;
> >  }
> >  
> > -static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
> > +static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s, int
> > close)
> >  {
> >      int tileno, compno;
> > -    for (tileno = 0; tileno < s->numXtiles * s->numYtiles;
> > tileno++) {
> > +    if (close) {
> > +    for (tileno = 0; tileno < s->tile_size/sizeof(*s->tile);
> > tileno++) {
> >          if (s->tile[tileno].comp) {
> > -            for (compno = 0; compno < s->ncomponents; compno++) {
> > +            for (compno = 0; compno < s-
> > >tile[tileno].comp_size/sizeof(*s->tile[tileno].comp); compno++) {
> >                  Jpeg2000Component *comp     = s-
> > >tile[tileno].comp   + compno;
> >                  Jpeg2000CodingStyle *codsty = s-
> > >tile[tileno].codsty + compno;
> >  
> > @@ -2182,10 +2180,11 @@ static void
> > jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
> >              s->tile[tileno].packed_headers_size = 0;
> >          }
> >      }
> > +    av_freep(&s->tile);
> > +    }
> >      av_freep(&s->packed_headers);
> >      s->packed_headers_size = 0;
> >      memset(&s->packed_headers_stream, 0, sizeof(s-
> > >packed_headers_stream));
> > -    av_freep(&s->tile);
> >      memset(s->codsty, 0, sizeof(s->codsty));
> >      memset(s->qntsty, 0, sizeof(s->qntsty));
> >      memset(s->properties, 0, sizeof(s->properties));
> > @@ -2689,7 +2688,7 @@ static int
> > jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
> >  
> >      avctx->execute2(avctx, jpeg2000_mct_write_frame, picture,
> > NULL, s->numXtiles * s->numYtiles);
> >  
> > -    jpeg2000_dec_cleanup(s);
> > +    jpeg2000_dec_cleanup(s, 0);
> >  
> >      *got_frame = 1;
> >  
> > @@ -2702,7 +2701,7 @@ static int
> > jpeg2000_decode_frame(AVCodecContext *avctx, AVFrame *picture,
> >      return bytestream2_tell(&s->g);
> >  
> >  end:
> > -    jpeg2000_dec_cleanup(s);
> > +    jpeg2000_dec_cleanup(s, 0);
> >      return ret;
> >  }
> >  
> > @@ -2712,6 +2711,7 @@ static av_cold int
> > jpeg2000_decode_close(AVCodecContext *avctx)
> >  
> >      av_freep(&s->idwt);
> >      av_freep(&s->cb);
> > +    jpeg2000_dec_cleanup(s, 1);
> >  
> >      return 0;
> >  }
> 
> Why don't you just move the part of jpeg2000_dec_cleanup() that you
> intend to be only executed in jpeg2000_decode_close() to
> jpeg2000_decode_close()?

I had in mind to do just that but forgot. Will do!

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder()
  2022-06-14 15:04   ` Andreas Rheinhardt
@ 2022-06-15 10:20     ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-15 10:20 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

tis 2022-06-14 klockan 17:04 +0200 skrev Andreas Rheinhardt:
> Tomas Härdin:
> > 
> > 
> 
> Why call it at all? Why not just add a new parameter to
> ff_jpeg2000_init_component that is always set to 1 when called from
> the
> encoder and 0 when called from the decoder?

Oh yeah that's even simpler

> (And is this really a bottleneck?)

Callgrind certainly thinks so. It's called hundreds of thousands of
times per frame. Remember that this is in the serial part of the code
so any savings there get amplified N-fold fps-wise (N = -threads). init_tile()
accounts for a mere 0.8 seconds out of 59.8 partly thanks to this.
Here's a rough breakdown for the curious with -threads 64:

 0.8 everything up to and including init_tiles()
23.0 jpeg2000_read_bitstream_packets()
 0.1 jpeg2000_setup_cbs()
24.1 jpeg2000_decode_cb()
 9.8 jpeg2000_idwt()
 2.0 jpeg2000_mct_write_frame()
 0.0 jpeg2000_dec_cleanup()

jpeg2000_read_bitstream_packets() is obviously the main thing to focus
on for anyone wanting to bump the speed up even more. But it's nasty.
Maybe it could be tile-threaded, but it takes some doing..

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-15  9:59     ` Tomas Härdin
@ 2022-06-15 12:15       ` James Almer
  2022-06-16 12:44         ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: James Almer @ 2022-06-15 12:15 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/15/2022 6:59 AM, Tomas Härdin wrote:
> tis 2022-06-14 klockan 22:26 +0200 skrev Michael Niedermayer:
>> On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
>>> Left this as an ff_ function for now since it's only used by the j2k
>>> code
>>>
>>> /Tomas
>>
>>>   mem.c |   24 ++++++++++++++++++++++++
>>>   mem.h |   55
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>   2 files changed, 79 insertions(+)
>>> 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-Add-
>>> ff_fast_recalloc.patch
>>>  From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17 00:00:00
>>> 2001
>>> From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
>>> Date: Tue, 14 Jun 2022 13:35:18 +0200
>>> Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
>>
>> You cannot call an ff_* function that's in libavutil from outside
>> libavutil;
>> this will fail with shared libs as the ff_* stuff is not exported
> 
> Ah, I suspected as much. Would there be much opposition to a public
> function like this in lavu? I could just keep it local to the j2k code

Just make it public by using the av_ prefix (You in fact added it to 
mem.h, which is installed. You'd need to add it to mem_internal.h if you 
wanted to avoid exposing it).

Don't forget to add an APIChanges entry and minor lavu version bump 
before you push if you do.

> 
> /Tomas
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
  2022-06-14 21:11   ` Michael Niedermayer
@ 2022-06-15 13:11     ` Tomas Härdin
  2022-06-15 21:05       ` Michael Niedermayer
  0 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-15 13:11 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

tis 2022-06-14 klockan 23:11 +0200 skrev Michael Niedermayer:
> On Tue, Jun 14, 2022 at 04:41:14PM +0200, Tomas Härdin wrote:
> > 
> 
> >  jpeg2000dec.c |   30 +++++++++++++++---------------
> >  1 file changed, 15 insertions(+), 15 deletions(-)
> > 6fa2fbf99afee36ee73459863df0527a72663f43  0005-lavc-jpeg2000dec-
> > Thread-init_tile.patch
> > From 080ebdc9bad130098bff575f9ce690b8a522c9f7 Mon Sep 17 00:00:00
> > 2001
> > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > Date: Mon, 13 Jun 2022 15:09:17 +0200
> > Subject: [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
> 
> Causes segfaults
> 
> [jpeg2000 @ 0x2cf53380] End mismatch 149
> [jpeg2000 @ 0x2cf53380] ==1439== Thread 6:
> ==1439== Invalid read of size 4
> ==1439==    at 0x9771F0: jpeg2000_mct_write_frame (in ffmpeg_g)
> ==1439==    by 0x78BA6F: avcodec_default_execute2 (in ffmpeg_g)
> ==1439==    by 0x97C0BB: jpeg2000_decode_frame (in ffmpeg_g)
> ==1439==    by 0xA90F72: frame_worker_thread (in ffmpeg_g)
> ==1439==    by 0x54046DA: start_thread (pthread_create.c:463)
> ==1439==    by 0xF8F261E: clone (clone.S:95)
> 
> I will send you the sample privately

This is because init_tile() fails. I had assumed errors were handled in
some way like longjmp since the function already called execute2() but
it seems the threading doesn't do any kind of magic for this.

Can we have execute2() return some kind of error code when one or more
jobs fail? Either say FFMIN() of all errors or negative jobnr that
failed? This would save on having to allocate an array for errors when
we don't really care which exact jobs failed..

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
  2022-06-15 13:11     ` Tomas Härdin
@ 2022-06-15 21:05       ` Michael Niedermayer
  2022-06-16  9:28         ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Michael Niedermayer @ 2022-06-15 21:05 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 2177 bytes --]

On Wed, Jun 15, 2022 at 03:11:34PM +0200, Tomas Härdin wrote:
> tis 2022-06-14 klockan 23:11 +0200 skrev Michael Niedermayer:
> > On Tue, Jun 14, 2022 at 04:41:14PM +0200, Tomas Härdin wrote:
> > > 
> > 
> > >  jpeg2000dec.c |   30 +++++++++++++++---------------
> > >  1 file changed, 15 insertions(+), 15 deletions(-)
> > > 6fa2fbf99afee36ee73459863df0527a72663f43  0005-lavc-jpeg2000dec-
> > > Thread-init_tile.patch
> > > From 080ebdc9bad130098bff575f9ce690b8a522c9f7 Mon Sep 17 00:00:00
> > > 2001
> > > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > > Date: Mon, 13 Jun 2022 15:09:17 +0200
> > > Subject: [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
> > 
> > Causes segfaults
> > 
> > [jpeg2000 @ 0x2cf53380] End mismatch 149
> > [jpeg2000 @ 0x2cf53380] ==1439== Thread 6:
> > ==1439== Invalid read of size 4
> > ==1439==    at 0x9771F0: jpeg2000_mct_write_frame (in ffmpeg_g)
> > ==1439==    by 0x78BA6F: avcodec_default_execute2 (in ffmpeg_g)
> > ==1439==    by 0x97C0BB: jpeg2000_decode_frame (in ffmpeg_g)
> > ==1439==    by 0xA90F72: frame_worker_thread (in ffmpeg_g)
> > ==1439==    by 0x54046DA: start_thread (pthread_create.c:463)
> > ==1439==    by 0xF8F261E: clone (clone.S:95)
> > 
> > I will send you the sample privately
> 
> This is because init_tile() fails. I had assumed errors were handled in
> some way like longjmp since the function already called execute2() but
> it seems the threading doesn't do any kind of magic for this.
> 
> Can we have execute2() return some kind of error code when one or more
> jobs fail? Either say FFMIN() of all errors or negative jobnr that
> failed? This would save on having to allocate an array for errors when
> we don't really care which exact jobs failed..

One could return a struct with error code, index and number of failed
ones or something. But then maybe just atomically setting some error flag
and leaving the API unchanged would be fine too.
I am fine with either.

thx


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Republics decline into democracies and democracies degenerate into
despotisms. -- Aristotle

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
  2022-06-15 21:05       ` Michael Niedermayer
@ 2022-06-16  9:28         ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-16  9:28 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

ons 2022-06-15 klockan 23:05 +0200 skrev Michael Niedermayer:
> On Wed, Jun 15, 2022 at 03:11:34PM +0200, Tomas Härdin wrote:
> > tis 2022-06-14 klockan 23:11 +0200 skrev Michael Niedermayer:
> > > On Tue, Jun 14, 2022 at 04:41:14PM +0200, Tomas Härdin wrote:
> > > > 
> > > 
> > > >  jpeg2000dec.c |   30 +++++++++++++++---------------
> > > >  1 file changed, 15 insertions(+), 15 deletions(-)
> > > > 6fa2fbf99afee36ee73459863df0527a72663f43  0005-lavc-
> > > > jpeg2000dec-
> > > > Thread-init_tile.patch
> > > > From 080ebdc9bad130098bff575f9ce690b8a522c9f7 Mon Sep 17
> > > > 00:00:00
> > > > 2001
> > > > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > > > Date: Mon, 13 Jun 2022 15:09:17 +0200
> > > > Subject: [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile()
> > > 
> > > Causes segfaults
> > > 
> > > [jpeg2000 @ 0x2cf53380] End mismatch 149
> > > [jpeg2000 @ 0x2cf53380] ==1439== Thread 6:
> > > ==1439== Invalid read of size 4
> > > ==1439==    at 0x9771F0: jpeg2000_mct_write_frame (in ffmpeg_g)
> > > ==1439==    by 0x78BA6F: avcodec_default_execute2 (in ffmpeg_g)
> > > ==1439==    by 0x97C0BB: jpeg2000_decode_frame (in ffmpeg_g)
> > > ==1439==    by 0xA90F72: frame_worker_thread (in ffmpeg_g)
> > > ==1439==    by 0x54046DA: start_thread (pthread_create.c:463)
> > > ==1439==    by 0xF8F261E: clone (clone.S:95)
> > > 
> > > I will send you the sample privately
> > 
> > This is because init_tile() fails. I had assumed errors were
> > handled in
> > some way like longjmp since the function already called execute2()
> > but
> > it seems the threading doesn't do any kind of magic for this.
> > 
> > Can we have execute2() return some kind of error code when one or
> > more
> > jobs fail? Either say FFMIN() of all errors or negative jobnr that
> > failed? This would save on having to allocate an array for errors
> > when
> > we don't really care which exact jobs failed..
> 
> one could return a struct with error code, index and number of failed
> ones or something. But then maybe just atomically setting some error
> flag
> and leaving the API would be fine too.
> I am fine with either

It currently has the ability to return error codes in an array, but
only hevcdec.c makes use of that and it does so poorly. It just adds up
all return codes. What I propose is returning FFMIN() of all return
codes seen. This allows bailing out on error without having to bother
allocating an array for returns that no one is handling in any sensible
manner anyway.

The API states that execute() and execute2() may return non-zero and
that users should act on that so it seems safe to make it actually do
so in case of error. I think we could also av_log() errors + jobnr

I'm working on something for this at the moment. Will post a separate
patchset for that.

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-15 12:15       ` James Almer
@ 2022-06-16 12:44         ` Tomas Härdin
  2022-06-18 14:57           ` Anton Khirnov
  0 siblings, 1 reply; 31+ messages in thread
From: Tomas Härdin @ 2022-06-16 12:44 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

ons 2022-06-15 klockan 09:15 -0300 skrev James Almer:
> On 6/15/2022 6:59 AM, Tomas Härdin wrote:
> > tis 2022-06-14 klockan 22:26 +0200 skrev Michael Niedermayer:
> > > On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
> > > > Left this as an ff_ function for now since it's only used by the
> > > > j2k
> > > > code
> > > > 
> > > > /Tomas
> > > 
> > > >   mem.c |   24 ++++++++++++++++++++++++
> > > >   mem.h |   55
> > > > +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >   2 files changed, 79 insertions(+)
> > > > 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-Add-
> > > > ff_fast_recalloc.patch
> > > >  From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17
> > > > 00:00:00
> > > > 2001
> > > > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > > > Date: Tue, 14 Jun 2022 13:35:18 +0200
> > > > Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
> > > 
> > > You cannot call a ff_* function that's in libavutil from outside
> > > libavutil
> > > this will fail with shared libs as the ff* stuff is not exported
> > 
> > Ah, I suspected as much. Would there be much opposition to a public
> > function like this in lavu? I could just keep it local to the j2k
> > code
> 
> Just make it public by using the av_ prefix (You in fact added it to 
> mem.h, which is installed. You'd need to add it to mem_internal.h if
> you 
> wanted to avoid exposing it).

Ah I didn't notice it existed. But it doesn't look like the right place
for it either, it's just a bunch of macros in there.

> Don't forget to add an APIChanges entry and minor lavu version bump 
> before you push if you do.

Right. Unless people have objections to the name or something similar.
I kinda want the functions in mem.h to have a better naming system..

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading
  2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
                   ` (11 preceding siblings ...)
  2022-06-14 14:47 ` [FFmpeg-devel] [PATCH 13/13] lavc/jpeg2000dec: Component-level threading of write_frame() Tomas Härdin
@ 2022-06-18 14:50 ` Anton Khirnov
  2022-06-24  8:19   ` Tomas Härdin
  12 siblings, 1 reply; 31+ messages in thread
From: Anton Khirnov @ 2022-06-18 14:50 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Quoting Tomas Härdin (2022-06-14 16:39:00)
> Patch 12 in this series is optional since it's just me getting the
> speed up on a specific machine
> 
> /Tomas
> 
> From 115aa26c343419e81c1b5ba0bfdb1615cbec27e9 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> Date: Fri, 10 Jun 2022 14:10:02 +0200
> Subject: [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading
> 
> Decoding and dequant is now threaded on codeblock level.
> IDWT is threaded on component level.
> MCT and write_frame() remain threaded on tile level.
> 
> This brings lossless 4K J2K with -lowres 2 -thread_type slice -threads 96 on an AMD EPYC 7R32 from 4.8 fps (177% CPU) to 31 fps (1284% CPU).

Any measurable impact on single-threaded or frame-threaded decoding?

-- 
Anton Khirnov
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-16 12:44         ` Tomas Härdin
@ 2022-06-18 14:57           ` Anton Khirnov
  2022-06-21  8:04             ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Anton Khirnov @ 2022-06-18 14:57 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Quoting Tomas Härdin (2022-06-16 14:44:45)
> ons 2022-06-15 klockan 09:15 -0300 skrev James Almer:
> > On 6/15/2022 6:59 AM, Tomas Härdin wrote:
> > > tis 2022-06-14 klockan 22:26 +0200 skrev Michael Niedermayer:
> > > > On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
> > > > > Left this as an ff_ function for now since it's only used by the
> > > > > j2k
> > > > > code
> > > > > 
> > > > > /Tomas
> > > > 
> > > > >   mem.c |   24 ++++++++++++++++++++++++
> > > > >   mem.h |   55
> > > > > +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >   2 files changed, 79 insertions(+)
> > > > > 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-Add-
> > > > > ff_fast_recalloc.patch
> > > > >  From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17
> > > > > 00:00:00
> > > > > 2001
> > > > > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > > > > Date: Tue, 14 Jun 2022 13:35:18 +0200
> > > > > Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
> > > > 
> > > > You cannot call a ff_* function that's in libavutil from outside
> > > > libavutil
> > > > this will fail with shared libs as the ff* stuff is not exported
> > > 
> > > Ah, I suspected as much. Would there be much opposition to a public
> > > function like this in lavu? I could just keep it local to the j2k
> > > code
> > 
> > Just make it public by using the av_ prefix (You in fact added it to 
> > mem.h, which is installed. You'd need to add it to mem_internal.h if
> > you 
> > wanted to avoid exposing it).
> 
> Ah I didn't notice it existed. But it doesn't look like the right place
> for it either, it's just a bunch of macros in there.
> 
> > Don't forget to add an APIChanges entry and minor lavu version bump 
> > before you push if you do.
> 
> Right. Unless people have objections to the name or something similar.
> I kinda want the functions in mem.h to have a better naming system..

Yeah, I really dislike those random-endian av_fast_<do_thing>_maybe()
kinds of names.
Maybe something like av_*alloc_reuse()?

-- 
Anton Khirnov
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4
  2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4 Tomas Härdin
@ 2022-06-18 15:00   ` Anton Khirnov
  2022-06-21  7:57     ` Tomas Härdin
  0 siblings, 1 reply; 31+ messages in thread
From: Anton Khirnov @ 2022-06-18 15:00 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Quoting Tomas Härdin (2022-06-14 16:43:38)
> 
> 
> From 03b806f89453571310dcb14edbd9f51e059b7476 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> Date: Wed, 8 Jun 2022 10:08:15 +0200
> Subject: [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init()
>  using stereotypes for sizes <= 4x4
> 
> ---
>  libavcodec/jpeg2000.c | 35 +++++++++++++++++++++++++++++++++++
>  1 file changed, 35 insertions(+)
> 
> diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
> index 0bec2e187d..b80e68bcba 100644
> --- a/libavcodec/jpeg2000.c
> +++ b/libavcodec/jpeg2000.c
> @@ -51,6 +51,31 @@ static int32_t tag_tree_size(int w, int h)
>      return (int32_t)(res + 1);
>  }
>  
> +#define T(x) (x*sizeof(Jpeg2000TgtNode))
> +
> +static const size_t tt_sizes[16] = {
> +    T(1),T(3),T(6),T(7),T(3),T(5),T(9),T(11),T(6),T(9),T(14),T(17),T(7),T(11),T(17),T(21),
> +};
> +
> +static const Jpeg2000TgtNode tt_stereotypes[16][21] = {
> +    {{-1},},
> +    {{2},{2},{-1},},
> +    {{3},{3},{4},{5},{5},{-1},},
> +    {{4},{4},{5},{5},{6},{6},{-1},},
> +    {{2},{2},{-1},},
> +    {{4},{4},{4},{4},{-1},},
> +    {{6},{6},{7},{6},{6},{7},{8},{8},{-1},},
> +    {{8},{8},{9},{9},{8},{8},{9},{9},{10},{10},{-1},},
> +    {{3},{3},{4},{5},{5},{-1},},
> +    {{6},{6},{6},{6},{7},{7},{8},{8},{-1},},
> +    {{9},{9},{10},{9},{9},{10},{11},{11},{12},{13},{13},{13},{13},{-1},},
> +    {{12},{12},{13},{13},{12},{12},{13},{13},{14},{14},{15},{15},{16},{16},{16},{16},{-1},},
> +    {{4},{4},{5},{5},{6},{6},{-1},},
> +    {{8},{8},{8},{8},{9},{9},{9},{9},{10},{10},{-1},},
> +    {{12},{12},{13},{12},{12},{13},{14},{14},{15},{14},{14},{15},{16},{16},{16},{16},{-1},},
> +    {{16},{16},{17},{17},{16},{16},{17},{17},{18},{18},{19},{19},{18},{18},{19},{19},{20},{20},{20},{20},{-1},},
> +};
> +
>  /* allocate the memory for tag tree */
>  static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size, int w, int h)
>  {
> @@ -59,6 +84,15 @@ static int ff_jpeg2000_tag_tree_init(Jpeg2000TgtNode **old, unsigned int *size,
>      int32_t tt_size, ofs = 0;
>      size_t prod;
>  
> +    if (w <= 4 && h <= 4) {
> +        int idx = w-1 + (h-1)*4;
> +        size_t sz = tt_sizes[idx];
> +        av_fast_malloc(old, size, sz);

Unchecked mallocs are of the beast.

-- 
Anton Khirnov
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4
  2022-06-18 15:00   ` Anton Khirnov
@ 2022-06-21  7:57     ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-21  7:57 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

lör 2022-06-18 klockan 17:00 +0200 skrev Anton Khirnov:
> Quoting Tomas Härdin (2022-06-14 16:43:38)
> > 
> > +    if (w <= 4 && h <= 4) {
> > +        int idx = w-1 + (h-1)*4;
> > +        size_t sz = tt_sizes[idx];
> > +        av_fast_malloc(old, size, sz);
> 
> Unchecked mallocs are of the beast.

Right, it should return AVERROR(ENOMEM) if !*old. Will be fixed in the
updated patchset

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
  2022-06-18 14:57           ` Anton Khirnov
@ 2022-06-21  8:04             ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-21  8:04 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

lör 2022-06-18 klockan 16:57 +0200 skrev Anton Khirnov:
> Quoting Tomas Härdin (2022-06-16 14:44:45)
> > ons 2022-06-15 klockan 09:15 -0300 skrev James Almer:
> > > On 6/15/2022 6:59 AM, Tomas Härdin wrote:
> > > > tis 2022-06-14 klockan 22:26 +0200 skrev Michael Niedermayer:
> > > > > On Tue, Jun 14, 2022 at 04:42:06PM +0200, Tomas Härdin wrote:
> > > > > > Left this as an ff_ function for now since it's only used by
> > > > > > the
> > > > > > j2k
> > > > > > code
> > > > > > 
> > > > > > /Tomas
> > > > > 
> > > > > >   mem.c |   24 ++++++++++++++++++++++++
> > > > > >   mem.h |   55
> > > > > > +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >   2 files changed, 79 insertions(+)
> > > > > > 21be65bd06e3260f9f36598d5d574ee32e7131a6  0006-lavu-mem-
> > > > > > Add-
> > > > > > ff_fast_recalloc.patch
> > > > > >  From 5d36d431ffe4c8ba0f698d0c288ebc16b83f0bbc Mon Sep 17
> > > > > > 00:00:00
> > > > > > 2001
> > > > > > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > > > > > Date: Tue, 14 Jun 2022 13:35:18 +0200
> > > > > > Subject: [PATCH 06/13] lavu/mem: Add ff_fast_recalloc()
> > > > > 
> > > > > You cannot call a ff_* function that's in libavutil from
> > > > > outside
> > > > > libavutil
> > > > > this will fail with shared libs as the ff* stuff is not
> > > > > exported
> > > > 
> > > > Ah, I suspected as much. Would there be much opposition to a
> > > > public
> > > > function like this in lavu? I could just keep it local to the
> > > > j2k
> > > > code
> > > 
> > > Just make it public by using the av_ prefix (You in fact added it
> > > to 
> > > mem.h, which is installed. You'd need to add it to mem_internal.h
> > > if
> > > you 
> > > wanted to avoid exposing it).
> > 
> > Ah I didn't notice it existed. But it doesn't look like the right
> > place
> > for it either, it's just a bunch of macros in there.
> > 
> > > Don't forget to add an APIChanges entry and minor lavu version
> > > bump 
> > > before you push if you do.
> > 
> > Right. Unless people have objections to the name or something
> > similar.
> > I kinda want the functions in mem.h to have a better naming
> > system..
> 
> Yeah, I really dislike those random-endian av_fast_<do_thing>_maybe()
> kinds of names.
> Maybe something like av_*alloc_reuse()?
> 

I see the suffix p on functions that take pointer-to-pointer, like
av_reallocp(). But not on all of them - av_fast_malloc() should've been
called av_fast_mallocp() imo

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading
  2022-06-18 14:50 ` [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Anton Khirnov
@ 2022-06-24  8:19   ` Tomas Härdin
  0 siblings, 0 replies; 31+ messages in thread
From: Tomas Härdin @ 2022-06-24  8:19 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

lör 2022-06-18 klockan 16:50 +0200 skrev Anton Khirnov:
> Quoting Tomas Härdin (2022-06-14 16:39:00)
> > Patch 12 in this series is optional since it's just me getting the
> > speed up on a specific machine
> > 
> > /Tomas
> > 
> > From 115aa26c343419e81c1b5ba0bfdb1615cbec27e9 Mon Sep 17 00:00:00
> > 2001
> > From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
> > Date: Fri, 10 Jun 2022 14:10:02 +0200
> > Subject: [PATCH 01/13] lavc/jpeg2000dec: Finer granularity
> > threading
> > 
> > Decoding and dequant is now threaded on codeblock level.
> > IDWT is threaded on component level.
> > MCT and write_frame() remain threaded on tile level.
> > 
> > This brings lossless 4K J2K with -lowres 2 -thread_type slice
> > -threads 96 on an AMD EPYC 7R32 from 4.8 fps (177% CPU) to 31 fps
> > (1284% CPU).
> 
> Any measurable impact on single-threaded or frame-threaded decoding?
> 

median of 11 runs with -threads 1 -vframes 100 on a 4K file
before: real    0m38,664s
 after: real    0m39,139s

I have in mind to try and roll together the last step in the IDWT code
with the av_clip() in write_frame() which should improve run time in
all cases.

/Tomas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2022-06-24  8:19 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-14 14:39 [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Tomas Härdin
2022-06-14 14:39 ` [FFmpeg-devel] [PATCH 02/13] lavc/jpeg2000dec: Reindent Tomas Härdin
2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 03/13] lavc/jpeg2000dwt: Implement sliced transforms Tomas Härdin
2022-06-14 14:40 ` [FFmpeg-devel] [PATCH 04/13] lavc/jpeg2000dec: Implement IDWT slicing Tomas Härdin
2022-06-14 14:41 ` [FFmpeg-devel] [PATCH 05/13] lavc/jpeg2000dec: Thread init_tile() Tomas Härdin
2022-06-14 21:11   ` Michael Niedermayer
2022-06-15 13:11     ` Tomas Härdin
2022-06-15 21:05       ` Michael Niedermayer
2022-06-16  9:28         ` Tomas Härdin
2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 06/13] lavu/mem: Add ff_fast_recalloc() Tomas Härdin
2022-06-14 20:26   ` Michael Niedermayer
2022-06-15  9:59     ` Tomas Härdin
2022-06-15 12:15       ` James Almer
2022-06-16 12:44         ` Tomas Härdin
2022-06-18 14:57           ` Anton Khirnov
2022-06-21  8:04             ` Tomas Härdin
2022-06-14 14:42 ` [FFmpeg-devel] [PATCH 07/13] lavc/jpeg2000*: Use ff_fast_recalloc() to eliminate lots of allocations Tomas Härdin
2022-06-14 15:23   ` Andreas Rheinhardt
2022-06-15 10:03     ` Tomas Härdin
2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 08/13] lavc/jpeg2000: Switch Jpeg2000TgtNode to int32_t parent Tomas Härdin
2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 09/13] lavc/jpeg2000: Speed up ff_jpeg2000_tag_tree_init() using stereotypes for sizes <= 4x4 Tomas Härdin
2022-06-18 15:00   ` Anton Khirnov
2022-06-21  7:57     ` Tomas Härdin
2022-06-14 14:43 ` [FFmpeg-devel] [PATCH 10/13] lavc/jpeg2000: Reindent Tomas Härdin
2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 11/13] lavc/jpeg2000: Minimize calls to av_codec_is_encoder() Tomas Härdin
2022-06-14 15:04   ` Andreas Rheinhardt
2022-06-15 10:20     ` Tomas Härdin
2022-06-14 14:44 ` [FFmpeg-devel] [PATCH 12/13] lavc/jpeg2000dec: Use coarser slicing for initial reslevels Tomas Härdin
2022-06-14 14:47 ` [FFmpeg-devel] [PATCH 13/13] lavc/jpeg2000dec: Component-level threading of write_frame() Tomas Härdin
2022-06-18 14:50 ` [FFmpeg-devel] [PATCH 01/13] lavc/jpeg2000dec: Finer granularity threading Anton Khirnov
2022-06-24  8:19   ` Tomas Härdin

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git