From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by master.gitmailbox.com (Postfix) with ESMTPS id 1BBAE4C79A for ; Mon, 10 Mar 2025 03:11:32 +0000 (UTC) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id AA1DA68E84E; Mon, 10 Mar 2025 05:09:18 +0200 (EET) Received: from vidala.pars.ee (vidala.pars.ee [116.203.72.101]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 5F1B168E837 for ; Mon, 10 Mar 2025 05:09:15 +0200 (EET) DKIM-Signature: v=1; a=rsa-sha256; s=202405r; d=lynne.ee; c=relaxed/relaxed; h=Message-ID:Date:Subject:To:From; t=1741576154; bh=YeCGGWZ+MzVdBDgkqDBgIjB PvCPV5eJCHLiAQDK7TgE=; b=Qn0W8UxLrDquGSfDZadbQ54RdRndAkhh6qlAP1s4dJCrrwJZiP 1ZBT/ZsUmSq/zl4Bzy5d3GbGSXz/y6wptACsTybiiAqOmKR88pDN6Gwvzm2jPY8E7JExWLDyxVx xcBXGXpWmhP52dyRNPlxz54nm9C9WPvtL3iuy1/8bcKsdauiEGaVR+CCPzOyKjSj2STH155eYnh 9nbUoLomtMK9WG6w5V45WZKs4k/LEjDR7ncWeJHFX59AvfA9eJOXkREr3AMB9avCrEol0dBdEmI 7n0u0bvyHXyL+TYijfzw2/xWszmIKtbyjNNTxLb3uYrh/8wyFybTvwyIPZik92Ws73w==; DKIM-Signature: v=1; a=ed25519-sha256; s=202405e; d=lynne.ee; c=relaxed/relaxed; h=Message-ID:Date:Subject:To:From; t=1741576154; bh=YeCGGWZ+MzVdBDgkqDBgIjB PvCPV5eJCHLiAQDK7TgE=; b=k2cKWuSwlyEOp5O0QcMQPHNBYlP38KYI6Mv7FW69tF0XRcJsrM dUgaoQdX32Illf9urJYjTxZfKz/Y19UIhoDA==; From: Lynne To: ffmpeg-devel@ffmpeg.org Date: Mon, 10 Mar 2025 04:08:59 +0100 Message-ID: <20250310030912.60902-3-dev@lynne.ee> X-Mailer: git-send-email 2.47.2 In-Reply-To: <20250310030912.60902-1-dev@lynne.ee> References: <20250310030837.60814-1-dev@lynne.ee> <20250310030912.60902-1-dev@lynne.ee> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 13/13] ffv1: add a Vulkan-based decoder X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and 
patches Cc: Lynne Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Archived-At: List-Archive: List-Post: This patch adds a fully-featured level 3 and 4 decoder for FFv1, supporting Golomb and all Range coding variants, all pixel formats, and all features, except for the newly added floating-point formats. On a 6000 Ada, for 3840x2160 bgr0 content at 50Mbps (standard desktop recording), it is able to do 400fps. An Alder Lake with 24 threads can barely do 100fps. --- configure | 2 + libavcodec/Makefile | 1 + libavcodec/ffv1dec.c | 6 + libavcodec/hwaccels.h | 1 + libavcodec/vulkan/Makefile | 6 + libavcodec/vulkan/common.comp | 95 ++ libavcodec/vulkan/ffv1_common.comp | 5 + libavcodec/vulkan/ffv1_dec.comp | 303 ++++++ libavcodec/vulkan/ffv1_dec_rct.comp | 72 ++ libavcodec/vulkan/ffv1_dec_setup.comp | 138 +++ libavcodec/vulkan/ffv1_rct.comp | 90 ++ libavcodec/vulkan/ffv1_vlc.comp | 37 + libavcodec/vulkan/rangecoder.comp | 74 ++ libavcodec/vulkan_decode.c | 17 + libavcodec/vulkan_ffv1.c | 1292 +++++++++++++++++++++++++ 15 files changed, 2139 insertions(+) create mode 100644 libavcodec/vulkan/ffv1_dec.comp create mode 100644 libavcodec/vulkan/ffv1_dec_rct.comp create mode 100644 libavcodec/vulkan/ffv1_dec_setup.comp create mode 100644 libavcodec/vulkan/ffv1_rct.comp create mode 100644 libavcodec/vulkan_ffv1.c diff --git a/configure b/configure index 04b83a8868..fbee82f920 100755 --- a/configure +++ b/configure @@ -3195,6 +3195,8 @@ av1_videotoolbox_hwaccel_deps="videotoolbox" av1_videotoolbox_hwaccel_select="av1_decoder" av1_vulkan_hwaccel_deps="vulkan" av1_vulkan_hwaccel_select="av1_decoder" +ffv1_vulkan_hwaccel_deps="vulkan spirv_compiler" +ffv1_vulkan_hwaccel_select="ffv1_decoder" h263_vaapi_hwaccel_deps="vaapi" h263_vaapi_hwaccel_select="h263_decoder" h263_videotoolbox_hwaccel_deps="videotoolbox" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 
74de7737f9..eb91cbb5ce 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1017,6 +1017,7 @@ OBJS-$(CONFIG_AV1_VAAPI_HWACCEL) += vaapi_av1.o OBJS-$(CONFIG_AV1_VDPAU_HWACCEL) += vdpau_av1.o OBJS-$(CONFIG_AV1_VIDEOTOOLBOX_HWACCEL) += videotoolbox_av1.o OBJS-$(CONFIG_AV1_VULKAN_HWACCEL) += vulkan_decode.o vulkan_av1.o +OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan_decode.o ffv1_vulkan.o vulkan_ffv1.o OBJS-$(CONFIG_H263_VAAPI_HWACCEL) += vaapi_mpeg4.o OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c index 6396f22f79..7b0740ad37 100644 --- a/libavcodec/ffv1dec.c +++ b/libavcodec/ffv1dec.c @@ -349,6 +349,9 @@ static int decode_slice(AVCodecContext *c, void *arg) static enum AVPixelFormat get_pixel_format(FFV1Context *f) { enum AVPixelFormat pix_fmts[] = { +#if CONFIG_FFV1_VULKAN_HWACCEL + AV_PIX_FMT_VULKAN, +#endif f->pix_fmt, AV_PIX_FMT_NONE, }; @@ -862,6 +865,9 @@ const FFCodec ff_ffv1_decoder = { .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_USES_PROGRESSFRAMES, .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_FFV1_VULKAN_HWACCEL + HWACCEL_VULKAN(ffv1), +#endif NULL }, }; diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index 910a024032..0b2c725247 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -28,6 +28,7 @@ extern const struct FFHWAccel ff_av1_vaapi_hwaccel; extern const struct FFHWAccel ff_av1_vdpau_hwaccel; extern const struct FFHWAccel ff_av1_videotoolbox_hwaccel; extern const struct FFHWAccel ff_av1_vulkan_hwaccel; +extern const struct FFHWAccel ff_ffv1_vulkan_hwaccel; extern const struct FFHWAccel ff_h263_vaapi_hwaccel; extern const struct FFHWAccel ff_h263_videotoolbox_hwaccel; extern const struct FFHWAccel ff_h264_d3d11va_hwaccel; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index 351332ee44..e6bad486bd 100644 --- a/libavcodec/vulkan/Makefile +++ 
b/libavcodec/vulkan/Makefile @@ -11,6 +11,12 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \ vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \ vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o +OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ + vulkan/rangecoder.o vulkan/ffv1_vlc.o \ + vulkan/ffv1_common.o vulkan/ffv1_reset.o \ + vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o \ + vulkan/ffv1_dec_rct.o + VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp)) .SECONDARY: $(VULKAN:.comp=.c) libavcodec/vulkan/%.c: TAG = VULKAN diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp index e4e983b3e2..b0adf8590e 100644 --- a/libavcodec/vulkan/common.comp +++ b/libavcodec/vulkan/common.comp @@ -26,6 +26,10 @@ layout(buffer_reference, buffer_reference_align = 1) buffer u8vec2buf { u8vec2 v; }; +layout(buffer_reference, buffer_reference_align = 1) buffer u8vec4buf { + u8vec4 v; +}; + layout(buffer_reference, buffer_reference_align = 2) buffer u16buf { uint16_t v; }; @@ -182,3 +186,94 @@ uint32_t put_bytes_count(in PutBitContext pb) uint64_t num_bytes = (pb.buf - pb.buf_start) + ((BUF_BITS - pb.bit_left) >> 3); return uint32_t(num_bytes); } + +struct GetBitContext { + uint64_t buf_start; + uint64_t buf; + uint64_t buf_end; + + uint64_t bits; + uint bits_valid; + uint size_in_bits; +}; + +#define LOAD64() \ + { \ + u8vec4buf ptr = u8vec4buf(gb.buf); \ + uint32_t rf1 = pack32((ptr[0].v).wzyx); \ + uint32_t rf2 = pack32((ptr[1].v).wzyx); \ + gb.buf += 8; \ + gb.bits = uint64_t(rf1) << 32 | uint64_t(rf2); \ + gb.bits_valid = 64; \ + } + +#define RELOAD32() \ + { \ + u8vec4buf ptr = u8vec4buf(gb.buf); \ + uint32_t rf = pack32((ptr[0].v).wzyx); \ + gb.buf += 4; \ + gb.bits = uint64_t(rf) << (32 - gb.bits_valid) | gb.bits; \ + gb.bits_valid += 32; \ + } + +void init_get_bits(inout GetBitContext gb, u8buf data, uint64_t len) +{ + gb.buf = gb.buf_start = uint64_t(data); + gb.buf_end = uint64_t(data) + len; + gb.size_in_bits = uint(len) * 8; 
+ + /* Preload */ + LOAD64() +} + +bool get_bit(inout GetBitContext gb) +{ + if (gb.bits_valid == 0) + LOAD64() + + bool val = bool(gb.bits >> (64 - 1)); + gb.bits <<= 1; + gb.bits_valid--; + return val; +} + +uint get_bits(inout GetBitContext gb, uint n) +{ + if (n == 0) + return 0; + + if (n > gb.bits_valid) + RELOAD32() + + uint val = uint(gb.bits >> (64 - n)); + gb.bits <<= n; + gb.bits_valid -= n; + return val; +} + +uint show_bits(inout GetBitContext gb, uint n) +{ + if (n > gb.bits_valid) + RELOAD32() + + return uint(gb.bits >> (64 - n)); +} + +void skip_bits(inout GetBitContext gb, uint n) +{ + if (n > gb.bits_valid) + RELOAD32() + + gb.bits <<= n; + gb.bits_valid -= n; +} + +uint tell_bits(in GetBitContext gb) +{ + return uint(gb.buf - gb.buf_start) * 8 - gb.bits_valid; +} + +uint left_bits(in GetBitContext gb) +{ + return gb.size_in_bits - uint(gb.buf - gb.buf_start) * 8 + gb.bits_valid; +} diff --git a/libavcodec/vulkan/ffv1_common.comp b/libavcodec/vulkan/ffv1_common.comp index 604d03b2de..d2bd7e736e 100644 --- a/libavcodec/vulkan/ffv1_common.comp +++ b/libavcodec/vulkan/ffv1_common.comp @@ -22,7 +22,12 @@ struct SliceContext { RangeCoder c; + +#if !defined(DECODE) PutBitContext pb; /* 8*8 bytes */ +#else + GetBitContext gb; +#endif ivec2 slice_dim; ivec2 slice_pos; diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp new file mode 100644 index 0000000000..a9feb9d318 --- /dev/null +++ b/libavcodec/vulkan/ffv1_dec.comp @@ -0,0 +1,303 @@ +/* + * FFv1 codec + * + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +ivec2 get_pred(ivec2 pos, ivec2 off, int p, int comp, int sw, + uint8_t context_model) +{ + const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0); + const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0); + + TYPE top2 = TYPE(0); + if (off.y > 1) + top2 = TYPE(imageLoad(dst[p], pos + ivec2(0, -2))[comp]); + + VTYPE3 top = VTYPE3(TYPE(0), + TYPE(0), + TYPE(0)); + if (off.y > 0 && off != ivec2(0, 1)) + top[0] = TYPE(imageLoad(dst[p], pos + ivec2(-1, -1) + yoff_border1)[comp]); + if (off.y > 0) { + top[1] = TYPE(imageLoad(dst[p], pos + ivec2(0, -1))[comp]); + top[2] = TYPE(imageLoad(dst[p], pos + ivec2(min(1, sw - off.x - 1), -1))[comp]); + } + + VTYPE2 cur = VTYPE2(TYPE(0), + TYPE(0)); + if (off.x > 0 && off != ivec2(1, 0)) + cur[0] = TYPE(imageLoad(dst[p], pos + ivec2(-2, 0) + yoff_border2)[comp]); + if (off != ivec2(0, 0)) + cur[1] = TYPE(imageLoad(dst[p], pos + ivec2(-1, 0) + yoff_border1)[comp]); + + /* context, prediction */ + return ivec2(get_context(cur, top, top2, context_model), + predict(cur[1], VTYPE2(top))); +} + +void store_comp(ivec2 pos, int p, int comp, uint v) +{ +#ifdef RGB + uvec4 pix = imageLoad(dst[p], pos); + pix[comp] = v; + imageStore(dst[p], pos, pix); +#else + imageStore(dst[p], pos, uvec4(v)); +#endif +} + +#ifndef GOLOMB +int get_isymbol(inout RangeCoder c, uint64_t state) +{ + if (get_rac(c, state)) + return 0; + + state += 1; + + int e = 0; + while (get_rac(c, state + min(e, 9))) { // 1..10 + e++; + if (e > 31) { + corrupt = true; + return 
0; + } + } + + state += 21; + + int a = 1; + for (int i = e - 1; i >= 0; i--) + a += a + int(get_rac(c, state + min(i, 9))); // 22..31 + + e = -int(get_rac(c, state - 11 + min(e, 10))); // 11..21 sign + return (a ^ e) - e; +} + +void decode_line_pcm(inout SliceContext sc, int y, int p, int comp, + int bits) +{ + ivec2 sp = sc.slice_pos; + int w = sc.slice_dim.x; + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + for (int x = 0; x < w; x++) { + uint v = 0; + for (int i = (bits - 1); i >= 0; i--) + v |= uint(get_rac_equi(sc.c)) << i; + + store_comp(sp + ivec2(x, y), p, comp, v); + } +} + +void decode_line(inout SliceContext sc, uint64_t state, + int y, int p, int comp, int bits, const int run_index) +{ + ivec2 sp = sc.slice_pos; + int w = sc.slice_dim.x; + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + for (int x = 0; x < w; x++) { + ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, comp, w, + sc.quant_table_idx[p]); + + int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0])); + if (pr[0] < 0) + diff = -diff; + + uint v = zero_extend(pr[1] + diff, bits); + store_comp(sp + ivec2(x, y), p, comp, v); + } +} + +#else /* GOLOMB */ + +void decode_line(inout SliceContext sc, uint64_t state, + int y, int p, int comp, int bits, inout int run_index) +{ + ivec2 sp = sc.slice_pos; + int w = sc.slice_dim.x; + +#ifndef RGB + if (p > 0 && p < 3) { + w >>= chroma_shift.x; + sp >>= chroma_shift; + } +#endif + + int run_count = 0; + int run_mode = 0; + + for (int x = 0; x < w; x++) { + ivec2 pos = sp + ivec2(x, y); + int diff; + ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, comp, w, + sc.quant_table_idx[p]); + + VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0])); + + if (pr[0] == 0 && run_mode == 0) + run_mode = 1; + + if (run_mode != 0) { + if (run_count == 0 && run_mode == 1) { + int tmp_idx = int(log2_run[run_index]); + if (get_bit(sc.gb)) { + run_count = 
1 << tmp_idx; + if (x + run_count <= w) + run_index++; + } else { + if (tmp_idx != 0) { + run_count = int(get_bits(sc.gb, tmp_idx)); + } else + run_count = 0; + + if (run_index != 0) + run_index--; + run_mode = 2; + } + } + + run_count--; + if (run_count < 0) { + run_mode = 0; + run_count = 0; + diff = read_vlc_symbol(sc.gb, sb, bits); + if (diff >= 0) + diff++; + } else { + diff = 0; + } + } else { + diff = read_vlc_symbol(sc.gb, sb, bits); + } + + if (pr[0] < 0) + diff = -diff; + + uint v = zero_extend(pr[1] + diff, bits); + store_comp(sp + ivec2(x, y), p, comp, v); + } +} +#endif + +void decode_slice(inout SliceContext sc, const uint slice_idx) +{ + int run_index = 0; + +#ifndef RGB + int bits = bits_per_raw_sample; +#else + int bits = 9; + if (bits != 8 || sc.slice_coding_mode != 0) + bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1); +#endif + + /* PCM coding */ +#ifndef GOLOMB + if (sc.slice_coding_mode == 1) { +#ifndef RGB + for (int p = 0; p < planes; p++) { + int h = sc.slice_dim.y; + if (p > 0 && p < 3) + h >>= chroma_shift.y; + + for (int y = 0; y < h; y++) + decode_line_pcm(sc, y, p, 0, bits); + } +#else + if (transparency == 1) { + for (int y = 0; y < sc.slice_dim.y; y++) { + decode_line_pcm(sc, y, 0, 1, bits); + decode_line_pcm(sc, y, 0, 2, bits); + decode_line_pcm(sc, y, 0, 0, bits); + decode_line_pcm(sc, y, 0, 3, bits); + } + } else { + for (int y = 0; y < sc.slice_dim.y; y++) { + decode_line_pcm(sc, y, 0, 1, bits); + decode_line_pcm(sc, y, 0, 2, bits); + decode_line_pcm(sc, y, 0, 0, bits); + } + } +#endif + } else + + /* Arithmetic coding */ +#endif + { + uint64_t slice_state_off = uint64_t(slice_state) + + slice_idx*plane_state_size*codec_planes; + +#ifndef RGB + for (int p = 0; p < planes; p++) { + int h = sc.slice_dim.y; + if (p > 0 && p < 3) + h >>= chroma_shift.y; + + for (int y = 0; y < h; y++) + decode_line(sc, slice_state_off, y, p, 0, bits, run_index); + + /* For the second chroma plane, reuse the first plane's state */ + if (p 
!= 1) + slice_state_off += plane_state_size; + } +#else + if (transparency == 1) { + for (int y = 0; y < sc.slice_dim.y; y++) { + decode_line(sc, slice_state_off + plane_state_size*0, + y, 0, 1, bits, run_index); + decode_line(sc, slice_state_off + plane_state_size*1, + y, 0, 2, bits, run_index); + decode_line(sc, slice_state_off + plane_state_size*1, + y, 0, 0, bits, run_index); + decode_line(sc, slice_state_off + plane_state_size*2, + y, 0, 3, bits, run_index); + } + } else { + for (int y = 0; y < sc.slice_dim.y; y++) { + decode_line(sc, slice_state_off + plane_state_size*0, + y, 0, 1, bits, run_index); + decode_line(sc, slice_state_off + plane_state_size*1, + y, 0, 2, bits, run_index); + decode_line(sc, slice_state_off + plane_state_size*1, + y, 0, 0, bits, run_index); + } + } +#endif + } +} + +void main(void) +{ + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + decode_slice(slice_ctx[slice_idx], slice_idx); +} diff --git a/libavcodec/vulkan/ffv1_dec_rct.comp b/libavcodec/vulkan/ffv1_dec_rct.comp new file mode 100644 index 0000000000..0305dc3295 --- /dev/null +++ b/libavcodec/vulkan/ffv1_dec_rct.comp @@ -0,0 +1,72 @@ +/* + * FFv1 codec + * + * Copyright (c) 2025 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +void bypass_block(in SliceContext sc) +{ + ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; + ivec2 end = sc.slice_pos + sc.slice_dim; + + for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) + for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) + imageStore(dst[0], ivec2(x, y), ivec4(imageLoad(src[0], ivec2(x, y)))); +} + +void transform_sample(ivec2 pos, ivec2 rct_coef) +{ + ivec4 pix = ivec4(imageLoad(src[0], pos)); + + pix.b -= offset; + pix.r -= offset; + pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2; + pix.b += pix.g; + pix.r += pix.g; + + pix = ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]], + pix[fmt_lut[2]], pix[fmt_lut[3]]); + + imageStore(dst[0], pos, pix); + if (planar_rgb != 0) { + for (int i = 1; i < (3 + transparency); i++) + imageStore(dst[i], pos, ivec4(pix[i])); + } +} + +void transform_block(in SliceContext sc) +{ + const ivec2 rct_coef = sc.slice_rct_coef; + const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; + const ivec2 end = sc.slice_pos + sc.slice_dim; + + for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) + for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) + transform_sample(ivec2(x, y), rct_coef); +} + +void main() +{ + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + + if (slice_ctx[slice_idx].slice_coding_mode == 1) + bypass_block(slice_ctx[slice_idx]); + else + transform_block(slice_ctx[slice_idx]); +} diff --git a/libavcodec/vulkan/ffv1_dec_setup.comp b/libavcodec/vulkan/ffv1_dec_setup.comp new file mode 100644 index 0000000000..a10163a8d6 --- /dev/null +++ b/libavcodec/vulkan/ffv1_dec_setup.comp @@ -0,0 +1,138 @@ +/* + * FFv1 codec + * + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +uint get_usymbol(inout RangeCoder c, uint64_t state) +{ + if (get_rac(c, state + 0)) + return 0; + + int e = 0; + while (get_rac(c, state + 1 + min(e, 9))) { // 1..10 + e++; + if (e > 31) { + corrupt = true; + return 0; + } + } + + uint a = 1; + for (int i = e - 1; i >= 0; i--) + a += a + uint(get_rac(c, state + 22 + min(i, 9))); // 22..31 + + return a; +} + +bool decode_slice_header(inout SliceContext sc, uint64_t state) +{ + u8buf sb = u8buf(state); + + [[unroll]] + for (int i = 0; i < CONTEXT_SIZE; i++) + sb[i].v = uint8_t(128); + + uint sx = get_usymbol(sc.c, state); + uint sy = get_usymbol(sc.c, state); + uint sw = get_usymbol(sc.c, state) + 1; + uint sh = get_usymbol(sc.c, state) + 1; + + if (sx < 0 || sy < 0 || sw <= 0 || sh <= 0 || + sx > (gl_NumWorkGroups.x - sw) || sy > (gl_NumWorkGroups.y - sh) || + corrupt) { + return true; + } + + /* Set coordinates */ + uint sxs = slice_coord(img_size.x, sx , gl_NumWorkGroups.x, chroma_shift.x); + uint sxe = slice_coord(img_size.x, sx + sw, gl_NumWorkGroups.x, chroma_shift.x); + uint sys = slice_coord(img_size.y, sy , gl_NumWorkGroups.y, chroma_shift.y); + uint sye = slice_coord(img_size.y, sy + sh, gl_NumWorkGroups.y, chroma_shift.y); + + sc.slice_pos = ivec2(sxs, sys); + sc.slice_dim = 
ivec2(sxe - sxs, sye - sys); + sc.slice_rct_coef = ivec2(1, 1); + sc.slice_coding_mode = int(0); + + for (uint i = 0; i < codec_planes; i++) { + uint idx = get_usymbol(sc.c, state); + if (idx >= quant_table_count) + return true; + sc.quant_table_idx[i] = uint8_t(idx); + sc.context_count = context_count[idx]; + } + + get_usymbol(sc.c, state); + get_usymbol(sc.c, state); + get_usymbol(sc.c, state); + + if (version >= 4) { + sc.slice_reset_contexts = get_rac(sc.c, state); + sc.slice_coding_mode = get_usymbol(sc.c, state); + if (sc.slice_coding_mode != 1 && colorspace == 1) { + sc.slice_rct_coef.x = int(get_usymbol(sc.c, state)); + sc.slice_rct_coef.y = int(get_usymbol(sc.c, state)); + if (sc.slice_rct_coef.x + sc.slice_rct_coef.y > 4) + return true; + } + } + + return false; +} + +void golomb_init(inout SliceContext sc, uint64_t state) +{ + if (version == 3 && micro_version > 1 || version > 3) { + u8buf(state).v = uint8_t(129); + get_rac(sc.c, state); + } + + uint64_t ac_byte_count = sc.c.bytestream - sc.c.bytestream_start - 1; + init_get_bits(sc.gb, u8buf(sc.c.bytestream_start + ac_byte_count), + sc.c.bytestream_end - sc.c.bytestream_start - ac_byte_count); +} + +void main(void) +{ + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + uint64_t scratch_state = uint64_t(scratch_data) + slice_idx*CONTEXT_SIZE; + + u8buf bs = u8buf(slice_data + slice_offsets[2*slice_idx + 0]); + uint32_t slice_size = slice_offsets[2*slice_idx + 1]; + + rac_init_dec(slice_ctx[slice_idx].c, + bs, slice_size); + + if (slice_idx == (gl_NumWorkGroups.x*gl_NumWorkGroups.y - 1)) + get_rac_equi(slice_ctx[slice_idx].c); + + decode_slice_header(slice_ctx[slice_idx], scratch_state); + + if (golomb == 1) + golomb_init(slice_ctx[slice_idx], scratch_state); + + if (ec != 0 && check_crc != 0) { + uint32_t crc = crcref; + for (int i = 0; i < slice_size; i++) + crc = crc_ieee[(crc & 0xFF) ^ uint32_t(bs[i].v)] ^ (crc >> 8); + + slice_crc_mismatch[slice_idx] = crc; + } +} diff 
--git a/libavcodec/vulkan/ffv1_rct.comp b/libavcodec/vulkan/ffv1_rct.comp new file mode 100644 index 0000000000..b10bb47132 --- /dev/null +++ b/libavcodec/vulkan/ffv1_rct.comp @@ -0,0 +1,90 @@ +/* + * FFv1 codec + * + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +ivec4 load_components(ivec2 pos) +{ + ivec4 pix = ivec4(imageLoad(src[0], pos)); + if (planar_rgb != 0) { + for (int i = 1; i < (3 + transparency); i++) + pix[i] = int(imageLoad(src[i], pos)[0]); + } + + return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]], + pix[fmt_lut[2]], pix[fmt_lut[3]]); +} + +void bypass_sample(ivec2 pos) +{ + imageStore(dst[0], pos, load_components(pos)); +} + +void bypass_block(in SliceContext sc) +{ + ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; + ivec2 end = sc.slice_pos + sc.slice_dim; + for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) + for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) + bypass_sample(ivec2(x, y)); +} + +void transform_sample(ivec2 pos, ivec2 rct_coef) +{ + ivec4 pix = load_components(pos); + pix.b -= offset; + pix.r -= offset; + pix.g -= (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2; + pix.b += pix.g; + pix.r += pix.g; + imageStore(dst[0], pos, pix); +} + +void transform_sample(ivec2 pos, 
ivec2 rct_coef) +{ + ivec4 pix = load_components(pos); + pix.b -= pix.g; + pix.r -= pix.g; + pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2; + pix.b += offset; + pix.r += offset; + imageStore(dst[0], pos, pix); +} + +void transform_block(in SliceContext sc) +{ + const ivec2 rct_coef = sc.slice_rct_coef; + const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; + const ivec2 end = sc.slice_pos + sc.slice_dim; + + for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) + for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) + transform_sample(ivec2(x, y), rct_coef); +} + +void main() +{ + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + + if (slice_ctx[slice_idx].slice_coding_mode == 1) + bypass_block(slice_ctx[slice_idx]); + else + transform_block(slice_ctx[slice_idx]); +} diff --git a/libavcodec/vulkan/ffv1_vlc.comp b/libavcodec/vulkan/ffv1_vlc.comp index 0a53e035b5..d374e5a069 100644 --- a/libavcodec/vulkan/ffv1_vlc.comp +++ b/libavcodec/vulkan/ffv1_vlc.comp @@ -120,3 +120,40 @@ Symbol get_vlc_symbol(inout VlcState state, int v, int bits) return set_sr_golomb(code, k, 12, bits); } + +uint get_ur_golomb(inout GetBitContext gb, uint k, int limit, int esc_len) +{ + for (uint i = 0; i < 12; i++) + if (get_bit(gb)) + return get_bits(gb, k) + (i << k); + + return get_bits(gb, esc_len) + 11; +} + +int get_sr_golomb(inout GetBitContext gb, uint k, int limit, int esc_len) +{ + int v = int(get_ur_golomb(gb, k, limit, esc_len)); + return (v >> 1) ^ -(v & 1); +} + +int read_vlc_symbol(inout GetBitContext gb, inout VlcState state, int bits) +{ + int k, i, v, ret; + + i = state.count; + k = 0; + while (i < state.error_sum) { // FIXME: optimize + k++; + i += i; + } + + v = get_sr_golomb(gb, k, 12, bits); + + v ^= ((2 * state.drift + state.count) >> 31); + + ret = fold(v + state.bias, bits); + + update_vlc_state(state, v); + + return ret; +} diff --git a/libavcodec/vulkan/rangecoder.comp b/libavcodec/vulkan/rangecoder.comp 
index 6e3b9c1238..8c8d0d9d9c 100644 --- a/libavcodec/vulkan/rangecoder.comp +++ b/libavcodec/vulkan/rangecoder.comp @@ -191,3 +191,77 @@ void rac_init(out RangeCoder r, u8buf data, uint buf_size) r.outstanding_count = uint16_t(0); r.outstanding_byte = uint8_t(0xFF); } + +/* Decoder */ +uint overread; +bool corrupt; + +void rac_init_dec(out RangeCoder r, u8buf data, uint buf_size) +{ + overread = 0; + corrupt = false; + + /* Skip priming bytes */ + rac_init(r, OFFBUF(u8buf, data, 2), buf_size - 2); + + u8vec2 prime = u8vec2buf(data).v; + /* Switch endianness of the priming bytes */ + r.low = pack16(prime.yx); + + if (r.low >= 0xFF00) { + r.low = 0xFF00; + r.bytestream_end = uint64_t(data) + 2; + } +} + +void refill(inout RangeCoder c) +{ + c.range <<= 8; + c.low <<= 8; + if (c.bytestream < c.bytestream_end) { + c.low += u8buf(c.bytestream).v; + c.bytestream++; + } else { + overread++; + } +} + +bool get_rac(inout RangeCoder c, uint64_t state) +{ + u8buf sb = u8buf(state); + uint val = uint(sb.v); + uint16_t range1 = uint16_t((uint(c.range) * val) >> 8); + + c.range -= range1; + + bool bit = c.low >= c.range; + sb.v = zero_one_state[(uint(bit) << 8) + val]; + + if (bit) { + c.low -= c.range; + c.range = range1; + } + + if (c.range < 0x100) + refill(c); + + return bit; +} + +bool get_rac_equi(inout RangeCoder c) +{ + uint16_t range1 = c.range >> 1; + + c.range -= range1; + + bool bit = c.low >= c.range; + if (bit) { + c.low -= c.range; + c.range = range1; + } + + if (c.range < 0x100) + refill(c); + + return bit; +} diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index cd77e10e12..bc850a7333 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -36,6 +36,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_hevc_desc; #if CONFIG_AV1_VULKAN_HWACCEL extern const FFVulkanDecodeDescriptor ff_vk_dec_av1_desc; #endif +#if CONFIG_FFV1_VULKAN_HWACCEL +extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc; +#endif static const 
FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_H264_VULKAN_HWACCEL @@ -47,6 +50,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_AV1_VULKAN_HWACCEL &ff_vk_dec_av1_desc, #endif +#if CONFIG_FFV1_VULKAN_HWACCEL + &ff_vk_dec_ffv1_desc, +#endif }; static const FFVulkanDecodeDescriptor *get_codecdesc(enum AVCodecID codec_id) @@ -1035,6 +1041,17 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) frames_ctx->free = free_profile_data; hwfc->create_pnext = &prof->profile_list; + } else { + switch (frames_ctx->sw_format) { + case AV_PIX_FMT_GBRAP16: + frames_ctx->sw_format = AV_PIX_FMT_RGBA64; + break; + case AV_PIX_FMT_BGR0: + frames_ctx->sw_format = AV_PIX_FMT_RGB0; + break; + default: + break; + } } frames_ctx->width = avctx->coded_width; diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c new file mode 100644 index 0000000000..276514a228 --- /dev/null +++ b/libavcodec/vulkan_ffv1.c @@ -0,0 +1,1292 @@ +/* + * Copyright (c) 2024 Lynne + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vulkan_decode.h" +#include "hwaccel_internal.h" + +#include "ffv1.h" +#include "ffv1_vulkan.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/mem.h" + +extern const char *ff_source_common_comp; +extern const char *ff_source_rangecoder_comp; +extern const char *ff_source_ffv1_vlc_comp; +extern const char *ff_source_ffv1_common_comp; +extern const char *ff_source_ffv1_dec_setup_comp; +extern const char *ff_source_ffv1_reset_comp; +extern const char *ff_source_ffv1_dec_comp; +extern const char *ff_source_ffv1_dec_rct_comp; + +const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc = { + .codec_id = AV_CODEC_ID_FFV1, + .decode_extension = FF_VK_EXT_PUSH_DESCRIPTOR, + .queue_flags = VK_QUEUE_COMPUTE_BIT, +}; + +#define HOST_MAP + +typedef struct FFv1VulkanDecodePicture { + FFVulkanDecodePicture vp; + + AVBufferRef *tmp_data; + + AVBufferRef *slice_state; + uint32_t plane_state_size; + uint32_t slice_state_size; + uint32_t slice_data_size; + uint32_t max_context_count; + + AVBufferRef *slice_offset_buf; + uint32_t *slice_offset; + int slice_num; + + AVBufferRef *slice_status_buf; + int crc_checked; +} FFv1VulkanDecodePicture; + +typedef struct FFv1VulkanDecodeContext { + AVBufferRef *intermediate_frames_ref[2]; /* 16/32 bit */ + + FFVulkanShader setup; + FFVulkanShader reset[2]; /* AC/Golomb */ + FFVulkanShader decode[2][2][2]; /* 16/32 bit, AC/Golomb, Normal/RGB */ + FFVulkanShader rct[2]; /* 16/32 bit */ + + FFVkBuffer rangecoder_static_buf; + FFVkBuffer quant_buf; + FFVkBuffer crc_tab_buf; + + AVBufferPool *slice_state_pool; + AVBufferPool *tmp_data_pool; + AVBufferPool *slice_offset_pool; + AVBufferPool *slice_status_pool; +} FFv1VulkanDecodeContext; + +typedef struct FFv1VkParameters { + uint32_t 
context_count[MAX_QUANT_TABLES]; + + VkDeviceAddress slice_data; + VkDeviceAddress slice_state; + VkDeviceAddress scratch_data; + + uint32_t img_size[2]; + uint32_t chroma_shift[2]; + + uint32_t plane_state_size; + uint32_t crcref; + + uint8_t bits_per_raw_sample; + uint8_t quant_table_count; + uint8_t version; + uint8_t micro_version; + uint8_t key_frame; + uint8_t planes; + uint8_t codec_planes; + uint8_t transparency; + uint8_t colorspace; + uint8_t ec; + uint8_t golomb; + uint8_t check_crc; +} FFv1VkParameters; + +static void add_push_data(FFVulkanShader *shd) +{ + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLF(1, uint context_count[%i]; ,MAX_QUANT_TABLES); + GLSLC(0, ); + GLSLC(1, u8buf slice_data; ); + GLSLC(1, u8buf slice_state; ); + GLSLC(1, u8buf scratch_data; ); + GLSLC(0, ); + GLSLC(1, uvec2 img_size; ); + GLSLC(1, uvec2 chroma_shift; ); + GLSLC(0, ); + GLSLC(1, uint plane_state_size; ); + GLSLC(1, uint32_t crcref; ); + GLSLC(0, ); + GLSLC(1, uint8_t bits_per_raw_sample; ); + GLSLC(1, uint8_t quant_table_count; ); + GLSLC(1, uint8_t version; ); + GLSLC(1, uint8_t micro_version; ); + GLSLC(1, uint8_t key_frame; ); + GLSLC(1, uint8_t planes; ); + GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t colorspace; ); + GLSLC(1, uint8_t ec; ); + GLSLC(1, uint8_t golomb; ); + GLSLC(1, uint8_t check_crc; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters), + VK_SHADER_STAGE_COMPUTE_BIT); +} + +static int vk_ffv1_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + int err; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + FFv1VulkanDecodeContext *fv = ctx->sd_ctx; + FFV1Context *f = avctx->priv_data; + + FFv1VulkanDecodePicture *fp = f->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &fp->vp; + + AVHWFramesContext *hwfc = (AVHWFramesContext 
*)avctx->hw_frames_ctx->data; + enum AVPixelFormat sw_format = hwfc->sw_format; + + int is_rgb = !(f->colorspace == 0 && sw_format != AV_PIX_FMT_YA8) && + !(sw_format == AV_PIX_FMT_YA8); + + fp->slice_num = 0; + + for (int i = 0; i < f->quant_table_count; i++) + fp->max_context_count = FFMAX(f->context_count[i], fp->max_context_count); + + /* Allocate slice buffer data */ + if (f->ac == AC_GOLOMB_RICE) + fp->plane_state_size = 8; + else + fp->plane_state_size = CONTEXT_SIZE; + + fp->plane_state_size *= fp->max_context_count; + fp->slice_state_size = fp->plane_state_size*f->plane_count; + + fp->slice_data_size = 256; /* Overestimation for the SliceContext struct */ + fp->slice_state_size += fp->slice_data_size; + fp->slice_state_size = FFALIGN(fp->slice_state_size, 8); + + fp->crc_checked = f->ec && (avctx->err_recognition & AV_EF_CRCCHECK); + + /* Host map the input slices data if supported */ + if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) { + err = ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, f->pkt_ref->data, + f->pkt_ref->buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT); + if (err < 0) + return err; + } + + /* Allocate slice state data */ + if (f->picture.f->flags & AV_FRAME_FLAG_KEY) { + err = ff_vk_get_pooled_buffer(&ctx->s, &fv->slice_state_pool, + &fp->slice_state, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, f->max_slice_count*fp->slice_state_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (err < 0) + return err; + } else { + FFv1VulkanDecodePicture *fpl = f->hwaccel_last_picture_private; + fp->slice_state = av_buffer_ref(fpl->slice_state); + if (!fp->slice_state) + return AVERROR(ENOMEM); + } + + /* Allocate temporary data buffer */ + err = ff_vk_get_pooled_buffer(&ctx->s, &fv->tmp_data_pool, + &fp->tmp_data, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, f->max_slice_count*CONTEXT_SIZE, + 
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (err < 0) + return err; + + /* Allocate slice offsets buffer */ + err = ff_vk_get_pooled_buffer(&ctx->s, &fv->slice_offset_pool, + &fp->slice_offset_buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, 2*f->max_slice_count*sizeof(uint32_t), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if (err < 0) + return err; + + /* Allocate slice status buffer */ + err = ff_vk_get_pooled_buffer(&ctx->s, &fv->slice_status_pool, + &fp->slice_status_buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, f->max_slice_count*sizeof(uint32_t), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if (err < 0) + return err; + + /* Prepare frame to be used */ + err = ff_vk_decode_prepare_frame_sdr(dec, f->picture.f, vp, 1, + FF_VK_REP_NATIVE, 0); + if (err < 0) + return err; + + /* Create a temporaty frame for RGB */ + if (is_rgb) { + AVHWFramesContext *dpb_hwfc; + dpb_hwfc = (AVHWFramesContext *)fv->intermediate_frames_ref[f->use32bit]->data; + + vp->dpb_frame = av_frame_alloc(); + if (!vp->dpb_frame) + return AVERROR(ENOMEM); + + err = av_hwframe_get_buffer(fv->intermediate_frames_ref[f->use32bit], + vp->dpb_frame, 0); + if (err < 0) + return err; + + err = ff_vk_decode_prepare_frame_sdr(dec, vp->dpb_frame, vp, 1, + FF_VK_REP_NATIVE, 0); + if (err < 0) + return err; + + for (int i = 0; i < av_pix_fmt_count_planes(dpb_hwfc->sw_format); i++) { + err = ff_vk_create_imageview(&ctx->s, + &vp->view.dst[i], &vp->view.aspect_ref[i], + vp->dpb_frame, + i, FF_VK_REP_NATIVE); + if (err < 0) + return err; + } + } + + return 0; +} + +static int vk_ffv1_decode_slice(AVCodecContext *avctx, + const uint8_t *data, + uint32_t size) +{ + FFV1Context *f = avctx->priv_data; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + + FFv1VulkanDecodePicture 
*fp = f->hwaccel_picture_private; + FFVkBuffer *slice_offset = (FFVkBuffer *)fp->slice_offset_buf->data; + + if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) { + AV_WN32(slice_offset->mapped_mem + (2*fp->slice_num + 0)*sizeof(uint32_t), + data - f->pkt_ref->data); + AV_WN32(slice_offset->mapped_mem + (2*fp->slice_num + 1)*sizeof(uint32_t), + size); + fp->slice_num++; + } else { + FFVulkanDecodePicture *vp = &fp->vp; + int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0, + &fp->slice_num, + (const uint32_t **)&fp->slice_offset); + if (err < 0) + return err; + + AV_WN32(slice_offset->mapped_mem + (2*(fp->slice_num - 1) + 0)*sizeof(uint32_t), + fp->slice_offset[fp->slice_num - 1]); + AV_WN32(slice_offset->mapped_mem + (2*(fp->slice_num - 1) + 1)*sizeof(uint32_t), + size); + } + + return 0; +} + +static int vk_ffv1_end_frame(AVCodecContext *avctx) +{ + int err; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + FFVulkanFunctions *vk = &ctx->s.vkfn; + + FFV1Context *f = avctx->priv_data; + FFv1VulkanDecodeContext *fv = ctx->sd_ctx; + FFv1VkParameters pd; + FFv1VkResetParameters pd_reset; + + AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data; + enum AVPixelFormat sw_format = hwfc->sw_format; + + int bits = f->avctx->bits_per_raw_sample > 0 ? 
f->avctx->bits_per_raw_sample : 8; + int is_rgb = !(f->colorspace == 0 && sw_format != AV_PIX_FMT_YA8) && + !(sw_format == AV_PIX_FMT_YA8); + + FFVulkanShader *reset_shader; + FFVulkanShader *decode_shader; + + FFv1VulkanDecodePicture *fp = f->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &fp->vp; + + FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data; + FFVkBuffer *slice_state = (FFVkBuffer *)fp->slice_state->data; + FFVkBuffer *slice_offset = (FFVkBuffer *)fp->slice_offset_buf->data; + FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data; + + FFVkBuffer *tmp_data = (FFVkBuffer *)fp->tmp_data->data; + + AVFrame *decode_dst = is_rgb ? vp->dpb_frame : f->picture.f; + VkImageView *decode_dst_view = is_rgb ? vp->view.dst : vp->view.out; + + VkImageMemoryBarrier2 img_bar[37]; + int nb_img_bar = 0; + VkBufferMemoryBarrier2 buf_bar[8]; + int nb_buf_bar = 0; + + FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool); + ff_vk_exec_start(&ctx->s, exec); + + /* Prepare deps */ + RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, f->picture.f, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + err = ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value, + f->picture.f); + if (err < 0) + return err; + + if (is_rgb) + RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, vp->dpb_frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) { + FFv1VulkanDecodePicture *fpl = f->hwaccel_last_picture_private; + FFVulkanDecodePicture *vpl = &fpl->vp; + + /* Wait on the previous frame */ + RET(ff_vk_exec_add_dep_wait_sem(&ctx->s, exec, vpl->sem, vpl->sem_value, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)); + } + + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->slice_state, 1, 1)); + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->slice_status_buf, 1, 1)); + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0)); + 
vp->slices_buf = NULL; + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->slice_offset_buf, 1, 0)); + fp->slice_offset_buf = NULL; + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->tmp_data, 1, 0)); + fp->tmp_data = NULL; + + /* Entry barrier for the slice state */ + if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) { + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_state->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_state->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_state->buf, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + } + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + if (nb_buf_bar) { + /* The only barrier recorded so far sits at index 0; index 1 has not been written yet */ + slice_state->stage = buf_bar[0].dstStageMask; + slice_state->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + } + + /* Setup shader */ + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &fv->setup, + 1, 0, 0, + slice_state, + 0, fp->slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &fv->setup, + 1, 1, 0, + slice_offset, + 0, 2*f->slice_count*sizeof(uint32_t), + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &fv->setup, + 1, 2, 0, + slice_status, + 0, f->slice_count*sizeof(uint32_t), + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&ctx->s, exec, &fv->setup); + pd = (FFv1VkParameters) { + /* context_count */ + + .slice_data = slices_buf->address, + .slice_state = slice_state->address + f->slice_count*fp->slice_data_size, + .scratch_data = tmp_data->address, + + .img_size[0] = f->picture.f->width, + .img_size[1] = f->picture.f->height, + .chroma_shift[0] = f->chroma_h_shift, + .chroma_shift[1] =
f->chroma_v_shift, + + .plane_state_size = fp->plane_state_size, + .crcref = f->crcref, + + .bits_per_raw_sample = bits, + .quant_table_count = f->quant_table_count, + .version = f->version, + .micro_version = f->micro_version, + .key_frame = f->picture.f->flags & AV_FRAME_FLAG_KEY, + .planes = av_pix_fmt_count_planes(sw_format), + .codec_planes = f->plane_count, + .transparency = f->transparency, + .colorspace = f->colorspace, + .ec = f->ec, + .golomb = f->ac == AC_GOLOMB_RICE, + .check_crc = !!(avctx->err_recognition & AV_EF_CRCCHECK), + }; + for (int i = 0; i < MAX_QUANT_TABLES; i++) + pd.context_count[i] = f->context_count[i]; + + ff_vk_shader_update_push_const(&ctx->s, exec, &fv->setup, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); + + /* Reset shader */ + reset_shader = &fv->reset[f->ac == AC_GOLOMB_RICE]; + ff_vk_shader_update_desc_buffer(&ctx->s, exec, reset_shader, + 1, 0, 0, + slice_state, + 0, fp->slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&ctx->s, exec, reset_shader); + + pd_reset = (FFv1VkResetParameters) { + .slice_state = slice_state->address + f->slice_count*fp->slice_data_size, + .plane_state_size = fp->plane_state_size, + .context_count = fp->max_context_count, + .codec_planes = f->plane_count, + .key_frame = f->picture.f->flags & AV_FRAME_FLAG_KEY, + .version = f->version, + .micro_version = f->micro_version, + }; + ff_vk_shader_update_push_const(&ctx->s, exec, reset_shader, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd_reset), &pd_reset); + + /* Sync between setup and reset shaders */ + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_state->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_state->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + 
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_state->buf, + .offset = 0, + .size = fp->slice_data_size*f->slice_count, + }; + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + slice_state->stage = buf_bar[0].dstStageMask; + slice_state->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + + vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, + f->plane_count); + + /* Decode */ + decode_shader = &fv->decode[f->use32bit][f->ac == AC_GOLOMB_RICE][is_rgb]; + ff_vk_shader_update_desc_buffer(&ctx->s, exec, decode_shader, + 1, 0, 0, + slice_state, + 0, fp->slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader, + decode_dst, decode_dst_view, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + ff_vk_exec_bind_shader(&ctx->s, exec, decode_shader); + ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + /* Sync between reset and decode shaders */ + buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = slice_state->stage, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = slice_state->access, + .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_state->buf, + .offset = fp->slice_data_size*f->slice_count, + .size = slice_state->size - fp->slice_data_size*f->slice_count, + }; + + /* Input frame barrier */ + ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + 
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + slice_state->stage = buf_bar[0].dstStageMask; + slice_state->access = buf_bar[0].dstAccessMask; + nb_img_bar = 0; + nb_buf_bar = 0; + + vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); + + /* RCT */ + if (is_rgb) { + FFVulkanShader *rct_shader = &fv->rct[f->use32bit]; + FFv1VkRCTParameters pd_rct; + + ff_vk_shader_update_desc_buffer(&ctx->s, exec, rct_shader, + 1, 0, 0, + slice_state, + 0, fp->slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader, + decode_dst, decode_dst_view, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader, + f->picture.f, vp->view.out, + 1, 2, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + ff_vk_exec_bind_shader(&ctx->s, exec, rct_shader); + + pd_rct = (FFv1VkRCTParameters) { + .offset = 1 << bits, + .bits = bits, + .planar_rgb = ff_vk_mt_is_np_rgb(sw_format) && + (ff_vk_count_images((AVVkFrame *)f->picture.f->data[0]) > 1), + .transparency = f->transparency, + }; + ff_vk_set_perm(sw_format, pd_rct.fmt_lut); + + ff_vk_shader_update_push_const(&ctx->s, exec, rct_shader, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd_rct), &pd_rct); + + ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + 
VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); + } + + err = ff_vk_exec_submit(&ctx->s, exec); + if (err < 0) + return err; + +fail: + return 0; +} + +static void define_shared_code(FFVulkanShader *shd, int use32bit) +{ + int smp_bits = use32bit ? 32 : 16; + + GLSLC(0, #define DECODE ); + + av_bprintf(&shd->src, "#define CONTEXT_SIZE %i\n" ,CONTEXT_SIZE); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_MASK 0x%x\n" ,MAX_QUANT_TABLE_MASK); + + GLSLF(0, #define TYPE int%i_t ,smp_bits); + GLSLF(0, #define VTYPE2 i%ivec2 ,smp_bits); + GLSLF(0, #define VTYPE3 i%ivec3 ,smp_bits); + GLSLD(ff_source_rangecoder_comp); + GLSLD(ff_source_ffv1_common_comp); +} + +static int init_setup_shader(FFV1Context *f, FFVulkanContext *s, + FFVkExecPool *pool, FFVkSPIRVCompiler *spv, + FFVulkanShader *shd) +{ + int err; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + + RET(ff_vk_shader_init(s, shd, "ffv1_dec_setup", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + 1, 1, 1, + 0)); + + /* Common codec header */ + GLSLD(ff_source_common_comp); + + add_push_data(shd); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, 
+ { + .name = "crc_ieee_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint32_t crc_ieee[256];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 1, 0)); + + define_shared_code(shd, 0 /* Irrelevant */); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, + }, + { + .name = "slice_offsets_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "readonly", + .buf_content = "uint32_t slice_offsets", + .buf_elems = 2*f->max_slice_count, + }, + { + .name = "slice_status_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_quali = "writeonly", + .buf_content = "uint32_t slice_crc_mismatch", + .buf_elems = 2*f->max_slice_count, + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0)); + + GLSLD(ff_source_ffv1_dec_setup_comp); + + RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_reset_shader(FFV1Context *f, FFVulkanContext *s, + FFVkExecPool *pool, FFVkSPIRVCompiler *spv, + FFVulkanShader *shd, int ac) +{ + int err; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + int wg_dim = 
FFMIN(s->props.properties.limits.maxComputeWorkGroupSize[0], 1024); + + RET(ff_vk_shader_init(s, shd, "ffv1_dec_reset", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + wg_dim, 1, 1, + 0)); + + if (ac == AC_GOLOMB_RICE) + av_bprintf(&shd->src, "#define GOLOMB\n"); + + /* Common codec header */ + GLSLD(ff_source_common_comp); + + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, u8buf slice_state; ); + GLSLC(1, uint plane_state_size; ); + GLSLC(1, uint context_count; ); + GLSLC(1, uint8_t codec_planes; ); + GLSLC(1, uint8_t key_frame; ); + GLSLC(1, uint8_t version; ); + GLSLC(1, uint8_t micro_version; ); + GLSLC(1, uint8_t padding[1]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkResetParameters), + VK_SHADER_STAGE_COMPUTE_BIT); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 1, 0)); + + define_shared_code(shd, 0 /* Bit depth irrelevant for the reset shader */); + if (ac == AC_GOLOMB_RICE) + GLSLD(ff_source_ffv1_vlc_comp); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .mem_quali = "readonly", + .stages = 
VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 1, 0, 0)); + + GLSLD(ff_source_ffv1_reset_comp); + + RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, + FFVkExecPool *pool, FFVkSPIRVCompiler *spv, + FFVulkanShader *shd, AVHWFramesContext *frames_ctx, + int use32bit, int ac, int rgb) +{ + int err; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + + RET(ff_vk_shader_init(s, shd, "ffv1_dec", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + 1, 1, 1, + 0)); + + if (ac == AC_GOLOMB_RICE) + av_bprintf(&shd->src, "#define GOLOMB\n"); + + if (rgb) + av_bprintf(&shd->src, "#define RGB\n"); + + /* Common codec header */ + GLSLD(ff_source_common_comp); + + add_push_data(shd); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + + 
RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 1, 0)); + + define_shared_code(shd, use32bit); + if (ac == AC_GOLOMB_RICE) + GLSLD(ff_source_ffv1_vlc_comp); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, + }, + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format, + FF_VK_REP_NATIVE), + .elems = av_pix_fmt_count_planes(frames_ctx->sw_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0)); + + GLSLD(ff_source_ffv1_dec_comp); + + RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_rct_shader(FFV1Context *f, FFVulkanContext *s, + FFVkExecPool *pool, FFVkSPIRVCompiler *spv, + FFVulkanShader *shd, int use32bit, + AVHWFramesContext *src_ctx, AVHWFramesContext *dst_ctx) +{ + int err; + FFVulkanDescriptorSetBinding *desc_set; + + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + int wg_count = sqrt(s->props.properties.limits.maxComputeWorkGroupInvocations); + + RET(ff_vk_shader_init(s, shd, "ffv1_rct", + VK_SHADER_STAGE_COMPUTE_BIT, + (const char *[]) { "GL_EXT_buffer_reference", + "GL_EXT_buffer_reference2" }, 2, + wg_count, wg_count, 1, + 0)); + + /* Common codec header */ + GLSLD(ff_source_common_comp); + + GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); + GLSLC(1, ivec4 fmt_lut; ); + GLSLC(1, int offset; ); + GLSLC(1, uint8_t bits; ); + GLSLC(1, uint8_t planar_rgb; ); + GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t version; ); 
+ GLSLC(1, uint8_t micro_version; ); + GLSLC(1, uint8_t padding[3]; ); + GLSLC(0, }; ); + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters), + VK_SHADER_STAGE_COMPUTE_BIT); + + av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); + av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); + av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "rangecoder_static_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "uint8_t zero_one_state[512];", + }, + { + .name = "quant_buf", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" + "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 1, 0)); + + define_shared_code(shd, use32bit); + + desc_set = (FFVulkanDescriptorSetBinding []) { + { + .name = "slice_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .mem_quali = "readonly", + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .buf_content = "SliceContext slice_ctx", + .buf_elems = f->max_slice_count, + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(src_ctx->sw_format, + FF_VK_REP_NATIVE), + .mem_quali = "readonly", + .elems = av_pix_fmt_count_planes(src_ctx->sw_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .dimensions = 2, + .mem_layout = ff_vk_shader_rep_fmt(dst_ctx->sw_format, + FF_VK_REP_NATIVE), + .mem_quali = "writeonly", + .elems = av_pix_fmt_count_planes(dst_ctx->sw_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0)); + + 
GLSLD(ff_source_ffv1_dec_rct_comp); + + RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", + &spv_opaque)); + RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + if (spv_opaque) + spv->free_shader(spv, &spv_opaque); + + return err; +} + +static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s, + AVBufferRef **dst, enum AVPixelFormat sw_format) +{ + int err; + AVHWFramesContext *frames_ctx; + AVVulkanFramesContext *vk_frames; + + *dst = av_hwframe_ctx_alloc(s->device_ref); + if (!(*dst)) + return AVERROR(ENOMEM); + + frames_ctx = (AVHWFramesContext *)((*dst)->data); + frames_ctx->format = AV_PIX_FMT_VULKAN; + frames_ctx->sw_format = sw_format; + frames_ctx->width = FFALIGN(s->frames->width, 32); + frames_ctx->height = FFALIGN(s->frames->height, 32); + + vk_frames = frames_ctx->hwctx; + vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; + vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT; + vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + + err = av_hwframe_ctx_init(*dst); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize frame pool with format %s: %s\n", + av_get_pix_fmt_name(sw_format), av_err2str(err)); + av_buffer_unref(dst); + return err; + } + + return 0; +} + +static void vk_decode_ffv1_uninit(FFVulkanDecodeShared *ctx) +{ + FFv1VulkanDecodeContext *fv = ctx->sd_ctx; + + ff_vk_shader_free(&ctx->s, &fv->setup); + + for (int i = 0; i < 2; i++) /* 16/32 bit */ + av_buffer_unref(&fv->intermediate_frames_ref[i]); + + for (int i = 0; i < 2; i++) /* AC/Golomb */ + ff_vk_shader_free(&ctx->s, &fv->reset[i]); + + for (int i = 0; i < 2; i++) /* 16/32 bit */ + for (int j = 0; j < 2; j++) /* AC/Golomb */ + for (int k = 0; k < 2; k++) /* Normal/RGB */ + ff_vk_shader_free(&ctx->s, &fv->decode[i][j][k]); + + for (int i = 0; i < 2; i++) /* 16/32 bit */ + ff_vk_shader_free(&ctx->s, &fv->rct[i]); + + ff_vk_free_buf(&ctx->s, &fv->quant_buf); + 
ff_vk_free_buf(&ctx->s, &fv->rangecoder_static_buf); + ff_vk_free_buf(&ctx->s, &fv->crc_tab_buf); + + av_buffer_pool_uninit(&fv->tmp_data_pool); + av_buffer_pool_uninit(&fv->slice_state_pool); + av_buffer_pool_uninit(&fv->slice_offset_pool); + av_buffer_pool_uninit(&fv->slice_status_pool); +} + +static int vk_decode_ffv1_init(AVCodecContext *avctx) +{ + int err; + FFV1Context *f = avctx->priv_data; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = NULL; + FFv1VulkanDecodeContext *fv; + FFVkSPIRVCompiler *spv; + + if (f->version < 3) + return AVERROR(ENOTSUP); + + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return AVERROR_EXTERNAL; + } + + err = ff_vk_decode_init(avctx); + if (err < 0) + return err; + ctx = dec->shared_ctx; + + fv = ctx->sd_ctx = av_mallocz(sizeof(*fv)); + if (!fv) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->sd_ctx_free = &vk_decode_ffv1_uninit; + + /* Intermediate frame pool for RCT */ + for (int i = 0; i < 2; i++) { /* 16/32 bit */ + err = init_indirect(avctx, &ctx->s, &fv->intermediate_frames_ref[i], + i ? AV_PIX_FMT_RGBA128 : AV_PIX_FMT_RGBA64); + if (err < 0) + return err; + } + + /* Setup shader */ + err = init_setup_shader(f, &ctx->s, &ctx->exec_pool, spv, &fv->setup); + if (err < 0) + return err; + + /* Reset shaders */ + for (int i = 0; i < 2; i++) { /* AC/Golomb */ + err = init_reset_shader(f, &ctx->s, &ctx->exec_pool, + spv, &fv->reset[i], !i ? AC_RANGE_CUSTOM_TAB : 0); + if (err < 0) + return err; + } + + /* Decode shaders */ + for (int i = 0; i < 2; i++) { /* 16/32 bit */ + for (int j = 0; j < 2; j++) { /* AC/Golomb */ + for (int k = 0; k < 2; k++) { /* Normal/RGB */ + AVHWFramesContext *frames_ctx; + frames_ctx = k ? 
(AVHWFramesContext *)fv->intermediate_frames_ref[i]->data : + (AVHWFramesContext *)avctx->hw_frames_ctx->data; + err = init_decode_shader(f, &ctx->s, &ctx->exec_pool, + spv, &fv->decode[i][j][k], + frames_ctx, + i, + !j ? AC_RANGE_CUSTOM_TAB : AC_GOLOMB_RICE, + k); + if (err < 0) + return err; + } + } + } + + /* RCT shaders */ + for (int i = 0; i < 2; i++) { /* 16/32 bit */ + err = init_rct_shader(f, &ctx->s, &ctx->exec_pool, + spv, &fv->rct[i], i, + (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data, + (AVHWFramesContext *)avctx->hw_frames_ctx->data); + if (err < 0) + return err; + } + + /* Range coder data */ + err = ff_ffv1_vk_init_state_transition_data(&ctx->s, + &fv->rangecoder_static_buf, + f); + if (err < 0) + return err; + + /* Quantization table data */ + err = ff_ffv1_vk_init_quant_table_data(&ctx->s, + &fv->quant_buf, + f); + if (err < 0) + return err; + + /* CRC table buffer */ + err = ff_ffv1_vk_init_crc_table_data(&ctx->s, + &fv->crc_tab_buf, + f); + if (err < 0) + return err; + + /* Update setup global descriptors */ + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &fv->setup, 0, 0, 0, + &fv->rangecoder_static_buf, + 0, fv->rangecoder_static_buf.size, + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &fv->setup, 0, 1, 0, + &fv->crc_tab_buf, + 0, fv->crc_tab_buf.size, + VK_FORMAT_UNDEFINED)); + + /* Update decode global descriptors */ + for (int i = 0; i < 2; i++) { /* 16/32 bit */ + for (int j = 0; j < 2; j++) { /* AC/Golomb */ + for (int k = 0; k < 2; k++) { /* Normal/RGB */ + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &fv->decode[i][j][k], 0, 0, 0, + &fv->rangecoder_static_buf, + 0, fv->rangecoder_static_buf.size, + VK_FORMAT_UNDEFINED)); + RET(ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0], + &fv->decode[i][j][k], 0, 1, 0, + &fv->quant_buf, + 0, fv->quant_buf.size, + VK_FORMAT_UNDEFINED)); + } + } + } + 
+fail: + return err; +} + +static void vk_ffv1_free_frame_priv(AVRefStructOpaque _hwctx, void *data) +{ + AVHWDeviceContext *hwctx = _hwctx.nc; + + FFv1VulkanDecodePicture *fp = data; + FFVulkanDecodePicture *vp = &fp->vp; + + ff_vk_decode_free_frame(hwctx, vp); + + if (fp->crc_checked) { + FFVkBuffer *slice_status = (FFVkBuffer *)fp->slice_status_buf->data; + for (int i = 0; i < fp->slice_num; i++) { + uint32_t crc_res; + crc_res = AV_RN32(slice_status->mapped_mem + i*sizeof(uint32_t)); + if (crc_res != 0) + av_log(hwctx, AV_LOG_ERROR, "CRC mismatch in slice %i, res: 0x%x\n", + i, crc_res); + } + } + + av_buffer_unref(&vp->slices_buf); + av_buffer_unref(&fp->slice_state); + av_buffer_unref(&fp->slice_offset_buf); + av_buffer_unref(&fp->slice_status_buf); + av_buffer_unref(&fp->tmp_data); +} + +const FFHWAccel ff_ffv1_vulkan_hwaccel = { + .p.name = "ffv1_vulkan", + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_FFV1, + .p.pix_fmt = AV_PIX_FMT_VULKAN, + .start_frame = &vk_ffv1_start_frame, + .decode_slice = &vk_ffv1_decode_slice, + .end_frame = &vk_ffv1_end_frame, + .free_frame_priv = &vk_ffv1_free_frame_priv, + .frame_priv_data_size = sizeof(FFv1VulkanDecodePicture), + .init = &vk_decode_ffv1_init, + .update_thread_context = &ff_vk_update_thread_context, + .decode_params = &ff_vk_params_invalidate, + .flush = &ff_vk_decode_flush, + .uninit = &ff_vk_decode_uninit, + .frame_params = &ff_vk_frame_params, + .priv_data_size = sizeof(FFVulkanDecodeContext), + .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE, +}; -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".