* [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
@ 2024-02-25 8:27 J. Dekker
2024-02-25 15:56 ` Ronald S. Bultje
0 siblings, 1 reply; 9+ messages in thread
From: J. Dekker @ 2024-02-25 8:27 UTC (permalink / raw)
To: ffmpeg-devel
Weak filter can overflow in delta0 calculation before >> 4 in int16.
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
I do not know x86 simd at all, so this is just an attempt to fix
the implementation rather than write extremely performant code.
Suggestions welcome.
libavcodec/x86/hevc_deblock.asm | 47 +++++++++++++++++++++++++++++++++
libavcodec/x86/hevcdsp_init.c | 8 ------
2 files changed, 47 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 85ee4800bb..ce9221ebc7 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -541,6 +541,7 @@ ALIGN 16
add betaq, r13
shr betaq, 3; ((beta + (beta >> 1)) >> 3))
+%if %1 < 12
mova m13, [pw_8]
psubw m12, m4, m3 ; q0 - p0
psllw m10, m12, 3; 8 * (q0 - p0)
@@ -553,7 +554,49 @@ ALIGN 16
paddw m12, m13; + 8
psraw m12, 4; >> 4 , delta0
PABSW m13, m12; abs(delta0)
+%else
+ psubw m12, m4, m3 ; q0 - p0
+ pmovsxwd m13, m12 ; m13 low
+ movhlps m12, m12
+ pmovsxwd m12, m12 ; m12 high
+
+ ; m8 low, m10 high
+ pslld m8, m13, 3; 8 * (q0 - p0)
+ pslld m10, m12, 3
+
+ paddd m8, m13 ; 9 * (q0 - p0)
+ paddd m10, m12
+
+ psubw m12, m5, m2 ; q1 - p1
+ pmovsxwd m13, m12 ; m13 low
+ movhlps m12, m12
+ pmovsxwd m12, m12 ; m12 high
+ psubd m8, m13 ; 9 * (q0 - p0) - ( q1 - p1 )
+ psubd m10, m12
+
+ pslld m13, m13, 1; 2 * ( q1 - p1 )
+ pslld m12, m12, 1
+
+ psubd m8, m13; 9 * (q0 - p0) - 3 * ( q1 - p1 )
+ psubd m10, m12
+
+ mova m13, [pw_8]
+ pmovsxwd m13, m13
+
+ paddd m8, m13 ; + 8
+ paddd m10, m13
+
+ psrad m8, 4; >> 4 , delta0
+ psrad m10, 4
+
+ packssdw m12, m8
+ packssdw m10, m10
+
+ psrldq m12, 8
+ punpcklqdq m12, m10
+ PABSW m13, m12; abs(delta0)
+%endif
psllw m10, m9, 2; 8 * tc
paddw m10, m9; 10 * tc
@@ -746,6 +789,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
+%if cpuflag(avx)
cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
@@ -757,6 +801,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
RET
+%endif
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
@@ -829,6 +874,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
+%if cpuflag(avx)
cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
@@ -859,6 +905,7 @@ cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
+%endif
%endmacro
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f5bc342cd5..e3fcb7b591 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -1205,10 +1205,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
- if (ARCH_X86_64) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
- }
SAO_BAND_INIT(12, sse2);
SAO_EDGE_INIT(12, sse2);
@@ -1216,10 +1212,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
}
- if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
- }
if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
--
2.43.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 8:27 [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow J. Dekker
@ 2024-02-25 15:56 ` Ronald S. Bultje
2024-02-25 16:22 ` Ronald S. Bultje
0 siblings, 1 reply; 9+ messages in thread
From: Ronald S. Bultje @ 2024-02-25 15:56 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Hi,
On Sun, Feb 25, 2024 at 3:28 AM J. Dekker <jdek@itanimul.li> wrote:
> Weak filter can overflow in delta0 calculation before >> 4 in int16.
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> I do not know x86 simd at all, so this is just an attempt to fix
> the implementation rather than write extremely performant code.
>
> Suggestions welcome.
>
https://pastebin.com/KvcbQ2nK
Most of this can remain in 16bits when doing pabsw before the add. The odd
thing is that if I break (intentionally) the sse2 version, the ssse3/avx
checkams tests start failing occasionally also. I'm not sure why that is
the case. Not sure how to measure perf impact, I don't think there is any
for the non-12bit case (same number of instructions) but I didn't test very
hard and the perf is not very stable...
Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 15:56 ` Ronald S. Bultje
@ 2024-02-25 16:22 ` Ronald S. Bultje
2024-02-25 16:24 ` Ronald S. Bultje
2024-02-25 16:28 ` James Almer
0 siblings, 2 replies; 9+ messages in thread
From: Ronald S. Bultje @ 2024-02-25 16:22 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 600 bytes --]
On Sun, Feb 25, 2024 at 10:56 AM Ronald S. Bultje <rsbultje@gmail.com>
wrote:
> Hi,
>
> On Sun, Feb 25, 2024 at 3:28 AM J. Dekker <jdek@itanimul.li> wrote:
>
>> Weak filter can overflow in delta0 calculation before >> 4 in int16.
>>
>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>> ---
>>
>> I do not know x86 simd at all, so this is just an attempt to fix
>> the implementation rather than write extremely performant code.
>>
>> Suggestions welcome.
>>
>
> https://pastebin.com/KvcbQ2nK
>
Attached a slightly adjusted version which does sse2 in 16bit also.
Ronald
[-- Attachment #2: 0001-hevx-x86-deblock-fix-12bit-overflow.patch --]
[-- Type: application/octet-stream, Size: 2698 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 16:22 ` Ronald S. Bultje
@ 2024-02-25 16:24 ` Ronald S. Bultje
2024-02-25 16:28 ` James Almer
1 sibling, 0 replies; 9+ messages in thread
From: Ronald S. Bultje @ 2024-02-25 16:24 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 774 bytes --]
Hi,
On Sun, Feb 25, 2024 at 11:22 AM Ronald S. Bultje <rsbultje@gmail.com>
wrote:
>
>
> On Sun, Feb 25, 2024 at 10:56 AM Ronald S. Bultje <rsbultje@gmail.com>
> wrote:
>
>> Hi,
>>
>> On Sun, Feb 25, 2024 at 3:28 AM J. Dekker <jdek@itanimul.li> wrote:
>>
>>> Weak filter can overflow in delta0 calculation before >> 4 in int16.
>>>
>>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>>> ---
>>>
>>> I do not know x86 simd at all, so this is just an attempt to fix
>>> the implementation rather than write extremely performant code.
>>>
>>> Suggestions welcome.
>>>
>>
>> https://pastebin.com/KvcbQ2nK
>>
>
> Attached a slightly adjusted version which does sse2 in 16bit also.
>
And now without typos and whitespace changes.
Ronald
[-- Attachment #2: 0001-hevc-x86-deblock-fix-12bit-overflow.patch --]
[-- Type: application/octet-stream, Size: 2497 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 16:22 ` Ronald S. Bultje
2024-02-25 16:24 ` Ronald S. Bultje
@ 2024-02-25 16:28 ` James Almer
2024-02-25 16:41 ` Ronald S. Bultje
1 sibling, 1 reply; 9+ messages in thread
From: James Almer @ 2024-02-25 16:28 UTC (permalink / raw)
To: ffmpeg-devel
On 2/25/2024 1:22 PM, Ronald S. Bultje wrote:
> On Sun, Feb 25, 2024 at 10:56 AM Ronald S. Bultje <rsbultje@gmail.com>
> wrote:
>
>> Hi,
>>
>> On Sun, Feb 25, 2024 at 3:28 AM J. Dekker <jdek@itanimul.li> wrote:
>>
>>> Weak filter can overflow in delta0 calculation before >> 4 in int16.
>>>
>>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>>> ---
>>>
>>> I do not know x86 simd at all, so this is just an attempt to fix
>>> the implementation rather than write extremely performant code.
>>>
>>> Suggestions welcome.
>>>
>>
>> https://pastebin.com/KvcbQ2nK
>>
>
> Attached a slightly adjusted version which does sse2 in 16bit also.
>
> Ronald
> diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
> index 85ee4800bb..869301caff 100644
> --- a/libavcodec/x86/hevc_deblock.asm
> +++ b/libavcodec/x86/hevc_deblock.asm
> @@ -31,6 +31,7 @@ cextern pw_1023
> pw_pixel_max_12: times 8 dw ((1 << 12)-1)
> pw_m2: times 8 dw -2
> pd_1 : times 4 dd 1
> +pd_8 : times 8 dd 8
This is unused.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 16:28 ` James Almer
@ 2024-02-25 16:41 ` Ronald S. Bultje
2024-02-25 22:30 ` Henrik Gramner via ffmpeg-devel
0 siblings, 1 reply; 9+ messages in thread
From: Ronald S. Bultje @ 2024-02-25 16:41 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 1272 bytes --]
Hi,
On Sun, Feb 25, 2024 at 11:28 AM James Almer <jamrial@gmail.com> wrote:
> On 2/25/2024 1:22 PM, Ronald S. Bultje wrote:
> > On Sun, Feb 25, 2024 at 10:56 AM Ronald S. Bultje <rsbultje@gmail.com>
> > wrote:
> >
> >> Hi,
> >>
> >> On Sun, Feb 25, 2024 at 3:28 AM J. Dekker <jdek@itanimul.li> wrote:
> >>
> >>> Weak filter can overflow in delta0 calculation before >> 4 in int16.
> >>>
> >>> Signed-off-by: J. Dekker <jdek@itanimul.li>
> >>> ---
> >>>
> >>> I do not know x86 simd at all, so this is just an attempt to fix
> >>> the implementation rather than write extremely performant code.
> >>>
> >>> Suggestions welcome.
> >>>
> >>
> >> https://pastebin.com/KvcbQ2nK
> >>
> >
> > Attached a slightly adjusted version which does sse2 in 16bit also.
> >
> > Ronald
>
> > diff --git a/libavcodec/x86/hevc_deblock.asm
> b/libavcodec/x86/hevc_deblock.asm
> > index 85ee4800bb..869301caff 100644
> > --- a/libavcodec/x86/hevc_deblock.asm
> > +++ b/libavcodec/x86/hevc_deblock.asm
> > @@ -31,6 +31,7 @@ cextern pw_1023
> > pw_pixel_max_12: times 8 dw ((1 << 12)-1)
> > pw_m2: times 8 dw -2
> > pd_1 : times 4 dd 1
> > +pd_8 : times 8 dd 8
>
> This is unused.
>
Fixed.
Ronald
[-- Attachment #2: 0001-hevc-x86-deblock-fix-12bit-overflow.patch --]
[-- Type: application/octet-stream, Size: 2294 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 16:41 ` Ronald S. Bultje
@ 2024-02-25 22:30 ` Henrik Gramner via ffmpeg-devel
2024-02-25 23:00 ` Ronald S. Bultje
0 siblings, 1 reply; 9+ messages in thread
From: Henrik Gramner via ffmpeg-devel @ 2024-02-25 22:30 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Henrik Gramner
On Sun, Feb 25, 2024 at 5:42 PM Ronald S. Bultje <rsbultje@gmail.com> wrote:
> + mova m13, [pw_8]
> + paddw m10, m12, m12
> + paddw m12, m10 ; 9 * (q0 - p0) - 3 * ( q1 - p1 )
> paddw m12, m13; + 8
Memory operand
> + paddw m10, m13, m13
> + paddw m13, m10 ; abs(9 * (q0 - p0) - 3 * ( q1 - p1 ))
> + paddw m13, [pw_8]
[...]
> + paddw m13, m12, m12
> + paddw m13, m12 ; 3*abs(m12)
> + paddw m13, [pw_8]
Another minor improvement would be to reorder the adds like (x + x) +
(x + 8) instead of ((x + x) + x) + 8 to allow for more
instruction-level parallelism.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
2024-02-25 22:30 ` Henrik Gramner via ffmpeg-devel
@ 2024-02-25 23:00 ` Ronald S. Bultje
2024-02-26 14:26 ` J. Dekker
0 siblings, 1 reply; 9+ messages in thread
From: Ronald S. Bultje @ 2024-02-25 23:00 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Henrik Gramner
[-- Attachment #1: Type: text/plain, Size: 900 bytes --]
Hi,
On Sun, Feb 25, 2024 at 5:30 PM Henrik Gramner via ffmpeg-devel <
ffmpeg-devel@ffmpeg.org> wrote:
> On Sun, Feb 25, 2024 at 5:42 PM Ronald S. Bultje <rsbultje@gmail.com>
> wrote:
> > + mova m13, [pw_8]
> > + paddw m10, m12, m12
> > + paddw m12, m10 ; 9 * (q0 - p0) - 3 * ( q1 - p1 )
> > paddw m12, m13; + 8
>
> Memory operand
>
> > + paddw m10, m13, m13
> > + paddw m13, m10 ; abs(9 * (q0 - p0) - 3 * ( q1 - p1 ))
> > + paddw m13, [pw_8]
> [...]
> > + paddw m13, m12, m12
> > + paddw m13, m12 ; 3*abs(m12)
> > + paddw m13, [pw_8]
>
> Another minor improvement would be to reorder the adds like (x + x) +
> (x + 8) instead of ((x + x) + x) + 8 to allow for more
> instruction-level parallelism.
>
New version attached.
Ronald
[-- Attachment #2: 0001-hevc-x86-deblock-fix-12bit-overflow.patch --]
[-- Type: application/octet-stream, Size: 2299 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2024-02-26 14:28 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-25 8:27 [FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow J. Dekker
2024-02-25 15:56 ` Ronald S. Bultje
2024-02-25 16:22 ` Ronald S. Bultje
2024-02-25 16:24 ` Ronald S. Bultje
2024-02-25 16:28 ` James Almer
2024-02-25 16:41 ` Ronald S. Bultje
2024-02-25 22:30 ` Henrik Gramner via ffmpeg-devel
2024-02-25 23:00 ` Ronald S. Bultje
2024-02-26 14:26 ` J. Dekker
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git