From: James Almer <jamrial@gmail.com> To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH] x86/vvc_alf: use the x86inc instruction macros Date: Tue, 21 May 2024 10:52:56 -0300 Message-ID: <20240521135256.3111-1-jamrial@gmail.com> (raw) Let its magic figure out the correct mnemonic based on target instruction set. Signed-off-by: James Almer <jamrial@gmail.com> --- libavcodec/x86/vvc/vvc_alf.asm | 202 ++++++++++++++++----------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm index b3d118962f..71e821c27b 100644 --- a/libavcodec/x86/vvc/vvc_alf.asm +++ b/libavcodec/x86/vvc/vvc_alf.asm @@ -73,15 +73,15 @@ SECTION .text ;m%2 = 07 06 05 04 ;m%3 = 11 10 09 08 - vshufpd m%5, m%1, m%2, 0011b ;06 02 05 01 - vshufpd m%6, m%3, m%5, 1001b ;06 10 01 09 + shufpd m%5, m%1, m%2, 0011b ;06 02 05 01 + shufpd m%6, m%3, m%5, 1001b ;06 10 01 09 - vshufpd m%1, m%1, m%6, 1100b ;06 03 09 00 - vshufpd m%2, m%2, m%6, 0110b ;10 07 01 04 - vshufpd m%3, m%3, m%5, 0110b ;02 11 05 08 + shufpd m%1, m%1, m%6, 1100b ;06 03 09 00 + shufpd m%2, m%2, m%6, 0110b ;10 07 01 04 + shufpd m%3, m%3, m%5, 0110b ;02 11 05 08 vpermpd m%1, m%1, 01111000b ;09 06 03 00 - vshufpd m%2, m%2, m%2, 1001b ;10 07 04 01 + shufpd m%2, m%2, m%2, 1001b ;10 07 04 01 vpermpd m%3, m%3, 10000111b ;11 08 05 02 %endmacro @@ -125,21 +125,21 @@ SECTION .text pxor m11, m11 psubw m11, m12 ;-clip - vpsubw m9, m2 + psubw m9, m2 CLIPW m9, m11, m12 - vpsubw m10, m2 + psubw m10, m2 CLIPW m10, m11, m12 - vpunpckhwd m13, m9, m10 - vpunpcklwd m9, m9, m10 + punpckhwd m13, m9, m10 + punpcklwd m9, m9, m10 pshufb m12, filters, [param_shuffe_ %+ i] ;filter - vpunpcklwd m10, m12, m12 - vpunpckhwd m12, m12, m12 + punpcklwd m10, m12, m12 + punpckhwd m12, m12, m12 - vpmaddwd m9, m10 - vpmaddwd m12, m13 + pmaddwd m9, m10 + pmaddwd m12, m13 paddd m0, m9 paddd m1, m12 @@ -268,17 +268,17 @@ SECTION .text je %%near_vb %endif %%no_vb: - vpsrad m0, SHIFT - vpsrad m1, 
SHIFT + psrad m0, SHIFT + psrad m1, SHIFT jmp %%shift_end %%near_vb: vpbroadcastd m9, [dd448] paddd m0, m9 paddd m1, m9 - vpsrad m0, SHIFT + 3 - vpsrad m1, SHIFT + 3 + psrad m0, SHIFT + 3 + psrad m1, SHIFT + 3 %%shift_end: - vpackssdw m0, m0, m1 + packssdw m0, m0, m1 %endmacro ; FILTER_VB(line) @@ -320,7 +320,7 @@ SECTION .text %if ps == 2 movu %1, %2 %else - vpmovzxbw %1, %2 + pmovzxbw %1, %2 %endif %endmacro @@ -329,7 +329,7 @@ SECTION .text %if ps == 2 movu %1, m%2 %else - vpackuswb m%2, m%2 + packuswb m%2, m%2 vpermq m%2, m%2, 0x8 movu %1, xm%2 %endif @@ -489,43 +489,43 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w LOAD_PIXELS m6, [s2q + 2 * ps] LOAD_PIXELS m7, [s3q + 2 * ps] - vpblendw m8, m0, m1, 0xaa ; nw - vpblendw m9, m0, m5, 0x55 ; n - vpblendw m10, m4, m5, 0xaa ; ne - vpblendw m11, m1, m2, 0xaa ; w - vpblendw m12, m5, m6, 0xaa ; e - vpblendw m13, m2, m3, 0xaa ; sw - vpblendw m14, m2, m7, 0x55 ; s + pblendw m8, m0, m1, 0xaa ; nw + pblendw m9, m0, m5, 0x55 ; n + pblendw m10, m4, m5, 0xaa ; ne + pblendw m11, m1, m2, 0xaa ; w + pblendw m12, m5, m6, 0xaa ; e + pblendw m13, m2, m3, 0xaa ; sw + pblendw m14, m2, m7, 0x55 ; s - vpblendw m0, m1, m6, 0x55 - vpaddw m0, m0 ; c + pblendw m0, m1, m6, 0x55 + paddw m0, m0 ; c movu m1, [CLASSIFY_SHUFFE] pshufb m1, m0, m1 ; d - vpaddw m9, m14 ; n + s - vpsubw m9, m0 ; (n + s) - c - vpabsw m9, m9 ; ver + paddw m9, m14 ; n + s + psubw m9, m0 ; (n + s) - c + pabsw m9, m9 ; ver - vpaddw m11, m12 ; w + e - vpsubw m11, m1 ; (w + e) - d - vpabsw m11, m11 ; hor + paddw m11, m12 ; w + e + psubw m11, m1 ; (w + e) - d + pabsw m11, m11 ; hor - vpblendw m14, m6, m7, 0xaa ; se - vpaddw m8, m14 ; nw + se - vpsubw m8, m1 ; (nw + se) - d - vpabsw m8, m8 ; di0 + pblendw m14, m6, m7, 0xaa ; se + paddw m8, m14 ; nw + se + psubw m8, m1 ; (nw + se) - d + pabsw m8, m8 ; di0 - vpaddw m10, m13 ; ne + sw - vpsubw m10, m1 ; (nw + se) - d - vpabsw m10, m10 ; di1 + paddw m10, m13 ; ne + sw + psubw m10, m1 ; (nw + 
se) - d + pabsw m10, m10 ; di1 phaddw m9, m11 ; vh, each word represent 2x2 pixels phaddw m8, m10 ; di, each word represent 2x2 pixels phaddw m0, m9, m8 ; all = each word represent 4x2 pixels, order is v_h_d0_d1 x 4 vinserti128 m15, m15, xm0, 1 - vpblendw m1, m0, m15, 0xaa ; t + pblendw m1, m0, m15, 0xaa ; t phaddw m1, m0 ; each word represent 8x2 pixels, adjacent word share 4x2 pixels @@ -594,7 +594,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w vpbroadcastd m13, xm13 movd xm12, vb_posd vpbroadcastd m12, xm12 - vpcmpeqd m13, m12 ; y == vb_pos + pcmpeqd m13, m12 ; y == vb_pos pandn m13, m11 ; y != vb_pos vpbroadcastd m14, [dw3] @@ -603,23 +603,23 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w pblendvb m3, m15, [gradq + sum_stride3q], m13 ; extent to dword to avoid overflow - vpunpcklwd m4, m0, m15 - vpunpckhwd m5, m0, m15 - vpunpcklwd m6, m1, m15 - vpunpckhwd m7, m1, m15 - vpunpcklwd m8, m2, m15 - vpunpckhwd m9, m2, m15 - vpunpcklwd m10, m3, m15 - vpunpckhwd m11, m3, m15 - - vpaddd m0, m4, m6 - vpaddd m1, m5, m7 - vpaddd m2, m8, m10 - vpaddd m3, m9, m11 + punpcklwd m4, m0, m15 + punpckhwd m5, m0, m15 + punpcklwd m6, m1, m15 + punpckhwd m7, m1, m15 + punpcklwd m8, m2, m15 + punpckhwd m9, m2, m15 + punpcklwd m10, m3, m15 + punpckhwd m11, m3, m15 + + paddd m0, m4, m6 + paddd m1, m5, m7 + paddd m2, m8, m10 + paddd m3, m9, m11 ; sum of the first row - vpaddd m0, m2 ; low - vpaddd m1, m3 ; high + paddd m0, m2 ; low + paddd m1, m3 ; high lea gradq, [gradq + 2 * sum_strideq] @@ -629,65 +629,65 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w movu m12, [gradq + 2 * sum_strideq] movu m13, [gradq + sum_stride3q] - vpunpcklwd m4, m10, m15 - vpunpckhwd m5, m10, m15 - vpunpcklwd m6, m11, m15 - vpunpckhwd m7, m11, m15 - vpunpcklwd m8, m12, m15 - vpunpckhwd m9, m12, m15 - vpunpcklwd m10, m13, m15 - vpunpckhwd m11, m13, m15 + punpcklwd m4, m10, m15 + punpckhwd m5, m10, m15 
+ punpcklwd m6, m11, m15 + punpckhwd m7, m11, m15 + punpcklwd m8, m12, m15 + punpckhwd m9, m12, m15 + punpcklwd m10, m13, m15 + punpckhwd m11, m13, m15 - vpaddd m2, m4, m6 - vpaddd m3, m5, m7 - vpaddd m4, m8, m10 - vpaddd m5, m9, m11 + paddd m2, m4, m6 + paddd m3, m5, m7 + paddd m4, m8, m10 + paddd m5, m9, m11 ; sum of the second row - vpaddd m2, m4 ; low - vpaddd m3, m5 ; high + paddd m2, m4 ; low + paddd m3, m5 ; high - vpunpckldq m4, m0, m2 - vpunpckhdq m5, m0, m2 - vpunpckldq m6, m1, m3 - vpunpckhdq m7, m1, m3 + punpckldq m4, m0, m2 + punpckhdq m5, m0, m2 + punpckldq m6, m1, m3 + punpckhdq m7, m1, m3 ; each dword represent 4x2 alf blocks ; the order is 01452367 - vpunpckldq m0, m4, m6 ; sum_v - vpunpckhdq m1, m4, m6 ; sum_h - vpunpckldq m2, m5, m7 ; sum_d0 - vpunpckhdq m3, m5, m7 ; sum_d1 + punpckldq m0, m4, m6 ; sum_v + punpckhdq m1, m4, m6 ; sum_h + punpckldq m2, m5, m7 ; sum_d0 + punpckhdq m3, m5, m7 ; sum_d1 - vpcmpgtd m4, m0, m1 ; dir_hv - 1 - vpmaxsd m5, m0, m1 ; hv1 - vpminsd m6, m0, m1 ; hv0 + pcmpgtd m4, m0, m1 ; dir_hv - 1 + pmaxsd m5, m0, m1 ; hv1 + pminsd m6, m0, m1 ; hv0 - vpaddd m0, m1; ; sum_hv + paddd m0, m1; ; sum_hv - vpcmpgtd m7, m2, m3 ; dir_d - 1 - vpmaxsd m8, m2, m3 ; d1 - vpminsd m9, m2, m3 ; d0 + pcmpgtd m7, m2, m3 ; dir_d - 1 + pmaxsd m8, m2, m3 ; d1 + pminsd m9, m2, m3 ; d0 ; *transpose_idx = dir_d * 2 + dir_hv; vpbroadcastd m10, [dw3] - vpaddd m11, m7, m7 - vpaddd m11, m4 - vpaddd m10, m11 + paddd m11, m7, m7 + paddd m11, m4 + paddd m10, m11 vpermq m10, m10, 11011000b SAVE_CLASSIFY_PARAM transpose_idx, 10 - vpsrlq m10, m8, 32 - vpsrlq m11, m6, 32 + psrlq m10, m8, 32 + psrlq m11, m6, 32 pmuldq m12, m10, m11 ; d1 * hv0 high - vpsrlq m1, m9, 32 - vpsrlq m2, m5, 32 + psrlq m1, m9, 32 + psrlq m2, m5, 32 pmuldq m3, m1, m2 ; d0 * hv1 high - vpcmpgtq m10, m12, m3 ; dir1 - 1 high + pcmpgtq m10, m12, m3 ; dir1 - 1 high pmuldq m1, m8, m6 ; d1 * hv0 low pmuldq m2, m9, m5 ; d0 * hv1 low - vpcmpgtq m1, m2 ; dir1 - 1 low + pcmpgtq m1, m2 ; dir1 - 1 
low vpblendd m1, m1, m10, 0xaa ; dir1 - 1 @@ -698,9 +698,9 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w vpbroadcastd m5, xm5 ;*class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)]; - vpmulld m0, m14 ; sum_hv * ac + pmulld m0, m14 ; sum_hv * ac vpsrlvd m0, m0, m5 - vpminsd m0, [dd15] + pminsd m0, [dd15] movu m6, [ARG_VAR_SHUFFE] pshufb m6, m0 ; class_idx @@ -716,7 +716,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w paddd m6, m7 ; class_idx paddd m8, m2, m2 - vpslld m9, m3, 3 + pslld m9, m3, 3 paddd m9, m3 pcmpgtd m8, m9 ; hvd1 * 2 > 9 * hvd0 pand m8, m10 -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next reply other threads:[~2024-05-21 13:53 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2024-05-21 13:52 James Almer [this message] 2024-05-21 16:29 ` [FFmpeg-devel] 回复: " Wu Jianhua 2024-05-22 12:58 ` Nuo Mi
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240521135256.3111-1-jamrial@gmail.com \
    --to=jamrial@gmail.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git