From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> To: ffmpeg-devel@ffmpeg.org Cc: Shiyou Yin <yinshiyou-hf@loongson.cn>, Lu Wang <wanglu@loongson.cn>, Hao Chen <chenhao@loongson.cn>, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> Subject: [FFmpeg-devel] [PATCH v5 3/4] avcodec/loongarch/h264chroma, vc1dsp_lasx: Add wrapper for __lasx_xvldx Date: Tue, 2 Aug 2022 02:23:11 +0200 Message-ID: <DB6PR0101MB22140F4C8EAEDD5607A4490B8F9D9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com> (raw) In-Reply-To: <DB6PR0101MB2214CF7EDAF942C491CA75C98F9D9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com> __lasx_xvldx does not accept a pointer to const (in fact, no function in lasxintrin.h does so), although it is not allowed to modify the pointed-to buffer. Therefore this commit adds a wrapper for it in order to constify the H264Chroma API in a later commit. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/loongarch/h264chroma_lasx.c | 94 ++++++++++++++------------ libavcodec/loongarch/vc1dsp_lasx.c | 20 +++--- 2 files changed, 61 insertions(+), 53 deletions(-) diff --git a/libavcodec/loongarch/h264chroma_lasx.c b/libavcodec/loongarch/h264chroma_lasx.c index 824a78dfc8..bada8bb5ed 100644 --- a/libavcodec/loongarch/h264chroma_lasx.c +++ b/libavcodec/loongarch/h264chroma_lasx.c @@ -26,6 +26,10 @@ #include "libavutil/avassert.h" #include "libavutil/loongarch/loongson_intrinsics.h" +/* __lasx_xvldx() in lasxintrin.h does not accept a const void*; + * remove the following once it does. */ +#define LASX_XVLDX(ptr, stride) __lasx_xvldx((void*)ptr, stride) + static const uint8_t chroma_mask_arr[64] = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, @@ -51,7 +55,7 @@ static av_always_inline void avc_chroma_hv_8x4_lasx(uint8_t *src, uint8_t *dst, __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src1, src2, src3, src4); DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); src0 = __lasx_xvshuf_b(src0, src0, mask); @@ -91,10 +95,10 @@ static av_always_inline void avc_chroma_hv_8x8_lasx(uint8_t *src, uint8_t *dst, __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src1, src2, src3, src4); src += stride_4x; - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src5, src6, src7, src8); DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, src8, src7, 0x20, src1, src3, src5, src7); @@ -141,8 +145,8 @@ static av_always_inline void avc_chroma_hz_8x4_lasx(uint8_t *src, uint8_t *dst, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2); - src3 = __lasx_xvldx(src, stride_3x); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src1, src2); + src3 = LASX_XVLDX(src, stride_3x); DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); @@ -170,11 +174,11 @@ static av_always_inline void avc_chroma_hz_8x8_lasx(uint8_t *src, uint8_t *dst, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src1, src2, src3, src4); src += stride_4x; - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6); - src7 = __lasx_xvldx(src, stride_3x); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src5, src6); + src7 = LASX_XVLDX(src, stride_3x); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, src7, src6, 0x20, src0, src2, src4, src6); DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask, @@ -212,7 +216,7 @@ static av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); for (row = height >> 2; row--;) { - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src0, src1, src2, src3); src += stride_4x; DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); @@ -228,7 +232,7 @@ static av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src, if ((height & 3)) { src0 = __lasx_xvld(src, 0); - src1 = __lasx_xvldx(src, stride); + src1 = LASX_XVLDX(src, stride); src1 = __lasx_xvpermi_q(src1, src0, 0x20); src0 = __lasx_xvshuf_b(src1, src1, mask); res0 = __lasx_xvdp2_h_bu(src0, coeff_vec); @@ -253,7 +257,7 @@ static av_always_inline void avc_chroma_vt_8x4_lasx(uint8_t *src, uint8_t *dst, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); src0 = __lasx_xvld(src, 0); src += stride; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src1, src2, src3, src4); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, src4, src3, 0x20, src0, src1, src2, src3); @@ -282,10 +286,10 @@ static av_always_inline void avc_chroma_vt_8x8_lasx(uint8_t *src, uint8_t *dst, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); src0 = __lasx_xvld(src, 0); src += stride; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src1, src2, src3, src4); src += stride_4x; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src5, src6, src7, src8); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, src4, src3, 0x20, src0, src1, src2, src3); @@ -402,7 +406,7 @@ static void avc_chroma_hv_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vt_vec = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride_2, src1, src2); DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1); src0 = __lasx_xvpermi_q(src0, src1, 0x02); res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec); @@ -431,7 +435,7 @@ static void avc_chroma_hv_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src1, src2, src3, src4); DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, src4, src3, mask, src0, src1, src2, src3); @@ -464,10 +468,10 @@ static void avc_chroma_hv_4x8_lasx(uint8_t *src, uint8_t * dst, ptrdiff_t stride __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src1, src2, src3, src4); src += stride_4; - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src5, src6, src7, src8); DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask, src4, src3, mask, src0, src1, src2, src3); @@ -519,7 +523,7 @@ static void avc_chroma_hz_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - src1 = __lasx_xvldx(src, stride); + src1 = LASX_XVLDX(src, stride); src0 = __lasx_xvshuf_b(src1, src0, mask); res = __lasx_xvdp2_h_bu(src0, coeff_vec); res = __lasx_xvslli_h(res, 3); @@ -540,8 +544,8 @@ static void avc_chroma_hz_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2); - src3 = __lasx_xvldx(src, stride_3); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride_2, src1, src2); + src3 = LASX_XVLDX(src, stride_3); DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2); src0 = __lasx_xvpermi_q(src0, src2, 0x02); res = __lasx_xvdp2_h_bu(src0, coeff_vec); @@ -567,11 +571,11 @@ static void avc_chroma_hz_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src1, src2, src3, src4); src += stride_4; - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6); - src7 = __lasx_xvldx(src, stride_3); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride_2, src5, src6); + src7 = LASX_XVLDX(src, stride_3); DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, src7, src6, mask, src0, src2, src4, src6); DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4); @@ -625,7 +629,7 @@ static void avc_chroma_vt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); src0 = __lasx_xvld(src, 0); - DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2); + DUP2_ARG2(LASX_XVLDX, src, stride, src, stride << 1, src1, src2); DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1); tmp0 = __lasx_xvilvl_d(tmp1, tmp0); res = __lasx_xvdp2_h_bu(tmp0, coeff_vec); @@ -649,7 +653,7 @@ static void avc_chroma_vt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1); src0 = __lasx_xvld(src, 0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src1, src2, src3, src4); DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, tmp3); @@ -679,10 +683,10 @@ static void avc_chroma_vt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); src0 = __lasx_xvld(src, 0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src1, src2, src3, src4); src += stride_4; - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2, src, stride_3, src, stride_4, src5, src6, src7, src8); DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2, tmp3); @@ -860,7 +864,7 @@ static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src, __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1); DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src1, src2, src3, src4); DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3); src0 = __lasx_xvshuf_b(src0, src0, mask); @@ -874,7 +878,7 @@ static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src, res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1); res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1); out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); @@ -907,10 +911,10 @@ static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src, DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0); src += stride; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src1, src2, src3, src4); src += stride_4x; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src5, src6, src7, src8); DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20, src8, src7, 0x20, src1, src3, src5, src7); @@ -934,12 +938,12 @@ static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src, res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1); DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); dst += stride_4x; - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); dst -= stride_4x; DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); @@ -973,13 +977,13 @@ static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(uint8_t *src, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); mask = __lasx_xvld(chroma_mask_arr, 0); - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src0, src1, src2, src3); DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2); DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2); DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); out = __lasx_xvssrarni_bu_h(res1, res0, 6); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); @@ -1008,10 +1012,10 @@ static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); mask = __lasx_xvld(chroma_mask_arr, 0); - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src0, src1, src2, src3); src += stride_4x; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src4, src5, src6, src7); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20, src7, src6, 0x20, src0, src2, src4, src6); @@ -1020,12 +1024,12 @@ static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src, DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, coeff_vec, res0, res1, res2, res3); DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); dst += stride_4x; - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); dst -= stride_4x; DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); @@ -1059,14 +1063,14 @@ static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(uint8_t *src, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); src0 = __lasx_xvld(src, 0); - DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, + DUP4_ARG2(LASX_XVLDX, src, stride, src, stride_2x, src, stride_3x, src, stride_4x, src1, src2, src3, src4); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, src4, src3, 0x20, src0, src1, src2, src3); DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2); DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1); out = __lasx_xvssrarni_bu_h(res1, res0, 6); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20); @@ -1095,10 +1099,10 @@ static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src, coeff_vec = __lasx_xvslli_b(coeff_vec, 3); src0 = __lasx_xvld(src, 0); src += stride; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src1, src2, src3, src4); src += stride_4x; - DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x, + DUP4_ARG2(LASX_XVLDX, src, 0, src, stride, src, stride_2x, src, stride_3x, src5, src6, src7, src8); DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20, src4, src3, 0x20, src0, src1, src2, src3); @@ -1109,12 +1113,12 @@ static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src, DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6, coeff_vec, res0, res1, res2, res3); DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1); - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20); dst += stride_4x; - DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, + DUP4_ARG2(LASX_XVLDX, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x, tp0, tp1, tp2, tp3); dst -= stride_4x; DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2); diff --git a/libavcodec/loongarch/vc1dsp_lasx.c b/libavcodec/loongarch/vc1dsp_lasx.c index 40b8668f2b..63950bc076 100644 --- a/libavcodec/loongarch/vc1dsp_lasx.c +++ b/libavcodec/loongarch/vc1dsp_lasx.c @@ -22,6 +22,10 @@ #include "vc1dsp_loongarch.h" #include "libavutil/loongarch/loongson_intrinsics.h" +/* __lasx_xvldx() in lasxintrin.h does not accept a const void*; + * remove the following once it does. */ +#define LASX_XVLDX(ptr, stride) __lasx_xvldx((void*)ptr, stride) + void ff_vc1_inv_trans_8x8_lasx(int16_t block[64]) { int32_t con_4 = 4; @@ -831,20 +835,20 @@ static void put_vc1_mspel_mc_h_lasx(uint8_t *dst, const uint8_t *src, const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1)); in0 = __lasx_xvld(_src, 0); - DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in1, in2); - in3 = __lasx_xvldx(_src, stride3); + DUP2_ARG2(LASX_XVLDX, _src, stride, _src, stride2, in1, in2); + in3 = LASX_XVLDX(_src, stride3); _src += stride4; in4 = __lasx_xvld(_src, 0); - DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in5, in6); - in7 = __lasx_xvldx(_src, stride3); + DUP2_ARG2(LASX_XVLDX, _src, stride, _src, stride2, in5, in6); + in7 = LASX_XVLDX(_src, stride3); _src += stride4; in8 = __lasx_xvld(_src, 0); - DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in9, in10); - in11 = __lasx_xvldx(_src, stride3); + DUP2_ARG2(LASX_XVLDX, _src, stride, _src, stride2, in9, in10); + in11 = LASX_XVLDX(_src, stride3); _src += stride4; in12 = __lasx_xvld(_src, 0); - DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in13, in14); - in15 = __lasx_xvldx(_src, stride3); + DUP2_ARG2(LASX_XVLDX, _src, stride, _src, stride2, in13, in14); + in15 = LASX_XVLDX(_src, stride3); DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, tmp3_m); DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13, -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2022-08-02 0:24 UTC|newest] Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-08-02 0:19 [FFmpeg-devel] [PATCH v5 1/4] avcodec/loongarch: Add wrapper for __lsx_vldx Andreas Rheinhardt 2022-08-02 0:23 ` [FFmpeg-devel] [PATCH v5 2/4] avcodec/hevcdsp: Constify src pointers Andreas Rheinhardt 2022-08-02 0:23 ` Andreas Rheinhardt [this message] 2022-08-02 0:23 ` [FFmpeg-devel] [PATCH v5 4/4] avcodec/h264chroma: Constify src in h264_chroma_mc_func Andreas Rheinhardt 2022-08-04 1:26 ` [FFmpeg-devel] [PATCH v5 1/4] avcodec/loongarch: Add wrapper for __lsx_vldx Andreas Rheinhardt
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=DB6PR0101MB22140F4C8EAEDD5607A4490B8F9D9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com \ --to=andreas.rheinhardt@outlook.com \ --cc=chenhao@loongson.cn \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=wanglu@loongson.cn \ --cc=yinshiyou-hf@loongson.cn \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git