From: 殷时友 <yinshiyou-hf@loongson.cn> To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Subject: Re: [FFmpeg-devel] [PATCH 1/3] avcodec: [loongarch] Optimize hpeldsp with LASX. Date: Wed, 29 Dec 2021 17:38:37 +0800 Message-ID: <A63996FD-D184-4322-BF9C-D1906DE20221@loongson.cn> (raw) In-Reply-To: <20211224094922.32762-2-chenhao@loongson.cn> > 2021年12月24日 下午5:49,Hao Chen <chenhao@loongson.cn> 写道: > > From: Shiyou Yin <yinshiyou-hf@loongson.cn> > > ./ffmpeg -i 8_mpeg4_1080p_24fps_12Mbps.avi -f rawvideo -y /dev/null -an > before:376fps > after :433fps > > Change-Id: Ic8018562093154887323b508b81d0f489c0d265d > Signed-off-by: Hao Chen <chenhao@loongson.cn> > --- > libavcodec/hpeldsp.c | 2 + > libavcodec/hpeldsp.h | 1 + > libavcodec/loongarch/Makefile | 2 + > libavcodec/loongarch/hpeldsp_init_loongarch.c | 50 + > libavcodec/loongarch/hpeldsp_lasx.c | 1289 +++++++++++++++++ > libavcodec/loongarch/hpeldsp_lasx.h | 58 + > 6 files changed, 1402 insertions(+) > create mode 100644 libavcodec/loongarch/hpeldsp_init_loongarch.c > create mode 100644 libavcodec/loongarch/hpeldsp_lasx.c > create mode 100644 libavcodec/loongarch/hpeldsp_lasx.h > > > diff --git a/libavcodec/loongarch/hpeldsp_lasx.c b/libavcodec/loongarch/hpeldsp_lasx.c > new file mode 100644 > index 0000000000..07f0c4b517 > --- /dev/null > +++ b/libavcodec/loongarch/hpeldsp_lasx.c > @@ -0,0 +1,1289 @@ > > +void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels, > + ptrdiff_t line_size, int h) > +{ > + if (h == 16) { > + common_vt_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size); > + } else if (h == 8) { > + common_vt_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size); > + } > +} > + > +static void common_hv_bil_no_rnd_16x16_lasx(const uint8_t *src, > + int32_t src_stride, > + uint8_t *dst, int32_t dst_stride) > +{ > + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; > + __m256i src10, src11, src12, src13, src14, src15, src16, src17; > + 
__m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; > + int32_t src_stride_2x = src_stride << 1; > + int32_t src_stride_4x = src_stride << 2; > + int32_t src_stride_3x = src_stride_2x + src_stride; > + uint8_t* _src = (uint8_t*)src; > + > + src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, > + src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, > + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, > + src8, src9); > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = __lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, 
src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; > + > + src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + 
DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, src6, 0x02, > + src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, src14, 0x02, > + src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9); > + > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = __lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); 
> + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; The last line is not needed. > +} > + > +static void common_hv_bil_no_rnd_8x16_lasx(const uint8_t *src, > + int32_t src_stride, > + uint8_t *dst, int32_t dst_stride) > +{ > + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; > + __m256i src10, src11, src12, src13, src14, src15, src16, src17; > + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; > + int32_t src_stride_2x = src_stride << 1; > + int32_t src_stride_4x = src_stride << 2; > + int32_t src_stride_3x = src_stride_2x + src_stride; > + uint8_t* _src = (uint8_t*)src; > + > + src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, > + 
src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, > + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, src8, src9); > + > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = __lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); 
> + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; This line is not needed either. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2021-12-29 9:38 UTC|newest] Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-12-24 9:49 [FFmpeg-devel] Optimize Mpeg4 decoding for loongarch Hao Chen 2021-12-24 9:49 ` [FFmpeg-devel] [PATCH 1/3] avcodec: [loongarch] Optimize hpeldsp with LASX Hao Chen 2021-12-29 9:38 ` 殷时友 [this message] 2021-12-24 9:49 ` [FFmpeg-devel] [PATCH 2/3] avcodec: [loongarch] Optimize idctdstp " Hao Chen 2021-12-24 9:49 ` [FFmpeg-devel] [PATCH 3/3] avcodec: [loongarch] Optimize prefetch with loongarch Hao Chen
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=A63996FD-D184-4322-BF9C-D1906DE20221@loongson.cn \ --to=yinshiyou-hf@loongson.cn \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git