From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Date: Thu, 23 May 2024 14:35:55 +0200 Message-ID: <GV1P250MB073792BBA7747C78B536FA038FF42@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM> (raw) In-Reply-To: <20240523122716.2158-1-jamrial@gmail.com> James Almer: > Signed-off-by: James Almer <jamrial@gmail.com> > --- The commit message should explain what the advantage of this is. Particularly, what is the advantage of this over jumping in the function to based upon blocksize vs. selecting an appropriate function in the generic code (even when these functions turn out all the same as is for the C version). > libavcodec/vvc/dsp.h | 2 +- > libavcodec/vvc/inter.c | 6 ++++-- > libavcodec/vvc/inter_template.c | 6 +++++- > libavcodec/x86/vvc/vvc_sad.asm | 32 ++++++++++++++++++++++++++------ > libavcodec/x86/vvc/vvcdsp_init.c | 22 +++++++++++++++++----- > tests/checkasm/vvc_mc.c | 3 ++- > 6 files changed, 55 insertions(+), 16 deletions(-) > > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h > index 1f14096c41..55c4c81f53 100644 > --- a/libavcodec/vvc/dsp.h > +++ b/libavcodec/vvc/dsp.h > @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext { > > void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h); > > - int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); > + int (*sad[5])(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); > void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height, > intptr_t mx, intptr_t my, int width); > } VVCInterDSPContext; > diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c > index e1011b4fa1..0214e46634 100644 > --- a/libavcodec/vvc/inter.c > +++ b/libavcodec/vvc/inter.c > @@ -740,6 +740,8 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv, > const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h) > { > const VVCFrameContext *fc = lc->fc; > + static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 }; > + const int tab = sad_tab[(FFALIGN(block_w, 8) >> 3) - 1]; > const int sr_range = 2; > const AVFrame *ref[] = { ref0, ref1 }; > int16_t *tmp[] = { lc->tmp, lc->tmp1 }; > @@ -763,7 +765,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv, > fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w); > } > > - min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h); > + min_sad = fc->vvcdsp.inter.sad[tab](tmp[L0], tmp[L1], dx, dy, block_w, block_h); > min_sad -= min_sad >> 2; > sad[dy][dx] = min_sad; > > @@ -773,7 +775,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv, > for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) { > for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) { > if (dx != sr_range || dy != sr_range) { > - sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h); > + sad[dy][dx] = fc->vvcdsp.inter.sad[tab](lc->tmp, lc->tmp1, dx, dy, block_w, block_h); > if (sad[dy][dx] < min_sad) { > min_sad = sad[dy][dx]; > min_dx = dx; > diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c > index a8068f4ba8..34485321d3 100644 > --- a/libavcodec/vvc/inter_template.c > +++ b/libavcodec/vvc/inter_template.c > @@ -626,7 +626,11 @@ static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter) > inter->apply_prof_uni_w = FUNC(apply_prof_uni_w); > inter->apply_bdof = FUNC(apply_bdof); > inter->prof_grad_filter = FUNC(prof_grad_filter); > - inter->sad = vvc_sad; > + inter->sad[0] = > + inter->sad[1] = > + inter->sad[2] = > + inter->sad[3] = > + inter->sad[4] = vvc_sad; > } > > #undef FUNCS > diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm > index b468d89ac2..a20818530f 100644 > --- a/libavcodec/x86/vvc/vvc_sad.asm > +++ b/libavcodec/x86/vvc/vvc_sad.asm > @@ -51,7 +51,7 @@ SECTION .text > > INIT_YMM avx2 > > -cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx > +cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx > movsxdifnidn dxq, dxd > movsxdifnidn dyq, dyd > > @@ -76,10 +76,6 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_ > pxor m3, m3 > vpbroadcastd m4, [pw_1] > > - cmp block_wd, 16 > - jge vvc_sad_16_128 > - > - vvc_sad_8: > .loop_height: > movu xm0, [src1q] > vinserti128 m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1 > @@ -100,7 +96,31 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_ > movd eax, xm0 > RET > > - vvc_sad_16_128: > +cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx > + movsxdifnidn dxq, dxd > + movsxdifnidn dyq, dyd > + > + sub dxq, 2 > + sub dyq, 2 > + > + mov off1q, 2 > + mov off2q, 2 > + > + add off1q, dyq > + sub off2q, dyq > + > + shl off1q, 7 > + shl off2q, 7 > + > + add off1q, dxq > + sub off2q, dxq > + > + lea src1q, [src1q + off1q * 2 + 2 * 2] > + lea src2q, [src2q + off2q * 2 + 2 * 2] > + > + pxor m3, m3 > + vpbroadcastd m4, [pw_1] > + > sar block_wd, 4 > .loop_height: > mov off1q, src1q > diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c > index 4b4a2aa937..bd60963432 100644 > --- a/libavcodec/x86/vvc/vvcdsp_init.c > +++ b/libavcodec/x86/vvc/vvcdsp_init.c > @@ -312,8 +312,20 @@ ALF_FUNCS(16, 12, avx2) > c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \ > } while (0) > > -int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); > -#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2 > +#define SAD_PROTOTYPE(w, opt) \ > +int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \ > + int dx, int dy, int block_w, int block_h) \ > + > +SAD_PROTOTYPE(8, avx2); > +SAD_PROTOTYPE(16, avx2); > + > +#define SAD_INIT(opt) do { \ > + c->inter.sad[0] = ff_vvc_sad_8_##opt; \ > + c->inter.sad[1] = \ > + c->inter.sad[2] = \ > + c->inter.sad[3] = \ > + c->inter.sad[4] = ff_vvc_sad_16_##opt; \ > +} while (0) > #endif > > void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) > @@ -330,7 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) > ALF_INIT(8); > AVG_INIT(8, avx2); > MC_LINKS_AVX2(8); > - SAD_INIT(); > + SAD_INIT(avx2); > } > break; > case 10: > @@ -342,7 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) > AVG_INIT(10, avx2); > MC_LINKS_AVX2(10); > MC_LINKS_16BPC_AVX2(10); > - SAD_INIT(); > + SAD_INIT(avx2); > } > break; > case 12: > @@ -354,7 +366,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) > AVG_INIT(12, avx2); > MC_LINKS_AVX2(12); > MC_LINKS_16BPC_AVX2(12); > - SAD_INIT(); > + SAD_INIT(avx2); > } > break; > default: > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c > index 1e889e2cff..deae1014d2 100644 > --- a/tests/checkasm/vvc_mc.c > +++ b/tests/checkasm/vvc_mc.c > @@ -327,6 +327,7 @@ static void check_avg(void) > static void check_vvc_sad(void) > { > const int bit_depth = 10; > + static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 }; > VVCDSPContext c; > LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]); > LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]); > @@ -341,7 +342,7 @@ static void check_vvc_sad(void) > for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) { > for(int offy = 0; offy <= 4; offy++) { > for(int offx = 0; offx <= 4; offx++) { > - if(check_func(c.inter.sad, "sad_%dx%d", w, h)) { > + if(check_func(c.inter.sad[sad_tab[(w >> 3) - 1]], "sad_%dx%d", w, h)) { > int result0; > int result1; > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2024-05-23 12:36 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2024-05-23 12:27 James Almer 2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer 2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer 2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer 2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer 2024-05-23 12:35 ` Andreas Rheinhardt [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=GV1P250MB073792BBA7747C78B536FA038FF42@GV1P250MB0737.EURP250.PROD.OUTLOOK.COM \ --to=andreas.rheinhardt@outlook.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git