* [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test @ 2023-11-15 18:02 Rémi Denis-Courmont 2023-11-15 18:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V LPC32 Rémi Denis-Courmont 2023-11-15 19:14 ` [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test James Almer 0 siblings, 2 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2023-11-15 18:02 UTC (permalink / raw) To: ffmpeg-devel --- tests/checkasm/flacdsp.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c index 51a0e0060b..4d69cbe507 100644 --- a/tests/checkasm/flacdsp.c +++ b/tests/checkasm/flacdsp.c @@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t **ref_src, uint8_t **ne bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / sizeof(int32_t), 8); } +static void check_lpc(FLACDSPContext *c, int pred_order) +{ + int qlevel = rnd() % 16; + LOCAL_ALIGNED_16(int32_t, coeffs, [32]); + LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]); + LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]); + + declare_func(void, int32_t *, const int[32], int, int, int); + + for (int i = 0; i < 32; i++) + coeffs[i] = rnd(); + for (int i = 0; i < BUF_SIZE; i++) + dst0[i] = rnd(); + + memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t)); + call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE); + call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); + if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0) + fail(); + bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); +} + void checkasm_check_flacdsp(void) { LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]); @@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void) { AV_SAMPLE_FMT_S16, 16 }, { AV_SAMPLE_FMT_S32, 32 }, }; + static const signed char pred_orders[] = { 13, 16, 29, 32 }; FLACDSPContext h; int i, j; @@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void) } report("decorrelate"); + + for (int i = 0; i < sizeof (pred_orders); i++) { + if 
(check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i])) + check_lpc(&h, pred_orders[i]); + if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i])) + check_lpc(&h, pred_orders[i]); + } + + report("lpc"); } -- 2.42.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* [FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V LPC32 2023-11-15 18:02 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test Rémi Denis-Courmont @ 2023-11-15 18:02 ` Rémi Denis-Courmont 2023-11-15 19:14 ` [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test James Almer 1 sibling, 0 replies; 5+ messages in thread From: Rémi Denis-Courmont @ 2023-11-15 18:02 UTC (permalink / raw) To: ffmpeg-devel The entire set of 32 coefficients and corresponding past 32 samples can fit in a single vector (with LMUL=8) exactly, but... since widening doubles the needed vector sizes, we still end up too short with 128-bit vectors. This adds a very simple version for future 256+-bit hardware, and for pred_orders values up to 16, and a bit more involved loop for 128-bit hardware with pred_orders between 17 and 32. With 128-bit hardware, the benchmarks look like this: flac_lpc_32_13_c: 30152.0 flac_lpc_32_13_rvv_i32: 10244.7 flac_lpc_32_16_c: 37314.2 flac_lpc_32_16_rvv_i32: 10126.2 flac_lpc_32_29_c: 61910.0 flac_lpc_32_29_rvv_i32: 14495.2 flac_lpc_32_32_c: 68204.0 flac_lpc_32_32_rvv_i32: 13273.7 --- libavcodec/riscv/flacdsp_init.c | 12 +++++++ libavcodec/riscv/flacdsp_rvv.S | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c index 73d431cb77..f60f98ea31 100644 --- a/libavcodec/riscv/flacdsp_init.c +++ b/libavcodec/riscv/flacdsp_init.c @@ -22,8 +22,13 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/riscv/cpu.h" #include "libavcodec/flacdsp.h" +void ff_flac_lpc32_rvv(int32_t *decoded, const int coeffs[32], + int pred_order, int qlevel, int len); +void ff_flac_lpc32_rvv_simple(int32_t *decoded, const int coeffs[32], + int pred_order, int qlevel, int len); void ff_flac_decorrelate_indep2_16_rvv(uint8_t **out, int32_t **in, int channels, int len, int shift); void ff_flac_decorrelate_indep4_16_rvv(uint8_t **out, int32_t **in, @@ -60,6 +65,13 @@ 
av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt, int flags = av_get_cpu_flags(); if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) { + int vlenb = ff_get_rv_vlenb(); + + if (vlenb == 16) + c->lpc32 = ff_flac_lpc32_rvv; + else if (vlenb > 16) + c->lpc32 = ff_flac_lpc32_rvv_simple; + switch (fmt) { case AV_SAMPLE_FMT_S16: switch (channels) { diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S index 12b456f7da..b1724f5500 100644 --- a/libavcodec/riscv/flacdsp_rvv.S +++ b/libavcodec/riscv/flacdsp_rvv.S @@ -21,6 +21,63 @@ #include "libavutil/riscv/asm.S" #if (__riscv_xlen == 64) +func ff_flac_lpc32_rvv, zve32x + addi t2, a2, -16 + ble t2, zero, ff_flac_lpc32_rvv_simple + vsetivli zero, 1, e64, m1, ta, ma + vmv.s.x v0, zero + vsetvli zero, a2, e32, m8, ta, ma + vle32.v v8, (a1) + sub a4, a4, a2 + vle32.v v16, (a0) + sh2add a0, a2, a0 +1: + vsetvli zero, a2, e32, m4, ta, ma + vwmul.vv v24, v8, v16 + vsetvli zero, t2, e32, m4, tu, ma + vwmacc.vv v24, v12, v20 + vsetvli zero, a2, e64, m8, ta, ma + vredsum.vs v24, v24, v0 + lw t0, (a0) + addi a4, a4, -1 + vmv.x.s t1, v24 + vsetvli zero, a2, e32, m8, ta, ma + sra t1, t1, a3 + add t0, t0, t1 + vslide1down.vx v16, v16, t0 + sw t0, (a0) + addi a0, a0, 4 + bnez a4, 1b + + ret +endfunc + +func ff_flac_lpc32_rvv_simple, zve32x + vsetivli zero, 1, e64, m1, ta, ma + vmv.s.x v0, zero + vsetvli zero, a2, e32, m4, ta, ma + vle32.v v8, (a1) + sub a4, a4, a2 + vle32.v v16, (a0) + sh2add a0, a2, a0 +1: + vwmul.vv v24, v8, v16 + vsetvli zero, zero, e64, m8, ta, ma + vredsum.vs v24, v24, v0 + lw t0, (a0) + addi a4, a4, -1 + vmv.x.s t1, v24 + vsetvli zero, zero, e32, m4, ta, ma + sra t1, t1, a3 + add t0, t0, t1 + vslide1down.vx v16, v16, t0 + sw t0, (a0) + addi a0, a0, 4 + bnez a4, 1b + + ret +endfunc + func ff_flac_decorrelate_indep2_16_rvv, zve32x ld a0, (a0) ld a2, 8(a1) -- 2.42.0 _______________________________________________ ffmpeg-devel mailing list 
ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test 2023-11-15 18:02 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test Rémi Denis-Courmont 2023-11-15 18:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V LPC32 Rémi Denis-Courmont @ 2023-11-15 19:14 ` James Almer 2023-11-15 19:19 ` Rémi Denis-Courmont 1 sibling, 1 reply; 5+ messages in thread From: James Almer @ 2023-11-15 19:14 UTC (permalink / raw) To: ffmpeg-devel On 11/15/2023 3:02 PM, Rémi Denis-Courmont wrote: > --- > tests/checkasm/flacdsp.c | 32 ++++++++++++++++++++++++++++++++ > 1 file changed, 32 insertions(+) > > diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c > index 51a0e0060b..4d69cbe507 100644 > --- a/tests/checkasm/flacdsp.c > +++ b/tests/checkasm/flacdsp.c > @@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t **ref_src, uint8_t **ne > bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / sizeof(int32_t), 8); > } > > +static void check_lpc(FLACDSPContext *c, int pred_order) c is unused. > +{ > + int qlevel = rnd() % 16; > + LOCAL_ALIGNED_16(int32_t, coeffs, [32]); > + LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]); > + LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]); > + > + declare_func(void, int32_t *, const int[32], int, int, int); > + > + for (int i = 0; i < 32; i++) > + coeffs[i] = rnd(); > + for (int i = 0; i < BUF_SIZE; i++) > + dst0[i] = rnd(); > + > + memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t)); > + call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE); > + call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); > + if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0) > + fail(); > + bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); Not sure if it matters, but dst1 is already trashed by call_new(). 
> +} > + > void checkasm_check_flacdsp(void) > { > LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]); > @@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void) > { AV_SAMPLE_FMT_S16, 16 }, > { AV_SAMPLE_FMT_S32, 32 }, > }; > + static const signed char pred_orders[] = { 13, 16, 29, 32 }; > FLACDSPContext h; > int i, j; > > @@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void) > } > > report("decorrelate"); > + > + for (int i = 0; i < sizeof (pred_orders); i++) { i is already defined. Also, use FF_ARRAY_ELEMS(pred_orders), so it doesn't depend on char being 1 byte. > + if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i])) > + check_lpc(&h, pred_orders[i]); > + if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i])) > + check_lpc(&h, pred_orders[i]); > + } > + > + report("lpc"); > } LGTM otherwise. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test 2023-11-15 19:14 ` [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test James Almer @ 2023-11-15 19:19 ` Rémi Denis-Courmont 2023-11-15 19:23 ` James Almer 0 siblings, 1 reply; 5+ messages in thread From: Rémi Denis-Courmont @ 2023-11-15 19:19 UTC (permalink / raw) To: ffmpeg-devel Le keskiviikkona 15. marraskuuta 2023, 21.14.26 EET James Almer a écrit : > On 11/15/2023 3:02 PM, Rémi Denis-Courmont wrote: > > --- > > > > tests/checkasm/flacdsp.c | 32 ++++++++++++++++++++++++++++++++ > > 1 file changed, 32 insertions(+) > > > > diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c > > index 51a0e0060b..4d69cbe507 100644 > > --- a/tests/checkasm/flacdsp.c > > +++ b/tests/checkasm/flacdsp.c > > @@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, > > uint8_t **ref_src, uint8_t **ne> > > bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / > > sizeof(int32_t), 8);> > > } > > > > +static void check_lpc(FLACDSPContext *c, int pred_order) > > c is unused. > > > +{ > > + int qlevel = rnd() % 16; > > + LOCAL_ALIGNED_16(int32_t, coeffs, [32]); > > + LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]); > > + LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]); > > + > > + declare_func(void, int32_t *, const int[32], int, int, int); > > + > > + for (int i = 0; i < 32; i++) > > + coeffs[i] = rnd(); > > + for (int i = 0; i < BUF_SIZE; i++) > > + dst0[i] = rnd(); > > + > > + memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t)); > > + call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE); > > + call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); > > + if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0) > > + fail(); > > + bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); > > Not sure if it matters, but dst1 is already trashed by call_new(). Yeah I know. I could allocate a third buffer. 
AFAICT, the only parameter that should affect the benchmarks is pred-order (which indeed affects the result on both x86 and RVV). So that the extra code to preserve dst seemed pointless? > > > +} > > + > > > > void checkasm_check_flacdsp(void) > > { > > > > LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]); > > > > @@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void) > > > > { AV_SAMPLE_FMT_S16, 16 }, > > { AV_SAMPLE_FMT_S32, 32 }, > > > > }; > > > > + static const signed char pred_orders[] = { 13, 16, 29, 32 }; > > > > FLACDSPContext h; > > int i, j; > > > > @@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void) > > > > } > > > > report("decorrelate"); > > > > + > > + for (int i = 0; i < sizeof (pred_orders); i++) { > > i is already defined. Also, use FF_ARRAY_ELEMS(pred_orders), so it > doesn't depend on char being 1 byte. > > > + if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i])) > > + check_lpc(&h, pred_orders[i]); > > + if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i])) > > + check_lpc(&h, pred_orders[i]); > > + } > > + > > + report("lpc"); > > > > } > > LGTM otherwise. > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test 2023-11-15 19:19 ` Rémi Denis-Courmont @ 2023-11-15 19:23 ` James Almer 0 siblings, 0 replies; 5+ messages in thread From: James Almer @ 2023-11-15 19:23 UTC (permalink / raw) To: ffmpeg-devel On 11/15/2023 4:19 PM, Rémi Denis-Courmont wrote: > Le keskiviikkona 15. marraskuuta 2023, 21.14.26 EET James Almer a écrit : >> On 11/15/2023 3:02 PM, Rémi Denis-Courmont wrote: >>> --- >>> >>> tests/checkasm/flacdsp.c | 32 ++++++++++++++++++++++++++++++++ >>> 1 file changed, 32 insertions(+) >>> >>> diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c >>> index 51a0e0060b..4d69cbe507 100644 >>> --- a/tests/checkasm/flacdsp.c >>> +++ b/tests/checkasm/flacdsp.c >>> @@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, >>> uint8_t **ref_src, uint8_t **ne> >>> bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / >>> sizeof(int32_t), 8);> >>> } >>> >>> +static void check_lpc(FLACDSPContext *c, int pred_order) >> >> c is unused. >> >>> +{ >>> + int qlevel = rnd() % 16; >>> + LOCAL_ALIGNED_16(int32_t, coeffs, [32]); >>> + LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]); >>> + LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]); >>> + >>> + declare_func(void, int32_t *, const int[32], int, int, int); >>> + >>> + for (int i = 0; i < 32; i++) >>> + coeffs[i] = rnd(); >>> + for (int i = 0; i < BUF_SIZE; i++) >>> + dst0[i] = rnd(); >>> + >>> + memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t)); >>> + call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE); >>> + call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); >>> + if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0) >>> + fail(); >>> + bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE); >> >> Not sure if it matters, but dst1 is already trashed by call_new(). > > Yeah I know. I could allocate a third buffer. AFAICT, the only parameter that > should affect the benchmarks is pred-order (which indeed affects the result on > both x86 and RVV). 
So that the extra code to preserve dst seemed pointless? I guess it's pointless in this case, yeah, but I know that some other dsp functions ended up with different benchmark results if the contents of a buffer were bogus (Which afaik is why when being filled with rnd() they are also clipped with a mask). > >> >>> +} >>> + >>> >>> void checkasm_check_flacdsp(void) >>> { >>> >>> LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]); >>> >>> @@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void) >>> >>> { AV_SAMPLE_FMT_S16, 16 }, >>> { AV_SAMPLE_FMT_S32, 32 }, >>> >>> }; >>> >>> + static const signed char pred_orders[] = { 13, 16, 29, 32 }; >>> >>> FLACDSPContext h; >>> int i, j; >>> >>> @@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void) >>> >>> } >>> >>> report("decorrelate"); >>> >>> + >>> + for (int i = 0; i < sizeof (pred_orders); i++) { >> >> i is already defined. Also, use FF_ARRAY_ELEMS(pred_orders), so it >> doesn't depend on char being 1 byte. >> >>> + if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i])) >>> + check_lpc(&h, pred_orders[i]); >>> + if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i])) >>> + check_lpc(&h, pred_orders[i]); >>> + } >>> + >>> + report("lpc"); >>> >>> } >> >> LGTM otherwise. >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-11-15 19:23 UTC | newest] Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-11-15 18:02 [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test Rémi Denis-Courmont 2023-11-15 18:02 ` [FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V LPC32 Rémi Denis-Courmont 2023-11-15 19:14 ` [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test James Almer 2023-11-15 19:19 ` Rémi Denis-Courmont 2023-11-15 19:23 ` James Almer
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git