* [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output @ 2024-08-13 14:03 J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format J. Dekker ` (5 more replies) 0 siblings, 6 replies; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel When collecting performance information from checkasm it is common to parse the output for use in graphs to compare vs different architectures. Signed-off-by: J. Dekker <jdek@itanimul.li> --- tests/checkasm/checkasm.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 58597d3888..f82ee0864f 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -362,6 +362,8 @@ static struct { const char *cpu_flag_name; const char *test_name; int verbose; + int csv; + int tsv; volatile sig_atomic_t catch_signals; } state; @@ -586,7 +588,14 @@ static void print_benchs(CheckasmFunc *f) CheckasmPerf *p = &v->perf; if (p->iterations) { int decicycles = (10*p->cycles/p->iterations - state.nop_time) / 4; - printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), decicycles/10, decicycles%10); + if (state.csv) { + const char sep = state.tsv ? '\t' : ','; + printf("%s%c%s%c%d.%d\n", f->name, sep, + cpu_suffix(v->cpu), sep, + decicycles / 10, decicycles % 10); + } else { + printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), decicycles/10, decicycles%10); + } } } while ((v = v->next)); } @@ -829,7 +838,12 @@ static void bench_uninit(void) static int usage(const char *path) { fprintf(stderr, - "Usage: %s [--bench] [--runs=<ptwo>] [--test=<pattern>] [--verbose] [seed]\n", + "Usage: %s [options...] 
[seed]\n" + " --test=<pattern> Run specific test.\n" + " --bench Run benchmark.\n" + " --csv, --tsv Output benchmark results in CSV or TSV format.\n" + " --runs=<ptwo> Manual number of benchmark iterations to run 2**<ptwo>.\n" + " --verbose Increase verbosity.\n", path); return 1; } @@ -877,6 +891,10 @@ int main(int argc, char *argv[]) state.bench_pattern = ""; } else if (!strncmp(arg, "--test=", 7)) { state.test_name = arg + 7; + } else if (!strcmp(arg, "--csv")) { + state.csv = 1; state.tsv = 0; + } else if (!strcmp(arg, "--tsv")) { + state.csv = 1; state.tsv = 1; } else if (!strcmp(arg, "--verbose") || !strcmp(arg, "-v")) { state.verbose = 1; } else if (!strncmp(arg, "--runs=", 7)) { -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker @ 2024-08-13 14:03 ` J. Dekker 2024-08-13 16:39 ` Lynne via ffmpeg-devel 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 3/7] checkasm: add wildcompares for test & functions J. Dekker ` (4 subsequent siblings) 5 siblings, 1 reply; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel Port dav1d's checkasm output format to FFmpeg's checkasm, includes relative speedups and aligns results. Signed-off-by: J. Dekker <jdek@itanimul.li> --- tests/checkasm/checkasm.c | 53 +++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index f82ee0864f..0095758268 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -18,6 +18,31 @@ * You should have received a copy of the GNU General Public License along * with FFmpeg; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" @@ -575,6 +600,16 @@ static int measure_nop_time(void) return nop_sum / 500; } +static inline double avg_cycles_per_call(const CheckasmPerf *const p) +{ + if (p->iterations) { + const double cycles = (double)(10 * p->cycles) / p->iterations - state.nop_time; + if (cycles > 0.0) + return cycles / 4.0; /* 4 calls per iteration */ + } + return 0.0; +} + /* Print benchmark results */ static void print_benchs(CheckasmFunc *f) { @@ -584,17 +619,25 @@ static void print_benchs(CheckasmFunc *f) /* Only print functions with at least one assembly version */ if (f->versions.cpu || f->versions.next) { CheckasmFuncVersion *v = &f->versions; + const CheckasmPerf *p = &v->perf; + const double baseline = avg_cycles_per_call(p); + double decicycles; do { - CheckasmPerf *p = &v->perf; if (p->iterations) { - int decicycles = (10*p->cycles/p->iterations - state.nop_time) / 4; + p = &v->perf; + decicycles = avg_cycles_per_call(p); if (state.csv) { const char sep = state.tsv ? 
'\t' : ','; - printf("%s%c%s%c%d.%d\n", f->name, sep, + printf("%s%c%s%c%.1f\n", f->name, sep, cpu_suffix(v->cpu), sep, - decicycles / 10, decicycles % 10); + decicycles / 10.0); } else { - printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), decicycles/10, decicycles%10); + const int pad_length = 10 + 50 - + printf("%s_%s:", f->name, cpu_suffix(v->cpu)); + const double ratio = decicycles ? + baseline / decicycles : 0.0; + printf("%*.1f (%5.2fx)\n", FFMAX(pad_length, 0), + decicycles / 10.0, ratio); } } } while ((v = v->next)); -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format J. Dekker @ 2024-08-13 16:39 ` Lynne via ffmpeg-devel 0 siblings, 0 replies; 13+ messages in thread From: Lynne via ffmpeg-devel @ 2024-08-13 16:39 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Lynne [-- Attachment #1.1.1.1: Type: text/plain, Size: 4913 bytes --] On 13/08/2024 16:03, J. Dekker wrote: > Port dav1d's checkasm output format to FFmpeg's checkasm, includes > relative speedups and aligns results. > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > tests/checkasm/checkasm.c | 53 +++++++++++++++++++++++++++++++++++---- > 1 file changed, 48 insertions(+), 5 deletions(-) > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c > index f82ee0864f..0095758268 100644 > --- a/tests/checkasm/checkasm.c > +++ b/tests/checkasm/checkasm.c > @@ -18,6 +18,31 @@ > * You should have received a copy of the GNU General Public License along > * with FFmpeg; if not, write to the Free Software Foundation, Inc., > * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > + * > + * Copyright © 2018, VideoLAN and dav1d authors > + * Copyright © 2018, Two Orioles, LLC > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions are met: > + * > + * 1. Redistributions of source code must retain the above copyright notice, this > + * list of conditions and the following disclaimer. > + * > + * 2. Redistributions in binary form must reproduce the above copyright notice, > + * this list of conditions and the following disclaimer in the documentation > + * and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED > + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE > + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR > + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES > + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; > + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND > + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > */ > > #include "config.h" > @@ -575,6 +600,16 @@ static int measure_nop_time(void) > return nop_sum / 500; > } > > +static inline double avg_cycles_per_call(const CheckasmPerf *const p) > +{ > + if (p->iterations) { > + const double cycles = (double)(10 * p->cycles) / p->iterations - state.nop_time; > + if (cycles > 0.0) > + return cycles / 4.0; /* 4 calls per iteration */ > + } > + return 0.0; > +} > + > /* Print benchmark results */ > static void print_benchs(CheckasmFunc *f) > { > @@ -584,17 +619,25 @@ static void print_benchs(CheckasmFunc *f) > /* Only print functions with at least one assembly version */ > if (f->versions.cpu || f->versions.next) { > CheckasmFuncVersion *v = &f->versions; > + const CheckasmPerf *p = &v->perf; > + const double baseline = avg_cycles_per_call(p); > + double decicycles; > do { > - CheckasmPerf *p = &v->perf; > if (p->iterations) { > - int decicycles = (10*p->cycles/p->iterations - state.nop_time) / 4; > + p = &v->perf; > + decicycles = avg_cycles_per_call(p); > if (state.csv) { > const char sep = state.tsv ? 
> '\t' : ','; > - printf("%s%c%s%c%d.%d\n", f->name, sep, > + printf("%s%c%s%c%.1f\n", f->name, sep, > cpu_suffix(v->cpu), sep, > - decicycles / 10, decicycles % 10); > + decicycles / 10.0); > } else { > - printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), decicycles/10, decicycles%10); > + const int pad_length = 10 + 50 - > + printf("%s_%s:", f->name, cpu_suffix(v->cpu)); > + const double ratio = decicycles ? > + baseline / decicycles : 0.0; > + printf("%*.1f (%5.2fx)\n", FFMAX(pad_length, 0), > + decicycles / 10.0, ratio); > } > } > } while ((v = v->next)); How does it improve it? You're only interested in the last X iterations, after the cache has fully warmed up and is out of the equation. Averaging the results from all iterations would also be benchmarking the memory layout of the system, but only the cycles are of interest. [-- Attachment #1.1.1.2: OpenPGP public key --] [-- Type: application/pgp-keys, Size: 637 bytes --] [-- Attachment #1.2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 236 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 3/7] checkasm: add wildcompares for test & functions 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format J. Dekker @ 2024-08-13 14:03 ` J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers J. Dekker ` (3 subsequent siblings) 5 siblings, 0 replies; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel Added: --test=<pattern> Filter tests by glob style pattern. --bench[=<pattern>] Run benchmark and optionally filter functions by glob style pattern. Example: $ ./tests/checkasm/checkasm --bench=yuva* [...] yuva420p_bgr24_8_c: 34.5 ( 1.00x) yuva420p_bgr24_8_ssse3: 31.1 ( 1.11x) yuva420p_bgr24_128_c: 310.6 ( 1.00x) yuva420p_bgr24_128_ssse3: 178.1 ( 1.74x) yuva420p_bgr24_1080_c: 2509.6 ( 1.00x) yuva420p_bgr24_1080_ssse3: 1471.5 ( 1.71x) yuva420p_bgr24_1920_c: 4462.6 ( 1.00x) yuva420p_bgr24_1920_ssse3: 2331.1 ( 1.91x) [...] Ported from dav1d. Signed-off-by: J. Dekker <jdek@itanimul.li> --- tests/checkasm/checkasm.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 0095758268..79cf39c27f 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -385,7 +385,7 @@ static struct { int cpu_flag; const char *cpu_flag_name; - const char *test_name; + const char *test_pattern; int verbose; int csv; int tsv; @@ -771,6 +771,22 @@ static void signal_handler(int s) { } #endif +/* Compares a string with a wildcard pattern. 
*/ +static int wildstrcmp(const char *str, const char *pattern) +{ + const char *wild = strchr(pattern, '*'); + if (wild) { + const size_t len = wild - pattern; + if (strncmp(str, pattern, len)) return 1; + while (*++wild == '*'); + if (!*wild) return 0; + str += len; + while (*str && wildstrcmp(str, wild)) str++; + return !*str; + } + return strcmp(str, pattern); +} + /* Perform tests and benchmarks for the specified cpu flag if supported by the host */ static void check_cpu_flag(const char *name, int flag) { @@ -786,7 +802,7 @@ static void check_cpu_flag(const char *name, int flag) state.cpu_flag_name = name; for (i = 0; tests[i].func; i++) { - if (state.test_name && strcmp(tests[i].name, state.test_name)) + if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern)) continue; state.current_test_name = tests[i].name; tests[i].func(); @@ -882,11 +898,12 @@ static int usage(const char *path) { fprintf(stderr, "Usage: %s [options...] [seed]\n" - " --test=<pattern> Run specific test.\n" - " --bench Run benchmark.\n" - " --csv, --tsv Output benchmark results in CSV or TSV format.\n" - " --runs=<ptwo> Manual number of benchmark iterations to run 2**<ptwo>.\n" - " --verbose Increase verbosity.\n", + " --test=<pattern> Filter tests by glob style pattern.\n" + " --bench[=<pattern>] Run benchmark and optionally filter functions\n" + " by glob style pattern.\n" + " --csv, --tsv Print benchmark results in CSV or TSV format.\n" + " --runs=<ptwo> Manual number of benchmark iterations to run 2**<ptwo>.\n" + " --verbose Increase verbosity.\n", path); return 1; } @@ -931,9 +948,9 @@ int main(int argc, char *argv[]) state.bench_pattern = arg + 8; state.bench_pattern_len = strlen(state.bench_pattern); } else - state.bench_pattern = ""; + state.bench_pattern = "*"; } else if (!strncmp(arg, "--test=", 7)) { - state.test_name = arg + 7; + state.test_pattern = arg + 7; } else if (!strcmp(arg, "--csv")) { state.csv = 1; state.tsv = 0; } else if (!strcmp(arg, "--tsv")) { @@ 
-1037,7 +1054,7 @@ void *checkasm_check_func(void *func, const char *name, ...) int checkasm_bench_func(void) { return !state.num_failed && state.bench_pattern && - !strncmp(state.current_func->name, state.bench_pattern, state.bench_pattern_len); + !wildstrcmp(state.current_func->name, state.bench_pattern); } /* Indicate that the current test has failed */ -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 3/7] checkasm: add wildcompares for test & functions J. Dekker @ 2024-08-13 14:03 ` J. Dekker 2024-08-13 15:51 ` Rémi Denis-Courmont 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 5/7] avutil/riscv/asm: add helper macro to count varargs J. Dekker ` (2 subsequent siblings) 5 siblings, 1 reply; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Niklas Haas From: Niklas Haas <git@haasn.dev> Instead of duplicating these common macros in every file, add them to the shared utility file. Also add a base case for sanity. --- libavcodec/riscv/h264addpx_rvv.S | 10 ---------- libavcodec/riscv/h264idct_rvv.S | 10 ---------- libavcodec/riscv/startcode_rvb.S | 10 ---------- libavutil/riscv/asm.S | 34 ++++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/libavcodec/riscv/h264addpx_rvv.S b/libavcodec/riscv/h264addpx_rvv.S index 82739881d9..cf3b742294 100644 --- a/libavcodec/riscv/h264addpx_rvv.S +++ b/libavcodec/riscv/h264addpx_rvv.S @@ -26,16 +26,6 @@ #include "libavutil/riscv/asm.S" - .macro sx rd, addr -#if (__riscv_xlen == 32) - sw \rd, \addr -#elif (__riscv_xlen == 64) - sd \rd, \addr -#else - sq \rd, \addr -#endif - .endm - func ff_h264_add_pixels4_8_rvv, zve32x lpad 0 vsetivli zero, 4, e8, mf4, ta, ma diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index d2f77a5b47..076935a5d5 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -29,16 +29,6 @@ #include "libavutil/riscv/asm.S" - .macro sx rd, addr -#if (__riscv_xlen == 32) - sw \rd, \addr -#elif (__riscv_xlen == 64) - sd \rd, \addr -#else - sq \rd, \addr -#endif - .endm - .variant_cc 
ff_h264_idct4_rvv func ff_h264_idct4_rvv, zve32x vsra.vi v5, v1, 1 diff --git a/libavcodec/riscv/startcode_rvb.S b/libavcodec/riscv/startcode_rvb.S index eec92d3340..c131ebdf59 100644 --- a/libavcodec/riscv/startcode_rvb.S +++ b/libavcodec/riscv/startcode_rvb.S @@ -26,16 +26,6 @@ #include "libavutil/riscv/asm.S" - .macro lx rd, addr -#if (__riscv_xlen == 32) - lw \rd, \addr -#elif (__riscv_xlen == 64) - ld \rd, \addr -#else - lq \rd, \addr -#endif - .endm - func ff_startcode_find_candidate_rvb, zbb lpad 0 add a1, a0, a1 diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S index ec68a042d1..175f2a8672 100644 --- a/libavutil/riscv/asm.S +++ b/libavutil/riscv/asm.S @@ -237,3 +237,37 @@ .macro vntypei rd, rs, n=1 vwtypei \rd, \rs, -(\n) .endm + + /** + * Write an XLEN-sized register to an address. + * @param rs source register + * @param addr address to write to + */ + .macro sx rs, addr +#if (__riscv_xlen == 32) + sw \rs, \addr +#elif (__riscv_xlen == 64) + sd \rs, \addr +#elif (__riscv_xlen == 128) + sq \rs, \addr +#else +#error Unhandled value of XLEN +#endif + .endm + + /** + * Read an XLEN-sized register from an address. + * @param[out] rd destination register + * @param addr address to read from + */ + .macro lx rd, addr +#if (__riscv_xlen == 32) + lw \rd, \addr +#elif (__riscv_xlen == 64) + ld \rd, \addr +#elif (__riscv_xlen == 128) + lq \rd, \addr +#else +#error Unhandled value of XLEN +#endif + .endm -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers J. Dekker @ 2024-08-13 15:51 ` Rémi Denis-Courmont 2024-08-13 16:10 ` epirat07 0 siblings, 1 reply; 13+ messages in thread From: Rémi Denis-Courmont @ 2024-08-13 15:51 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 13 août 2024 17:03:33 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : >From: Niklas Haas <git@haasn.dev> > >Instead of duplicating these common macros in every file, add them to >the shared utility file. Also add a base case for sanity. Is `#error` a standard directive of C11? >--- > libavcodec/riscv/h264addpx_rvv.S | 10 ---------- > libavcodec/riscv/h264idct_rvv.S | 10 ---------- > libavcodec/riscv/startcode_rvb.S | 10 ---------- > libavutil/riscv/asm.S | 34 ++++++++++++++++++++++++++++++++ > 4 files changed, 34 insertions(+), 30 deletions(-) > >diff --git a/libavcodec/riscv/h264addpx_rvv.S b/libavcodec/riscv/h264addpx_rvv.S >index 82739881d9..cf3b742294 100644 >--- a/libavcodec/riscv/h264addpx_rvv.S >+++ b/libavcodec/riscv/h264addpx_rvv.S >@@ -26,16 +26,6 @@ > > #include "libavutil/riscv/asm.S" > >- .macro sx rd, addr >-#if (__riscv_xlen == 32) >- sw \rd, \addr >-#elif (__riscv_xlen == 64) >- sd \rd, \addr >-#else >- sq \rd, \addr >-#endif >- .endm >- > func ff_h264_add_pixels4_8_rvv, zve32x > lpad 0 > vsetivli zero, 4, e8, mf4, ta, ma >diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S >index d2f77a5b47..076935a5d5 100644 >--- a/libavcodec/riscv/h264idct_rvv.S >+++ b/libavcodec/riscv/h264idct_rvv.S >@@ -29,16 +29,6 @@ > > #include "libavutil/riscv/asm.S" > >- .macro sx rd, addr >-#if (__riscv_xlen == 32) >- sw \rd, \addr >-#elif (__riscv_xlen == 64) >- sd \rd, \addr >-#else >- sq \rd, \addr >-#endif >- .endm >- > .variant_cc ff_h264_idct4_rvv > func ff_h264_idct4_rvv, zve32x > vsra.vi v5, v1, 1 >diff --git 
a/libavcodec/riscv/startcode_rvb.S b/libavcodec/riscv/startcode_rvb.S >index eec92d3340..c131ebdf59 100644 >--- a/libavcodec/riscv/startcode_rvb.S >+++ b/libavcodec/riscv/startcode_rvb.S >@@ -26,16 +26,6 @@ > > #include "libavutil/riscv/asm.S" > >- .macro lx rd, addr >-#if (__riscv_xlen == 32) >- lw \rd, \addr >-#elif (__riscv_xlen == 64) >- ld \rd, \addr >-#else >- lq \rd, \addr >-#endif >- .endm >- > func ff_startcode_find_candidate_rvb, zbb > lpad 0 > add a1, a0, a1 >diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S >index ec68a042d1..175f2a8672 100644 >--- a/libavutil/riscv/asm.S >+++ b/libavutil/riscv/asm.S >@@ -237,3 +237,37 @@ > .macro vntypei rd, rs, n=1 > vwtypei \rd, \rs, -(\n) > .endm >+ >+ /** >+ * Write an XLEN-sized register to an address. >+ * @param rs source register >+ * @param addr address to write to >+ */ >+ .macro sx rs, addr >+#if (__riscv_xlen == 32) >+ sw \rs, \addr >+#elif (__riscv_xlen == 64) >+ sd \rs, \addr >+#elif (__riscv_xlen == 128) >+ sq \rs, \addr >+#else >+#error Unhandled value of XLEN >+#endif >+ .endm >+ >+ /** >+ * Read an XLEN-sized register from an address. >+ * @param[out] rd destination register >+ * @param addr address to read from >+ */ >+ .macro lx rd, addr >+#if (__riscv_xlen == 32) >+ lw \rd, \addr >+#elif (__riscv_xlen == 64) >+ ld \rd, \addr >+#elif (__riscv_xlen == 128) >+ lq \rd, \addr >+#else >+#error Unhandled value of XLEN >+#endif >+ .endm _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers 2024-08-13 15:51 ` Rémi Denis-Courmont @ 2024-08-13 16:10 ` epirat07 2024-08-13 16:13 ` Rémi Denis-Courmont 0 siblings, 1 reply; 13+ messages in thread From: epirat07 @ 2024-08-13 16:10 UTC (permalink / raw) To: FFmpeg development discussions and patches On 13 Aug 2024, at 17:51, Rémi Denis-Courmont wrote: > Le 13 août 2024 17:03:33 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : >> From: Niklas Haas <git@haasn.dev> >> >> Instead of duplicating these common macros in every file, add them to >> the shared utility file. Also add a base case for sanity. > > Is `#error` a standard directive of C11? The #error directive dates back to C89: https://en.cppreference.com/w/c/preprocessor/error > >> --- >> libavcodec/riscv/h264addpx_rvv.S | 10 ---------- >> libavcodec/riscv/h264idct_rvv.S | 10 ---------- >> libavcodec/riscv/startcode_rvb.S | 10 ---------- >> libavutil/riscv/asm.S | 34 ++++++++++++++++++++++++++++++++ >> 4 files changed, 34 insertions(+), 30 deletions(-) >> >> diff --git a/libavcodec/riscv/h264addpx_rvv.S b/libavcodec/riscv/h264addpx_rvv.S >> index 82739881d9..cf3b742294 100644 >> --- a/libavcodec/riscv/h264addpx_rvv.S >> +++ b/libavcodec/riscv/h264addpx_rvv.S >> @@ -26,16 +26,6 @@ >> >> #include "libavutil/riscv/asm.S" >> >> - .macro sx rd, addr >> -#if (__riscv_xlen == 32) >> - sw \rd, \addr >> -#elif (__riscv_xlen == 64) >> - sd \rd, \addr >> -#else >> - sq \rd, \addr >> -#endif >> - .endm >> - >> func ff_h264_add_pixels4_8_rvv, zve32x >> lpad 0 >> vsetivli zero, 4, e8, mf4, ta, ma >> diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S >> index d2f77a5b47..076935a5d5 100644 >> --- a/libavcodec/riscv/h264idct_rvv.S >> +++ b/libavcodec/riscv/h264idct_rvv.S >> @@ -29,16 +29,6 @@ >> >> #include "libavutil/riscv/asm.S" >> >> - .macro sx rd, addr >> -#if (__riscv_xlen == 32) >> - sw \rd, \addr >> -#elif (__riscv_xlen == 64) >> - sd \rd, \addr >> -#else >> - 
sq \rd, \addr >> -#endif >> - .endm >> - >> .variant_cc ff_h264_idct4_rvv >> func ff_h264_idct4_rvv, zve32x >> vsra.vi v5, v1, 1 >> diff --git a/libavcodec/riscv/startcode_rvb.S b/libavcodec/riscv/startcode_rvb.S >> index eec92d3340..c131ebdf59 100644 >> --- a/libavcodec/riscv/startcode_rvb.S >> +++ b/libavcodec/riscv/startcode_rvb.S >> @@ -26,16 +26,6 @@ >> >> #include "libavutil/riscv/asm.S" >> >> - .macro lx rd, addr >> -#if (__riscv_xlen == 32) >> - lw \rd, \addr >> -#elif (__riscv_xlen == 64) >> - ld \rd, \addr >> -#else >> - lq \rd, \addr >> -#endif >> - .endm >> - >> func ff_startcode_find_candidate_rvb, zbb >> lpad 0 >> add a1, a0, a1 >> diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S >> index ec68a042d1..175f2a8672 100644 >> --- a/libavutil/riscv/asm.S >> +++ b/libavutil/riscv/asm.S >> @@ -237,3 +237,37 @@ >> .macro vntypei rd, rs, n=1 >> vwtypei \rd, \rs, -(\n) >> .endm >> + >> + /** >> + * Write an XLEN-sized register to an address. >> + * @param rs source register >> + * @param addr address to write to >> + */ >> + .macro sx rs, addr >> +#if (__riscv_xlen == 32) >> + sw \rs, \addr >> +#elif (__riscv_xlen == 64) >> + sd \rs, \addr >> +#elif (__riscv_xlen == 128) >> + sq \rs, \addr >> +#else >> +#error Unhandled value of XLEN >> +#endif >> + .endm >> + >> + /** >> + * Read an XLEN-sized register from an address. >> + * @param[out] rd destination register >> + * @param addr address to read from >> + */ >> + .macro lx rd, addr >> +#if (__riscv_xlen == 32) >> + lw \rd, \addr >> +#elif (__riscv_xlen == 64) >> + ld \rd, \addr >> +#elif (__riscv_xlen == 128) >> + lq \rd, \addr >> +#else >> +#error Unhandled value of XLEN >> +#endif >> + .endm > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers 2024-08-13 16:10 ` epirat07 @ 2024-08-13 16:13 ` Rémi Denis-Courmont 0 siblings, 0 replies; 13+ messages in thread From: Rémi Denis-Courmont @ 2024-08-13 16:13 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 13 août 2024 19:10:48 GMT+03:00, epirat07@gmail.com a écrit : >On 13 Aug 2024, at 17:51, Rémi Denis-Courmont wrote: > >> Le 13 août 2024 17:03:33 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : >>> From: Niklas Haas <git@haasn.dev> >>> >>> Instead of duplicating these common macros in every file, add them to >>> the shared utility file. Also add a base case for sanity. >> >> Is `#error` a standard directive of C11? > >The #error directive dates back to C89: >https://en.cppreference.com/w/c/preprocessor/error Ok thanks. Any reason not to use an assembler directive though? >> >>> --- >>> libavcodec/riscv/h264addpx_rvv.S | 10 ---------- >>> libavcodec/riscv/h264idct_rvv.S | 10 ---------- >>> libavcodec/riscv/startcode_rvb.S | 10 ---------- >>> libavutil/riscv/asm.S | 34 ++++++++++++++++++++++++++++++++ >>> 4 files changed, 34 insertions(+), 30 deletions(-) >>> >>> diff --git a/libavcodec/riscv/h264addpx_rvv.S b/libavcodec/riscv/h264addpx_rvv.S >>> index 82739881d9..cf3b742294 100644 >>> --- a/libavcodec/riscv/h264addpx_rvv.S >>> +++ b/libavcodec/riscv/h264addpx_rvv.S >>> @@ -26,16 +26,6 @@ >>> >>> #include "libavutil/riscv/asm.S" >>> >>> - .macro sx rd, addr >>> -#if (__riscv_xlen == 32) >>> - sw \rd, \addr >>> -#elif (__riscv_xlen == 64) >>> - sd \rd, \addr >>> -#else >>> - sq \rd, \addr >>> -#endif >>> - .endm >>> - >>> func ff_h264_add_pixels4_8_rvv, zve32x >>> lpad 0 >>> vsetivli zero, 4, e8, mf4, ta, ma >>> diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S >>> index d2f77a5b47..076935a5d5 100644 >>> --- a/libavcodec/riscv/h264idct_rvv.S >>> +++ b/libavcodec/riscv/h264idct_rvv.S >>> @@ -29,16 +29,6 @@ >>> >>> #include 
"libavutil/riscv/asm.S" >>> >>> - .macro sx rd, addr >>> -#if (__riscv_xlen == 32) >>> - sw \rd, \addr >>> -#elif (__riscv_xlen == 64) >>> - sd \rd, \addr >>> -#else >>> - sq \rd, \addr >>> -#endif >>> - .endm >>> - >>> .variant_cc ff_h264_idct4_rvv >>> func ff_h264_idct4_rvv, zve32x >>> vsra.vi v5, v1, 1 >>> diff --git a/libavcodec/riscv/startcode_rvb.S b/libavcodec/riscv/startcode_rvb.S >>> index eec92d3340..c131ebdf59 100644 >>> --- a/libavcodec/riscv/startcode_rvb.S >>> +++ b/libavcodec/riscv/startcode_rvb.S >>> @@ -26,16 +26,6 @@ >>> >>> #include "libavutil/riscv/asm.S" >>> >>> - .macro lx rd, addr >>> -#if (__riscv_xlen == 32) >>> - lw \rd, \addr >>> -#elif (__riscv_xlen == 64) >>> - ld \rd, \addr >>> -#else >>> - lq \rd, \addr >>> -#endif >>> - .endm >>> - >>> func ff_startcode_find_candidate_rvb, zbb >>> lpad 0 >>> add a1, a0, a1 >>> diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S >>> index ec68a042d1..175f2a8672 100644 >>> --- a/libavutil/riscv/asm.S >>> +++ b/libavutil/riscv/asm.S >>> @@ -237,3 +237,37 @@ >>> .macro vntypei rd, rs, n=1 >>> vwtypei \rd, \rs, -(\n) >>> .endm >>> + >>> + /** >>> + * Write an XLEN-sized register to an address. >>> + * @param rs source register >>> + * @param addr address to write to >>> + */ >>> + .macro sx rs, addr >>> +#if (__riscv_xlen == 32) >>> + sw \rs, \addr >>> +#elif (__riscv_xlen == 64) >>> + sd \rs, \addr >>> +#elif (__riscv_xlen == 128) >>> + sq \rs, \addr >>> +#else >>> +#error Unhandled value of XLEN >>> +#endif >>> + .endm >>> + >>> + /** >>> + * Read an XLEN-sized register from an address. 
>>> + * @param[out] rd destination register >>> + * @param addr address to read from >>> + */ >>> + .macro lx rd, addr >>> +#if (__riscv_xlen == 32) >>> + lw \rd, \addr >>> +#elif (__riscv_xlen == 64) >>> + ld \rd, \addr >>> +#elif (__riscv_xlen == 128) >>> + lq \rd, \addr >>> +#else >>> +#error Unhandled value of XLEN >>> +#endif >>> + .endm >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 5/7] avutil/riscv/asm: add helper macro to count varargs 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker ` (2 preceding siblings ...) 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers J. Dekker @ 2024-08-13 14:03 ` J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 7/7] avcodec/riscv: add h264 qpel J. Dekker 5 siblings, 0 replies; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Niklas Haas From: Niklas Haas <git@haasn.dev> (Ab)using nested macros to get the number of arguments passed to a variadic macro. Useful for stack manipulation. --- libavutil/riscv/asm.S | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S index 175f2a8672..db190e99ca 100644 --- a/libavutil/riscv/asm.S +++ b/libavutil/riscv/asm.S @@ -271,3 +271,20 @@ #error Unhandled value of XLEN #endif .endm + + .macro count_args_inner num, arg, args:vararg + .ifb \arg + .equ num_args, \num + .else + count_args_inner \num + 1, \args + .endif + .endm + + /** + * Helper macro to count the number of arguments to a macro. Assigns + * the count to the symbol `num_args`. + * @param args arguments to count + */ + .macro count_args args:vararg + count_args_inner 0, \args + .endm -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker ` (3 preceding siblings ...) 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 5/7] avutil/riscv/asm: add helper macro to count varargs J. Dekker @ 2024-08-13 14:03 ` J. Dekker 2024-08-13 15:55 ` Rémi Denis-Courmont 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 7/7] avcodec/riscv: add h264 qpel J. Dekker 5 siblings, 1 reply; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Niklas Haas From: Niklas Haas <git@haasn.dev> Generic helper macros to push/pop multiple registers at once. Expands to a single `addi` plus a sequence of XLEN-sized stores/loads. --- libavutil/riscv/asm.S | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S index db190e99ca..3955530e4e 100644 --- a/libavutil/riscv/asm.S +++ b/libavutil/riscv/asm.S @@ -288,3 +288,40 @@ .macro count_args args:vararg count_args_inner 0, \args .endm + + /** + * Helper macro to iterate over constant sized elements in memory + * @param op operation to perform on each element (sized load/store) + * @param size size in bytes per element + * @param offset starting offset of first element + * @param addr base address to load/store + * @param regs registers to iterate over + */ + .macro for_mem op, size, offset, addr, reg, regs:vararg + .ifnb \reg + \op \reg, \offset(\addr) + for_mem \op, \size, \offset + \size, \addr, \regs + .endif + .endm + + /** + * Push a variable number of registers to the stack. + * The number of registers is derived from \regs via count_args. + * @param regs registers to push + */ + .macro push regs:vararg + count_args \regs + addi sp, sp, -(num_args * (__riscv_xlen >> 3)) + for_mem sx, __riscv_xlen >> 3, 0, sp, \regs + .endm + + /** + * Pop a variable number of registers from the stack. 
+ * The number of registers is derived from \regs via count_args. + * @param[out] regs registers to pop + */ + .macro pop regs:vararg + count_args \regs + for_mem lx, __riscv_xlen >> 3, 0, sp, \regs + addi sp, sp, num_args * (__riscv_xlen >> 3) + .endm -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers J. Dekker @ 2024-08-13 15:55 ` Rémi Denis-Courmont 2024-08-15 12:13 ` Niklas Haas 0 siblings, 1 reply; 13+ messages in thread From: Rémi Denis-Courmont @ 2024-08-13 15:55 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 13 août 2024 17:03:35 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : >From: Niklas Haas <git@haasn.dev> > >Generic helper macros to push/pop multiple registers at once. Expands to >a single `addi` plus a sequence of XLEN-sized stores/loads. >--- > libavutil/riscv/asm.S | 37 +++++++++++++++++++++++++++++++++++++ > 1 file changed, 37 insertions(+) > >diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S >index db190e99ca..3955530e4e 100644 >--- a/libavutil/riscv/asm.S >+++ b/libavutil/riscv/asm.S >@@ -288,3 +288,40 @@ > .macro count_args args:vararg > count_args_inner 0, \args > .endm >+ >+ /** >+ * Helper macro to iterate over constant sized elements in memory >+ * @param op operation to perform on each element (sized load/store) >+ * @param size size in bytes per element >+ * @param offset starting offset of first element >+ * @param addr base address to load/store >+ * @param regs registers to iterate over >+ */ >+ .macro for_mem op, size, offset, addr, reg, regs:vararg >+ .ifnb \reg >+ \op \reg, \offset(\addr) >+ for_mem \op, \size, \offset + \size, \addr, \regs >+ .endif >+ .endm >+ >+ /** >+ * Push a variable number of registers to the stack. >+ * @param n number of registers to push >+ * @param regs registers to push >+ */ >+ .macro push regs:vararg >+ count_args \regs >+ addi sp, sp, -(num_args * (__riscv_xlen >> 3)) >+ for_mem sx, __riscv_xlen >> 3, 0, sp, \regs >+ .endm This is not in line with the psABI specification for RV32 and RV64. Ditto below. It's also not in line with the RV128 ABI since that doesn't even exist yet. 
>+ >+ /** >+ * Pop a variable number of registers from the stack. >+ * @param n number of registers to pop >+ * @param[out] regs registers to pop >+ */ >+ .macro pop regs:vararg >+ count_args \regs >+ for_mem lx, __riscv_xlen >> 3, 0, sp, \regs >+ addi sp, sp, num_args * (__riscv_xlen >> 3) >+ .endm _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers 2024-08-13 15:55 ` Rémi Denis-Courmont @ 2024-08-15 12:13 ` Niklas Haas 0 siblings, 0 replies; 13+ messages in thread From: Niklas Haas @ 2024-08-15 12:13 UTC (permalink / raw) To: FFmpeg development discussions and patches On Tue, 13 Aug 2024 18:55:24 +0300 Rémi Denis-Courmont <remi@remlab.net> wrote: > > > Le 13 août 2024 17:03:35 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : > >From: Niklas Haas <git@haasn.dev> > > > >Generic helper macros to push/pop multiple registers at once. Expands to > >a single `addi` plus a sequence of XLEN-sized stores/loads. > >--- > > libavutil/riscv/asm.S | 37 +++++++++++++++++++++++++++++++++++++ > > 1 file changed, 37 insertions(+) > > > >diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S > >index db190e99ca..3955530e4e 100644 > >--- a/libavutil/riscv/asm.S > >+++ b/libavutil/riscv/asm.S > >@@ -288,3 +288,40 @@ > > .macro count_args args:vararg > > count_args_inner 0, \args > > .endm > >+ > >+ /** > >+ * Helper macro to iterate over constant sized elements in memory > >+ * @param op operation to perform on each element (sized load/store) > >+ * @param size size in bytes per element > >+ * @param offset starting offset of first element > >+ * @param addr base address to load/store > >+ * @param regs registers to iterate over > >+ */ > >+ .macro for_mem op, size, offset, addr, reg, regs:vararg > >+ .ifnb \reg > >+ \op \reg, \offset(\addr) > >+ for_mem \op, \size, \offset + \size, \addr, \regs > >+ .endif > >+ .endm > >+ > >+ /** > >+ * Push a variable number of registers to the stack. > >+ * @param n number of registers to push > >+ * @param regs registers to push > >+ */ > >+ .macro push regs:vararg > >+ count_args \regs > >+ addi sp, sp, -(num_args * (__riscv_xlen >> 3)) > >+ for_mem sx, __riscv_xlen >> 3, 0, sp, \regs > >+ .endm > > This is not in line with the psABI specification for RV32 and RV64. Ditto below. 
Missing alignment to multiples of 16 bytes, what else? > > It's also not in line with the RV128 ABI since that doesn't even exist yet. > > >+ > >+ /** > >+ * Pop a variable number of registers from the stack. > >+ * @param n number of registers to pop > >+ * @param[out] regs registers to pop > >+ */ > >+ .macro pop regs:vararg > >+ count_args \regs > >+ for_mem lx, __riscv_xlen >> 3, 0, sp, \regs > >+ addi sp, sp, num_args * (__riscv_xlen >> 3) > >+ .endm > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* [FFmpeg-devel] [PATCH 7/7] avcodec/riscv: add h264 qpel 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker ` (4 preceding siblings ...) 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers J. Dekker @ 2024-08-13 14:03 ` J. Dekker 5 siblings, 0 replies; 13+ messages in thread From: J. Dekker @ 2024-08-13 14:03 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Niklas Haas From: Niklas Haas <git@haasn.dev> checkasm: bench runs 131072 (1 << 17) avg_h264_qpel_4_mc00_8_c: 37.6 ( 1.00x) avg_h264_qpel_4_mc00_8_rvv_i32: 27.4 ( 1.37x) avg_h264_qpel_4_mc01_8_c: 214.6 ( 1.00x) avg_h264_qpel_4_mc01_8_rvv_i32: 79.3 ( 2.70x) avg_h264_qpel_4_mc02_8_c: 214.8 ( 1.00x) avg_h264_qpel_4_mc02_8_rvv_i32: 79.3 ( 2.71x) avg_h264_qpel_4_mc03_8_c: 214.8 ( 1.00x) avg_h264_qpel_4_mc03_8_rvv_i32: 79.3 ( 2.71x) avg_h264_qpel_4_mc10_8_c: 173.1 ( 1.00x) avg_h264_qpel_4_mc10_8_rvv_i32: 120.8 ( 1.43x) avg_h264_qpel_4_mc11_8_c: 339.9 ( 1.00x) avg_h264_qpel_4_mc11_8_rvv_i32: 183.3 ( 1.85x) avg_h264_qpel_4_mc12_8_c: 537.6 ( 1.00x) avg_h264_qpel_4_mc12_8_rvv_i32: 339.9 ( 1.58x) avg_h264_qpel_4_mc13_8_c: 339.9 ( 1.00x) avg_h264_qpel_4_mc13_8_rvv_i32: 194.1 ( 1.75x) avg_h264_qpel_4_mc20_8_c: 141.8 ( 1.00x) avg_h264_qpel_4_mc20_8_rvv_i32: 121.1 ( 1.17x) avg_h264_qpel_4_mc21_8_c: 485.6 ( 1.00x) avg_h264_qpel_4_mc21_8_rvv_i32: 381.4 ( 1.27x) avg_h264_qpel_4_mc22_8_c: 350.1 ( 1.00x) avg_h264_qpel_4_mc22_8_rvv_i32: 266.9 ( 1.31x) avg_h264_qpel_4_mc23_8_c: 485.6 ( 1.00x) avg_h264_qpel_4_mc23_8_rvv_i32: 381.6 ( 1.27x) avg_h264_qpel_4_mc30_8_c: 173.1 ( 1.00x) avg_h264_qpel_4_mc30_8_rvv_i32: 131.6 ( 1.32x) avg_h264_qpel_4_mc31_8_c: 339.9 ( 1.00x) avg_h264_qpel_4_mc31_8_rvv_i32: 183.3 ( 1.85x) avg_h264_qpel_4_mc32_8_c: 537.9 ( 1.00x) avg_h264_qpel_4_mc32_8_rvv_i32: 339.9 ( 1.58x) avg_h264_qpel_4_mc33_8_c: 339.9 ( 1.00x) avg_h264_qpel_4_mc33_8_rvv_i32: 193.8 ( 1.75x) avg_h264_qpel_8_mc00_8_c: 110.6 ( 1.00x) avg_h264_qpel_8_mc00_8_rvv_i32: 
48.1 ( 2.30x) avg_h264_qpel_8_mc01_8_c: 766.9 ( 1.00x) avg_h264_qpel_8_mc01_8_rvv_i32: 152.1 ( 5.04x) avg_h264_qpel_8_mc02_8_c: 766.9 ( 1.00x) avg_h264_qpel_8_mc02_8_rvv_i32: 141.8 ( 5.41x) avg_h264_qpel_8_mc03_8_c: 777.4 ( 1.00x) avg_h264_qpel_8_mc03_8_rvv_i32: 152.3 ( 5.10x) avg_h264_qpel_8_mc10_8_c: 620.9 ( 1.00x) avg_h264_qpel_8_mc10_8_rvv_i32: 235.6 ( 2.64x) avg_h264_qpel_8_mc11_8_c: 1204.6 ( 1.00x) avg_h264_qpel_8_mc11_8_rvv_i32: 360.6 ( 3.34x) avg_h264_qpel_8_mc12_8_c: 1912.6 ( 1.00x) avg_h264_qpel_8_mc12_8_rvv_i32: 558.4 ( 3.43x) avg_h264_qpel_8_mc13_8_c: 1214.6 ( 1.00x) avg_h264_qpel_8_mc13_8_rvv_i32: 360.6 ( 3.37x) avg_h264_qpel_8_mc20_8_c: 506.4 ( 1.00x) avg_h264_qpel_8_mc20_8_rvv_i32: 225.1 ( 2.25x) avg_h264_qpel_8_mc21_8_c: 1714.8 ( 1.00x) avg_h264_qpel_8_mc21_8_rvv_i32: 631.6 ( 2.72x) avg_h264_qpel_8_mc22_8_c: 1266.8 ( 1.00x) avg_h264_qpel_8_mc22_8_rvv_i32: 423.1 ( 2.99x) avg_h264_qpel_8_mc23_8_c: 1714.6 ( 1.00x) avg_h264_qpel_8_mc23_8_rvv_i32: 631.4 ( 2.72x) avg_h264_qpel_8_mc30_8_c: 610.6 ( 1.00x) avg_h264_qpel_8_mc30_8_rvv_i32: 235.6 ( 2.59x) avg_h264_qpel_8_mc31_8_c: 1214.6 ( 1.00x) avg_h264_qpel_8_mc31_8_rvv_i32: 350.1 ( 3.47x) avg_h264_qpel_8_mc32_8_c: 1902.3 ( 1.00x) avg_h264_qpel_8_mc32_8_rvv_i32: 558.6 ( 3.41x) avg_h264_qpel_8_mc33_8_c: 1214.8 ( 1.00x) avg_h264_qpel_8_mc33_8_rvv_i32: 360.6 ( 3.37x) avg_h264_qpel_16_mc00_8_c: 423.1 ( 1.00x) avg_h264_qpel_16_mc00_8_rvv_i32: 68.8 ( 6.15x) avg_h264_qpel_16_mc01_8_c: 2850.1 ( 1.00x) avg_h264_qpel_16_mc01_8_rvv_i32: 298.1 ( 9.56x) avg_h264_qpel_16_mc02_8_c: 2954.6 ( 1.00x) avg_h264_qpel_16_mc02_8_rvv_i32: 277.4 (10.65x) avg_h264_qpel_16_mc03_8_c: 2871.1 ( 1.00x) avg_h264_qpel_16_mc03_8_rvv_i32: 298.1 ( 9.63x) avg_h264_qpel_16_mc10_8_c: 2423.1 ( 1.00x) avg_h264_qpel_16_mc10_8_rvv_i32: 464.9 ( 5.21x) avg_h264_qpel_16_mc11_8_c: 4683.6 ( 1.00x) avg_h264_qpel_16_mc11_8_rvv_i32: 714.6 ( 6.55x) avg_h264_qpel_16_mc12_8_c: 7496.4 ( 1.00x) avg_h264_qpel_16_mc12_8_rvv_i32: 1037.6 ( 7.22x) 
avg_h264_qpel_16_mc13_8_c: 4642.1 ( 1.00x) avg_h264_qpel_16_mc13_8_rvv_i32: 704.4 ( 6.59x) avg_h264_qpel_16_mc20_8_c: 2069.1 ( 1.00x) avg_h264_qpel_16_mc20_8_rvv_i32: 443.9 ( 4.66x) avg_h264_qpel_16_mc21_8_c: 6808.6 ( 1.00x) avg_h264_qpel_16_mc21_8_rvv_i32: 1204.3 ( 5.65x) avg_h264_qpel_16_mc22_8_c: 5048.4 ( 1.00x) avg_h264_qpel_16_mc22_8_rvv_i32: 777.4 ( 6.49x) avg_h264_qpel_16_mc23_8_c: 6819.1 ( 1.00x) avg_h264_qpel_16_mc23_8_rvv_i32: 1214.8 ( 5.61x) avg_h264_qpel_16_mc30_8_c: 2412.8 ( 1.00x) avg_h264_qpel_16_mc30_8_rvv_i32: 464.9 ( 5.19x) avg_h264_qpel_16_mc31_8_c: 4662.9 ( 1.00x) avg_h264_qpel_16_mc31_8_rvv_i32: 714.6 ( 6.53x) avg_h264_qpel_16_mc32_8_c: 7516.9 ( 1.00x) avg_h264_qpel_16_mc32_8_rvv_i32: 1058.6 ( 7.10x) avg_h264_qpel_16_mc33_8_c: 4673.4 ( 1.00x) avg_h264_qpel_16_mc33_8_rvv_i32: 714.9 ( 6.54x) put_h264_qpel_4_mc00_8_c: 27.4 ( 1.00x) put_h264_qpel_4_mc00_8_rvv_i32: 16.9 ( 1.62x) put_h264_qpel_4_mc01_8_c: 214.6 ( 1.00x) put_h264_qpel_4_mc01_8_rvv_i32: 79.3 ( 2.70x) put_h264_qpel_4_mc02_8_c: 183.3 ( 1.00x) put_h264_qpel_4_mc02_8_rvv_i32: 79.3 ( 2.31x) put_h264_qpel_4_mc03_8_c: 204.3 ( 1.00x) put_h264_qpel_4_mc03_8_rvv_i32: 89.6 ( 2.28x) put_h264_qpel_4_mc10_8_c: 173.1 ( 1.00x) put_h264_qpel_4_mc10_8_rvv_i32: 120.8 ( 1.43x) put_h264_qpel_4_mc11_8_c: 339.6 ( 1.00x) put_h264_qpel_4_mc11_8_rvv_i32: 183.3 ( 1.85x) put_h264_qpel_4_mc12_8_c: 527.4 ( 1.00x) put_h264_qpel_4_mc12_8_rvv_i32: 339.9 ( 1.55x) put_h264_qpel_4_mc13_8_c: 329.4 ( 1.00x) put_h264_qpel_4_mc13_8_rvv_i32: 183.6 ( 1.79x) put_h264_qpel_4_mc20_8_c: 121.1 ( 1.00x) put_h264_qpel_4_mc20_8_rvv_i32: 110.6 ( 1.09x) put_h264_qpel_4_mc21_8_c: 464.6 ( 1.00x) put_h264_qpel_4_mc21_8_rvv_i32: 371.1 ( 1.25x) put_h264_qpel_4_mc22_8_c: 329.4 ( 1.00x) put_h264_qpel_4_mc22_8_rvv_i32: 256.4 ( 1.28x) put_h264_qpel_4_mc23_8_c: 475.1 ( 1.00x) put_h264_qpel_4_mc23_8_rvv_i32: 371.1 ( 1.28x) put_h264_qpel_4_mc30_8_c: 162.6 ( 1.00x) put_h264_qpel_4_mc30_8_rvv_i32: 121.1 ( 1.34x) put_h264_qpel_4_mc31_8_c: 339.9 ( 
1.00x) put_h264_qpel_4_mc31_8_rvv_i32: 183.6 ( 1.85x) put_h264_qpel_4_mc32_8_c: 527.1 ( 1.00x) put_h264_qpel_4_mc32_8_rvv_i32: 339.9 ( 1.55x) put_h264_qpel_4_mc33_8_c: 339.9 ( 1.00x) put_h264_qpel_4_mc33_8_rvv_i32: 183.3 ( 1.85x) put_h264_qpel_8_mc00_8_c: 89.8 ( 1.00x) put_h264_qpel_8_mc00_8_rvv_i32: 37.6 ( 2.39x) put_h264_qpel_8_mc01_8_c: 725.1 ( 1.00x) put_h264_qpel_8_mc01_8_rvv_i32: 141.8 ( 5.11x) put_h264_qpel_8_mc02_8_c: 662.9 ( 1.00x) put_h264_qpel_8_mc02_8_rvv_i32: 131.3 ( 5.05x) put_h264_qpel_8_mc03_8_c: 735.6 ( 1.00x) put_h264_qpel_8_mc03_8_rvv_i32: 141.8 ( 5.19x) put_h264_qpel_8_mc10_8_c: 600.4 ( 1.00x) put_h264_qpel_8_mc10_8_rvv_i32: 225.1 ( 2.67x) put_h264_qpel_8_mc11_8_c: 1173.1 ( 1.00x) put_h264_qpel_8_mc11_8_rvv_i32: 339.9 ( 3.45x) put_h264_qpel_8_mc12_8_c: 1871.1 ( 1.00x) put_h264_qpel_8_mc12_8_rvv_i32: 548.1 ( 3.41x) put_h264_qpel_8_mc13_8_c: 1173.1 ( 1.00x) put_h264_qpel_8_mc13_8_rvv_i32: 339.9 ( 3.45x) put_h264_qpel_8_mc20_8_c: 454.6 ( 1.00x) put_h264_qpel_8_mc20_8_rvv_i32: 214.8 ( 2.12x) put_h264_qpel_8_mc21_8_c: 1683.6 ( 1.00x) put_h264_qpel_8_mc21_8_rvv_i32: 621.1 ( 2.71x) put_h264_qpel_8_mc22_8_c: 1162.6 ( 1.00x) put_h264_qpel_8_mc22_8_rvv_i32: 412.9 ( 2.82x) put_h264_qpel_8_mc23_8_c: 1673.3 ( 1.00x) put_h264_qpel_8_mc23_8_rvv_i32: 631.4 ( 2.65x) put_h264_qpel_8_mc30_8_c: 589.9 ( 1.00x) put_h264_qpel_8_mc30_8_rvv_i32: 225.3 ( 2.62x) put_h264_qpel_8_mc31_8_c: 1173.1 ( 1.00x) put_h264_qpel_8_mc31_8_rvv_i32: 339.9 ( 3.45x) put_h264_qpel_8_mc32_8_c: 1871.1 ( 1.00x) put_h264_qpel_8_mc32_8_rvv_i32: 548.1 ( 3.41x) put_h264_qpel_8_mc33_8_c: 1162.6 ( 1.00x) put_h264_qpel_8_mc33_8_rvv_i32: 350.1 ( 3.32x) put_h264_qpel_16_mc00_8_c: 308.6 ( 1.00x) put_h264_qpel_16_mc00_8_rvv_i32: 48.1 ( 6.42x) put_h264_qpel_16_mc01_8_c: 2746.1 ( 1.00x) put_h264_qpel_16_mc01_8_rvv_i32: 277.4 ( 9.90x) put_h264_qpel_16_mc02_8_c: 2558.6 ( 1.00x) put_h264_qpel_16_mc02_8_rvv_i32: 266.9 ( 9.59x) put_h264_qpel_16_mc03_8_c: 2756.6 ( 1.00x) put_h264_qpel_16_mc03_8_rvv_i32: 277.4 ( 
9.94x) put_h264_qpel_16_mc10_8_c: 2287.8 ( 1.00x) put_h264_qpel_16_mc10_8_rvv_i32: 443.9 ( 5.15x) put_h264_qpel_16_mc11_8_c: 4558.6 ( 1.00x) put_h264_qpel_16_mc11_8_rvv_i32: 683.4 ( 6.67x) put_h264_qpel_16_mc12_8_c: 7381.9 ( 1.00x) put_h264_qpel_16_mc12_8_rvv_i32: 1027.1 ( 7.19x) put_h264_qpel_16_mc13_8_c: 4548.4 ( 1.00x) put_h264_qpel_16_mc13_8_rvv_i32: 683.6 ( 6.65x) put_h264_qpel_16_mc20_8_c: 1819.1 ( 1.00x) put_h264_qpel_16_mc20_8_rvv_i32: 423.4 ( 4.30x) put_h264_qpel_16_mc21_8_c: 6704.6 ( 1.00x) put_h264_qpel_16_mc21_8_rvv_i32: 1183.6 ( 5.66x) put_h264_qpel_16_mc22_8_c: 4641.9 ( 1.00x) put_h264_qpel_16_mc22_8_rvv_i32: 756.4 ( 6.14x) put_h264_qpel_16_mc23_8_c: 6725.6 ( 1.00x) put_h264_qpel_16_mc23_8_rvv_i32: 1183.6 ( 5.68x) put_h264_qpel_16_mc30_8_c: 2308.6 ( 1.00x) put_h264_qpel_16_mc30_8_rvv_i32: 443.9 ( 5.20x) put_h264_qpel_16_mc31_8_c: 4548.4 ( 1.00x) put_h264_qpel_16_mc31_8_rvv_i32: 704.4 ( 6.46x) put_h264_qpel_16_mc32_8_c: 7412.9 ( 1.00x) put_h264_qpel_16_mc32_8_rvv_i32: 1037.8 ( 7.14x) put_h264_qpel_16_mc33_8_c: 4558.6 ( 1.00x) put_h264_qpel_16_mc33_8_rvv_i32: 694.1 ( 6.57x) Signed-off-by: Niklas Haas <git@haasn.dev> Signed-off-by: J. 
Dekker <jdek@itanimul.li> --- libavcodec/h264qpel.c | 2 + libavcodec/h264qpel.h | 1 + libavcodec/riscv/Makefile | 2 + libavcodec/riscv/h264qpel_init.c | 113 +++++++ libavcodec/riscv/h264qpel_rvv.S | 554 +++++++++++++++++++++++++++++++ 5 files changed, 672 insertions(+) create mode 100644 libavcodec/riscv/h264qpel_init.c create mode 100644 libavcodec/riscv/h264qpel_rvv.S diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c index 65fef03304..faca1e8953 100644 --- a/libavcodec/h264qpel.c +++ b/libavcodec/h264qpel.c @@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth) ff_h264qpel_init_arm(c, bit_depth); #elif ARCH_PPC ff_h264qpel_init_ppc(c, bit_depth); +#elif ARCH_RISCV + ff_h264qpel_init_riscv(c, bit_depth); #elif ARCH_X86 ff_h264qpel_init_x86(c, bit_depth); #elif ARCH_MIPS diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h index 0259e8de23..24baf826f9 100644 --- a/libavcodec/h264qpel.h +++ b/libavcodec/h264qpel.h @@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth); +void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth); diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index b3a6b588c9..d4276521f3 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ riscv/h264idct_rvv.o +OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o +RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o 
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o diff --git a/libavcodec/riscv/h264qpel_init.c b/libavcodec/riscv/h264qpel_init.c new file mode 100644 index 0000000000..69a1345447 --- /dev/null +++ b/libavcodec/riscv/h264qpel_init.c @@ -0,0 +1,113 @@ +/* + * RISC-V optimised DSP functions + * Copyright (c) 2024 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/riscv/cpu.h" +#include "libavcodec/h264qpel.h" + +#define DECL_QPEL_OPS(OP, SIZE, EXT) \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE 
## _mc11_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + +DECL_QPEL_OPS(put, 16, rvv256) +DECL_QPEL_OPS(put, 8, rvv256) +DECL_QPEL_OPS(put, 4, rvv256) + +DECL_QPEL_OPS(avg, 16, rvv256) +DECL_QPEL_OPS(avg, 8, rvv256) +DECL_QPEL_OPS(avg, 4, rvv256) + +DECL_QPEL_OPS(put, 16, rvv) +DECL_QPEL_OPS(put, 8, rvv) +DECL_QPEL_OPS(put, 4, rvv) + +DECL_QPEL_OPS(avg, 16, rvv) +DECL_QPEL_OPS(avg, 8, rvv) +DECL_QPEL_OPS(avg, 4, rvv) + +#define SET_QPEL_FNS(OP, IDX, SIZE, EXT) \ +do { \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 0] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 1] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 2] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 3] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT; \ + c->OP ## 
_h264_qpel_pixels_tab[IDX][ 4] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 5] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 6] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 7] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 8] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 9] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][10] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][11] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][12] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][13] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][14] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][15] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT; \ +} while (0) + +av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth) +{ +#if HAVE_RVV + int flags = av_get_cpu_flags(); + if (flags & AV_CPU_FLAG_RVV_I32) { + const int vlen = 8 * ff_get_rv_vlenb(); + + switch (bit_depth) { + case 8: + if (vlen >= 256) { + SET_QPEL_FNS(put, 0, 16, rvv256); + SET_QPEL_FNS(put, 1, 8, rvv256); + SET_QPEL_FNS(put, 2, 4, rvv256); + + SET_QPEL_FNS(avg, 0, 16, rvv256); + SET_QPEL_FNS(avg, 1, 8, rvv256); + SET_QPEL_FNS(avg, 2, 4, rvv256); + } else if (vlen >= 128) { + SET_QPEL_FNS(put, 0, 16, rvv); + SET_QPEL_FNS(put, 1, 8, rvv); + SET_QPEL_FNS(put, 2, 4, rvv); + + SET_QPEL_FNS(avg, 0, 16, rvv); + SET_QPEL_FNS(avg, 1, 8, rvv); + SET_QPEL_FNS(avg, 2, 4, rvv); + } + break; + } + } +#endif +} diff --git a/libavcodec/riscv/h264qpel_rvv.S b/libavcodec/riscv/h264qpel_rvv.S new file mode 100644 index 
0000000000..7713372f23 --- /dev/null +++ b/libavcodec/riscv/h264qpel_rvv.S @@ -0,0 +1,554 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Niklas Haas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "libavutil/riscv/asm.S" + +.macro vnclipsu.wi shifti, lmul, lmul2, vregs:vararg + vsetvli zero, zero, e16, \lmul2, ta, ma + .irp x, \vregs + vmax.vx \x, \x, zero + .endr + vsetvli zero, zero, e8, \lmul, ta, ma + .irp x, \vregs + vnclipu.wi \x, \x, \shifti + .endr +.endm + +.macro lowpass_init lmul, sizei, size, w0, w1, backup + vsetivli zero, \sizei, e8, \lmul, ta, ma + csrwi vxrm, 0 + li \size, \sizei + .ifnb \w0 + li \w0, 20 + li \w1, -5 + .endif +.endm + + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 + addi \tmp, \src, 3 + lbu \tmp2, 2(\src) + vle8.v v31, (\tmp) + lbu \tmp, 1(\src) + vslide1up.vx v30, v31, \tmp2 + lbu \tmp2, 0(\src) + vslide1up.vx v29, v30, \tmp + lbu \tmp, -1(\src) + vslide1up.vx v28, v29, \tmp2 + lbu \tmp2, -2(\src) + vslide1up.vx v27, v28, \tmp + vslide1up.vx v26, v27, \tmp2 + vwaddu.vv \vdst, v26, v31 + vwmaccu.vx \vdst, \w0, v28 + vwmaccu.vx \vdst, \w0, v29 + vwmaccsu.vx \vdst, \w1, v27 + vwmaccsu.vx \vdst, \w1, v30 +.endm + + /* output is unclipped */ +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, signed=0 + .if \signed + vwadd.vv \vdst, \vsrc0, \vsrc5 + vwmacc.vx \vdst, \w0, \vsrc2 + vwmacc.vx \vdst, \w0, \vsrc3 + vwmacc.vx \vdst, \w1, \vsrc1 + vwmacc.vx \vdst, \w1, \vsrc4 + .else + vwaddu.vv \vdst, \vsrc0, \vsrc5 + vwmaccu.vx \vdst, \w0, \vsrc2 + vwmaccu.vx \vdst, \w0, \vsrc3 + vwmaccsu.vx \vdst, \w1, \vsrc1 + vwmaccsu.vx \vdst, \w1, \vsrc4 + .endif +.endm + +.macro qpel_mc00 op, dst, src, stride, size +func ff_\op\()_h264_qpel_pixels, zve32x +1: + add t0, \stride, \src + add t1, \stride, t0 + add t2, \stride, t1 + vle8.v v0, (\src) + vle8.v v1, (t0) + vle8.v v2, (t1) + vle8.v v3, (t2) + addi \size, \size, -4 + add \src, \stride, t2 + add t0, \stride, \dst + add t1, \stride, t0 + add t2, \stride, t1 + .ifc \op, avg + vle8.v v4, (\dst) + vle8.v v5, (t0) + vle8.v v6, (t1) + vle8.v v7, (t2) + vaaddu.vv v0, v0, v4 + vaaddu.vv 
v1, v1, v5
+        vaaddu.vv        v2, v2, v6
+        vaaddu.vv        v3, v3, v7
+    .endif
+        vse8.v           v0, (\dst)
+        vse8.v           v1, (t0)
+        vse8.v           v2, (t1)
+        vse8.v           v3, (t2)
+        add              \dst, \stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+.endm
+
+        qpel_mc00        put, a0, a1, a2, a4
+        qpel_mc00        avg, a0, a1, a2, a4
+
+/* qpel_lowpass: emits the three shared filter loops
+ *     ff_<op>_h264_qpel_{h,v,hv}_lowpass_<lmul><ext>
+ *
+ * op           put or avg. For avg the filtered rows are additionally
+ *              averaged (vaaddu.vv) with the bytes already stored at dst.
+ * ext          suffix appended to the symbol name.
+ * lmul         LMUL used while elements are 8 bits wide.
+ * lmul2        LMUL used while elements are 16 bits wide.
+ * dst, src, dst_stride, src_stride, size, w0, w1
+ *              registers holding the respective values.
+ * src2, src2_stride
+ *              optional registers. When src2 is given, every filtered row
+ *              is first averaged with the matching row loaded from src2.
+ *
+ * Every loop iteration produces four rows and subtracts 4 from size, and the
+ * loop only exits when size reaches exactly zero, so size must be a positive
+ * multiple of 4 on entry. t0, t1 and t2 are clobbered, src, dst and src2 are
+ * advanced past the processed rows.
+ *
+ * lowpass_h, lowpass_v and vnclipsu.wi are helper macros that are not part
+ * of this hunk. NOTE(review): the rounding of vaaddu.vv and of the narrowing
+ * clips depends on vxrm, which is presumably set up by the caller - confirm.
+ */
+.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride
+/* Horizontal pass: rows src .. src + 3 * src_stride go through lowpass_h
+ * into v0, v2, v4, v6 and are then narrowed by vnclipsu.wi. */
+func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        lowpass_h        v0, \src, \w0, \w1
+        lowpass_h        v2, t0, \w0, \w1
+        lowpass_h        v4, t1, \w0, \w1
+        lowpass_h        v6, t2, \w0, \w1
+        add              \src, \src_stride, t2
+        addi             \size, \size, -4
+        vnclipsu.wi      5, \lmul, \lmul2, v0, v2, v4, v6
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v8, (\src2)
+        vle8.v           v10, (t0)
+        vle8.v           v12, (t1)
+        vle8.v           v14, (t2)
+        /* Step src2 by its own stride. This used to add \dst_stride, which
+         * only worked because the current callers pass equal strides; the
+         * v and hv loops below already use \src2_stride here. */
+        add              \src2, \src2_stride, t2
+        vaaddu.vv        v0, v0, v8
+        vaaddu.vv        v2, v2, v10
+        vaaddu.vv        v4, v4, v12
+        vaaddu.vv        v6, v6, v14
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v1, (\dst)
+        vle8.v           v3, (t0)
+        vle8.v           v5, (t1)
+        vle8.v           v7, (t2)
+        vaaddu.vv        v0, v0, v1
+        vaaddu.vv        v2, v2, v3
+        vaaddu.vv        v4, v4, v5
+        vaaddu.vv        v6, v6, v7
+    .endif
+        vse8.v           v0, (\dst)
+        vse8.v           v2, (t0)
+        vse8.v           v4, (t1)
+        vse8.v           v6, (t2)
+        add              \dst, \dst_stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+
+/* Vertical pass: v0..v4 hold the rows src - 2 * src_stride up to
+ * src + 2 * src_stride before the loop. Each iteration loads four more rows
+ * into v5..v8, feeds six consecutive rows to lowpass_v for each of the four
+ * outputs (v24, v26, v28, v30), and finally moves v4..v8 down to v0..v4 so
+ * they become the history of the next iteration. */
+func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x
+        sub              t0, \src, \src_stride
+        sub              t1, t0, \src_stride
+        vle8.v           v2, (\src)
+        vle8.v           v1, (t0)
+        vle8.v           v0, (t1)
+        add              t0, \src, \src_stride
+        add              t1, t0, \src_stride
+        add              \src, t1, \src_stride
+        vle8.v           v3, (t0)
+        vle8.v           v4, (t1)
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        vle8.v           v5, (\src)
+        vle8.v           v6, (t0)
+        vle8.v           v7, (t1)
+        vle8.v           v8, (t2)
+        add              \src, \src_stride, t2
+        lowpass_v        \w0, \w1, v24, v0, v1, v2, v3, v4, v5
+        lowpass_v        \w0, \w1, v26, v1, v2, v3, v4, v5, v6
+        lowpass_v        \w0, \w1, v28, v2, v3, v4, v5, v6, v7
+        lowpass_v        \w0, \w1, v30, v3, v4, v5, v6, v7, v8
+        addi             \size, \size, -4
+        vnclipsu.wi      5, \lmul, \lmul2, v24, v26, v28, v30
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v9, (\src2)
+        vle8.v           v10, (t0)
+        vle8.v           v11, (t1)
+        vle8.v           v12, (t2)
+        add              \src2, \src2_stride, t2
+        vaaddu.vv        v24, v24, v9
+        vaaddu.vv        v26, v26, v10
+        vaaddu.vv        v28, v28, v11
+        vaaddu.vv        v30, v30, v12
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v9, (\dst)
+        vle8.v           v10, (t0)
+        vle8.v           v11, (t1)
+        vle8.v           v12, (t2)
+        vaaddu.vv        v24, v24, v9
+        vaaddu.vv        v26, v26, v10
+        vaaddu.vv        v28, v28, v11
+        vaaddu.vv        v30, v30, v12
+    .endif
+        vse8.v           v24, (\dst)
+        vse8.v           v26, (t0)
+        vse8.v           v28, (t1)
+        vse8.v           v30, (t2)
+        add              \dst, \dst_stride, t2
+        /* Slide the row window down by four rows. */
+        vmv.v.v          v0, v4
+        vmv.v.v          v1, v5
+        vmv.v.v          v2, v6
+        vmv.v.v          v3, v7
+        vmv.v.v          v4, v8
+        bnez             \size, 1b
+        ret
+endfunc
+
+/* Combined pass: same row window as the vertical pass, but every row is
+ * the lowpass_h output of a source row (kept in v0..v16, even registers)
+ * and lowpass_v runs with signed=1 on 16-bit elements. The results are
+ * narrowed with vnclip.wi by 10, clamped at zero with vmax.vx and narrowed
+ * to 8 bits with vnclipu.wi. */
+func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x
+        sub              t0, \src, \src_stride
+        sub              t1, t0, \src_stride
+        lowpass_h        v4, \src, \w0, \w1
+        lowpass_h        v2, t0, \w0, \w1
+        lowpass_h        v0, t1, \w0, \w1
+        add              t0, \src, \src_stride
+        add              t1, t0, \src_stride
+        add              \src, t1, \src_stride
+        lowpass_h        v6, t0, \w0, \w1
+        lowpass_h        v8, t1, \w0, \w1
+1:
+        add              t0, \src_stride, \src
+        add              t1, \src_stride, t0
+        add              t2, \src_stride, t1
+        lowpass_h        v10, \src, \w0, \w1
+        lowpass_h        v12, t0, \w0, \w1
+        lowpass_h        v14, t1, \w0, \w1
+        lowpass_h        v16, t2, \w0, \w1
+        vsetvli          zero, zero, e16, \lmul2, ta, ma
+        addi             \size, \size, -4
+        lowpass_v        \w0, \w1, v20, v0, v2, v4, v6, v8, v10, signed=1
+        lowpass_v        \w0, \w1, v24, v2, v4, v6, v8, v10, v12, signed=1
+        lowpass_v        \w0, \w1, v28, v4, v6, v8, v10, v12, v14, signed=1
+        /* v20 is narrowed into v0 first so that it can be reused as the
+         * destination of the fourth lowpass_v; v0 is no longer read. */
+        vnclip.wi        v0, v20, 10
+        lowpass_v        \w0, \w1, v20, v6, v8, v10, v12, v14, v16, signed=1
+        vnclip.wi        v2, v24, 10
+        vnclip.wi        v4, v28, 10
+        vnclip.wi        v6, v20, 10
+        /* Negative results become zero before the unsigned narrowing. */
+        vmax.vx          v18, v0, zero
+        vmax.vx          v20, v2, zero
+        vmax.vx          v22, v4, zero
+        vmax.vx          v24, v6, zero
+        /* Slide the row window down by four rows. */
+        vmv.v.v          v0, v8
+        vmv.v.v          v2, v10
+        vmv.v.v          v4, v12
+        vmv.v.v          v6, v14
+        vmv.v.v          v8, v16
+        add              \src, \src_stride, t2
+        vsetvli          zero, zero, e8, \lmul, ta, ma
+        vnclipu.wi       v18, v18, 0
+        vnclipu.wi       v20, v20, 0
+        vnclipu.wi       v22, v22, 0
+        vnclipu.wi       v24, v24, 0
+    .ifnb \src2
+        add              t0, \src2_stride, \src2
+        add              t1, \src2_stride, t0
+        add              t2, \src2_stride, t1
+        vle8.v           v26, (\src2)
+        vle8.v           v27, (t0)
+        vle8.v           v28, (t1)
+        vle8.v           v29, (t2)
+        add              \src2, \src2_stride, t2
+        vaaddu.vv        v18, v18, v26
+        vaaddu.vv        v20, v20, v27
+        vaaddu.vv        v22, v22, v28
+        vaaddu.vv        v24, v24, v29
+    .endif
+        add              t0, \dst_stride, \dst
+        add              t1, \dst_stride, t0
+        add              t2, \dst_stride, t1
+    .ifc \op, avg
+        vle8.v           v26, (\dst)
+        vle8.v           v27, (t0)
+        vle8.v           v28, (t1)
+        vle8.v           v29, (t2)
+        vaaddu.vv        v18, v18, v26
+        vaaddu.vv        v20, v20, v27
+        vaaddu.vv        v22, v22, v28
+        vaaddu.vv        v24, v24, v29
+    .endif
+        vse8.v           v18, (\dst)
+        vse8.v           v20, (t0)
+        vse8.v           v22, (t1)
+        vse8.v           v24, (t2)
+        add              \dst, \dst_stride, t2
+        bnez             \size, 1b
+        ret
+endfunc
+.endm
+
+/* Note: We could possibly specialize for the width 8 / width 4 cases by
+   loading 32 bit integers, but this makes the convolutions more complicated
+   to implement, so it's not necessarily any faster.
*/
+/* h264_qpel: expands qpel_lowpass four times for one LMUL pair: put and avg,
+ * each once with an empty suffix and once with the _l2 suffix. The registers
+ * are fixed here: dst=a0, src=a1, dst_stride=a2, src_stride=a3, size=a4,
+ * w0=t5, w1=t6. Only the _l2 expansions also pass src2=a5, src2_stride=a6. */
+.macro h264_qpel lmul, lmul2
+        qpel_lowpass     put, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
+        qpel_lowpass     put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
+        qpel_lowpass     avg, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
+        qpel_lowpass     avg, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
+.endm
+/* Shared loops for every 8-bit LMUL referenced by the entry points below.
+ * The second argument is always twice the first. */
+        h264_qpel        m1, m2
+        h264_qpel        mf2, m1
+        h264_qpel        mf4, mf2
+        h264_qpel        mf8, mf4
+/* ff_h264_qpel_fns: emits the sixteen entry points
+ *     ff_<op>_h264_qpel<sizei>_mcXY_<ext>
+ * for one op (put or avg), one 8-bit LMUL and one block size.
+ *
+ * Every entry point starts with lowpass_init (a helper macro not shown in
+ * this hunk, presumably it configures the vector unit and loads w0, w1 and
+ * size - confirm), copies dst_stride into src_stride, optionally sets up a
+ * second source in src2 / src2_stride, and then jumps into one of the shared
+ * lowpass loops, which returns to the original caller.
+ *
+ * The two-pass entry points first call a put lowpass loop that writes a
+ * sizei x sizei block with a row stride of sizei to a scratch buffer, then
+ * use that buffer as src2 of the final _l2 loop. ra is kept in tmp across
+ * the call, dst and src are kept with the push / pop helper macros (not
+ * shown here).
+ * NOTE(review): the scratch buffer is addressed at sp - sizei * sizei, that
+ * is below the stack pointer as left by push. Confirm that push reserves
+ * this space, otherwise the buffer lives in unallocated stack.
+ */
+.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride, tmp
+func ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size,          /* no weight registers passed */
+        j                ff_\op\()_h264_qpel_pixels
+endfunc
+/* mc10: horizontal loop, result averaged with the source block itself. */
+func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        mv               \src2, \src
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
+endfunc
+/* mc20: horizontal loop only. */
+func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()
+endfunc
+/* mc30: horizontal loop, result averaged with the source shifted by one byte. */
+func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        addi             \src2, \src, 1
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
+endfunc
+/* mc01: vertical loop, result averaged with the source block itself. */
+func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        mv               \src2, \src
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc02: vertical loop only. */
+func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul
+endfunc
+/* mc03: vertical loop, result averaged with the source one row further down. */
+func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        add              \src2, \src, \src_stride
+        mv               \src2_stride, \src_stride
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc11: put horizontal loop into the scratch buffer, then the vertical _l2
+ * loop on the original src with the scratch buffer as second source. */
+func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src                     /* restored by pop after the first pass */
+        mv               \tmp, ra                       /* call below overwrites ra */
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)   /* first pass writes to the scratch buffer */
+        li               \dst_stride, \sizei            /* scratch rows are sizei bytes apart */
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)  /* scratch buffer becomes src2 */
+        mv               \src2_stride, \dst_stride      /* dst_stride still holds sizei here */
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride       /* src_stride still holds the incoming stride */
+        li               \size, \sizei                  /* the first pass counted size down to zero */
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc31: as mc11, but the vertical pass reads from src + 1. */
+func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        addi             \src, \src, 1
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc13: as mc11, but the horizontal pass reads from src + src_stride. */
+func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc33: horizontal pass from src + src_stride, vertical pass from src + 1. */
+func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        addi             \src, \src, 1
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
+endfunc
+/* mc22: combined hv loop only. */
+func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        mv               \src_stride, \dst_stride
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul
+endfunc
+/* mc21: put horizontal loop into the scratch buffer, then the hv _l2 loop. */
+func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+/* mc23: as mc21, but the horizontal pass reads from src + src_stride. */
+func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        add              \src, \src, \src_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_h_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+/* mc12: put vertical loop into the scratch buffer, then the hv _l2 loop. */
+func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_v_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+/* mc32: as mc12, but the vertical pass reads from src + 1. */
+func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x
+        lowpass_init     \lmul, \sizei, \size, \w0, \w1
+        push             \dst, \src
+        mv               \tmp, ra
+        addi             \src, \src, 1
+        mv               \src_stride, \dst_stride
+        addi             \dst, sp, -(\sizei * \sizei)
+        li               \dst_stride, \sizei
+        call             ff_put_h264_qpel_v_lowpass_\lmul
+        addi             \src2, sp, -(\sizei * \sizei)
+        mv               \src2_stride, \dst_stride
+        pop              \dst, \src
+        mv               \dst_stride, \src_stride
+        li               \size, \sizei
+        mv               ra, \tmp
+        j                ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
+endfunc
+.endm
+/* rvv256 entry points: for each block size they use half the LMUL of the
+ * rvv entry points further down (presumably meant for VLEN of at least 256
+ * bits - confirm against the init code). */
+        ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf4, 8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf8, 4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+/* avg flavour of the rvv256 entry points. */
+        ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf4, 8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf8, 4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+/* rvv entry points: m1 for 16x16, mf2 for 8x8, mf4 for 4x4. */
+        ff_h264_qpel_fns put, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf2, 8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns put, mf4, 4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+/* avg flavour of the rvv entry points. */
+        ff_h264_qpel_fns avg, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf2, 8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
+        ff_h264_qpel_fns avg, mf4, 4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
-- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2024-08-15 12:14 UTC | newest] Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-08-13 14:03 [FFmpeg-devel] [PATCH 1/7] checkasm: add csv/tsv bench output J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 2/7] checkasm: improve print format J. Dekker 2024-08-13 16:39 ` Lynne via ffmpeg-devel 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 3/7] checkasm: add wildcompares for test & functions J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 4/7] avutil/riscv/asm: add stack pushing helpers J. Dekker 2024-08-13 15:51 ` Rémi Denis-Courmont 2024-08-13 16:10 ` epirat07 2024-08-13 16:13 ` Rémi Denis-Courmont 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 5/7] avutil/riscv/asm: add helper macro to count varargs J. Dekker 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 6/7] avutil/riscv/asm: add generic push/pop helpers J. Dekker 2024-08-13 15:55 ` Rémi Denis-Courmont 2024-08-15 12:13 ` Niklas Haas 2024-08-13 14:03 ` [FFmpeg-devel] [PATCH 7/7] avcodec/riscv: add h264 qpel J. Dekker
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git