* [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. @ 2022-02-17 10:04 Alan Kelly 2022-04-22 8:04 ` Alan Kelly 2022-07-14 16:56 ` Michael Niedermayer 0 siblings, 2 replies; 10+ messages in thread From: Alan Kelly @ 2022-02-17 10:04 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Alan Kelly ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 19 ++++++++++++++++--- libswscale/x86/swscale.c | 6 ++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 7c8e1bbdde..d818c9ce55 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); - // avx2 hscale filter processes 16 pixel blocks. - if (!filter || dstW % 16 != 0) + if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 73869355b8..76f5a70fc5 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { - if (c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); - if (c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } -- 2.35.1.265.g69c8d7142f-goog _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-02-17 10:04 [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes Alan Kelly @ 2022-04-22 8:04 ` Alan Kelly 2022-07-13 8:05 ` Alan Kelly 2022-07-14 16:56 ` Michael Niedermayer 1 sibling, 1 reply; 10+ messages in thread From: Alan Kelly @ 2022-04-22 8:04 UTC (permalink / raw) To: ffmpeg-devel Ping! On Thu, Feb 17, 2022 at 11:04 AM Alan Kelly <alankelly@google.com> wrote: > ff_shuffle_filter_coefficients shuffles the tail as required. > --- > libswscale/utils.c | 19 ++++++++++++++++--- > libswscale/x86/swscale.c | 6 ++---- > 2 files changed, 18 insertions(+), 7 deletions(-) > > diff --git a/libswscale/utils.c b/libswscale/utils.c > index 7c8e1bbdde..d818c9ce55 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > #if ARCH_X86_64 > int i, j, k; > int cpu_flags = av_get_cpu_flags(); > - // avx2 hscale filter processes 16 pixel blocks. > - if (!filter || dstW % 16 != 0) > + if (!filter) > return 0; > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > // Do not swap filterPos for pixels which won't be processed by > // the main loop. > - for (i = 0; i + 8 <= dstW; i += 8) { > + for (i = 0; i + 16 <= dstW; i += 16) { > FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); > FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); > + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); > + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); > } > if (filterSize > 4) { > // 16 pixels are processed at a time. > @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > } > } > + // 4 pixels are processed at a time in the tail. 
> + for (; i < dstW; i += 4) { > + // 4 filter coeffs are processed at a time. > + int rem = dstW - i >= 4 ? 4 : dstW - i; > + for (k = 0; k + 4 <= filterSize; k += 4) { > + for (j = 0; j < rem; ++j) { > + int from = (i + j) * filterSize + k; > + int to = i * filterSize + j * 4 + k * 4; > + memcpy(&filter[to], &filterCopy[from], 4 * > sizeof(int16_t)); > + } > + } > + } > } > av_free(filterCopy); > } > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 73869355b8..76f5a70fc5 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ > > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > - if (c->chrDstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > - if (c->dstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > } > } > > -- > 2.35.1.265.g69c8d7142f-goog > > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-04-22 8:04 ` Alan Kelly @ 2022-07-13 8:05 ` Alan Kelly 0 siblings, 0 replies; 10+ messages in thread From: Alan Kelly @ 2022-07-13 8:05 UTC (permalink / raw) To: FFmpeg development discussions and patches Pushing this back up to the top. This is required to enable the previous patch in this chain. Thanks On Fri, Apr 22, 2022 at 10:04 AM Alan Kelly <alankelly@google.com> wrote: > Ping! > > On Thu, Feb 17, 2022 at 11:04 AM Alan Kelly <alankelly@google.com> wrote: > >> ff_shuffle_filter_coefficients shuffles the tail as required. >> --- >> libswscale/utils.c | 19 ++++++++++++++++--- >> libswscale/x86/swscale.c | 6 ++---- >> 2 files changed, 18 insertions(+), 7 deletions(-) >> >> diff --git a/libswscale/utils.c b/libswscale/utils.c >> index 7c8e1bbdde..d818c9ce55 100644 >> --- a/libswscale/utils.c >> +++ b/libswscale/utils.c >> @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int >> *filterPos, >> #if ARCH_X86_64 >> int i, j, k; >> int cpu_flags = av_get_cpu_flags(); >> - // avx2 hscale filter processes 16 pixel blocks. >> - if (!filter || dstW % 16 != 0) >> + if (!filter) >> return 0; >> if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & >> AV_CPU_FLAG_SLOW_GATHER)) { >> if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { >> @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, >> int *filterPos, >> } >> // Do not swap filterPos for pixels which won't be processed >> by >> // the main loop. >> - for (i = 0; i + 8 <= dstW; i += 8) { >> + for (i = 0; i + 16 <= dstW; i += 16) { >> FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); >> FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); >> + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); >> + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); >> } >> if (filterSize > 4) { >> // 16 pixels are processed at a time. 
>> @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, >> int *filterPos, >> } >> } >> } >> + // 4 pixels are processed at a time in the tail. >> + for (; i < dstW; i += 4) { >> + // 4 filter coeffs are processed at a time. >> + int rem = dstW - i >= 4 ? 4 : dstW - i; >> + for (k = 0; k + 4 <= filterSize; k += 4) { >> + for (j = 0; j < rem; ++j) { >> + int from = (i + j) * filterSize + k; >> + int to = i * filterSize + j * 4 + k * 4; >> + memcpy(&filter[to], &filterCopy[from], 4 * >> sizeof(int16_t)); >> + } >> + } >> + } >> } >> av_free(filterCopy); >> } >> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c >> index 73869355b8..76f5a70fc5 100644 >> --- a/libswscale/x86/swscale.c >> +++ b/libswscale/x86/swscale.c >> @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ >> >> if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & >> AV_CPU_FLAG_SLOW_GATHER)) { >> if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { >> - if (c->chrDstW % 16 == 0) >> - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); >> - if (c->dstW % 16 == 0) >> - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); >> + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); >> + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); >> } >> } >> >> -- >> 2.35.1.265.g69c8d7142f-goog >> >> _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-02-17 10:04 [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes Alan Kelly 2022-04-22 8:04 ` Alan Kelly @ 2022-07-14 16:56 ` Michael Niedermayer 2022-07-15 14:59 ` Alan Kelly 1 sibling, 1 reply; 10+ messages in thread From: Michael Niedermayer @ 2022-07-14 16:56 UTC (permalink / raw) To: FFmpeg development discussions and patches [-- Attachment #1.1: Type: text/plain, Size: 2060 bytes --] On Thu, Feb 17, 2022 at 11:04:20AM +0100, Alan Kelly wrote: > ff_shuffle_filter_coefficients shuffles the tail as required. > --- > libswscale/utils.c | 19 ++++++++++++++++--- > libswscale/x86/swscale.c | 6 ++---- > 2 files changed, 18 insertions(+), 7 deletions(-) It seems patches 3 and 4 together fail FATE with current git master (I think 1 & 2 are already applied): make V=2 fate-checkasm-sw_scale TEST checkasm-sw_scale ./tests/fate-run.sh fate-checkasm-sw_scale "fate-suite/" "" "ffmpeg" 'run tests/checkasm/checkasm --test=sw_scale' 'null' '' '' '1' '' '' '' '' '' '' '' '' '' '' ffmpeg/tests/checkasm/checkasm --test=sw_scale Test checkasm-sw_scale failed. Look at tests/data/fate/checkasm-sw_scale.err for details. 
checkasm: using random seed 1761410321 MMXEXT: - sw_scale.yuv2yuvX [OK] SSE2: - sw_scale.hscale [OK] SSE3: - sw_scale.yuv2yuvX [OK] SSSE3: - sw_scale.hscale [OK] SSE4.1: - sw_scale.hscale [OK] AVX2: hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) - sw_scale.hscale [FAILED] - sw_scale.yuv2yuvX [OK] checkasm: 12 of 504 tests have failed tests/Makefile:304: recipe for target 'fate-checkasm-sw_scale' failed make: *** [fate-checkasm-sw_scale] Error 1 [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB While the State exists there can be no freedom; when there is freedom there will be no State. -- Vladimir Lenin [-- Attachment #1.2: signature.asc --] [-- Type: application/pgp-signature, Size: 195 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-14 16:56 ` Michael Niedermayer @ 2022-07-15 14:59 ` Alan Kelly 2022-07-15 15:03 ` Alan Kelly 0 siblings, 1 reply; 10+ messages in thread From: Alan Kelly @ 2022-07-15 14:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Alan Kelly ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 19 ++++++++++++++++--- libswscale/x86/swscale.c | 6 ++---- tests/checkasm/sw_scale.c | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index cb4f5b521c..544b7fee96 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); - // avx2 hscale filter processes 16 pixel blocks. - if (!filter || dstW % 16 != 0) + if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 628f12137c..f628c71bd4 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { - if (c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); - if (c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index b643a47c30..798990a6cf 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -223,7 +223,7 @@ static void check_hscale(void) ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) - ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.37.0.170.g444d1eabd0-goog _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-15 14:59 ` Alan Kelly @ 2022-07-15 15:03 ` Alan Kelly 2022-07-16 11:14 ` Michael Niedermayer 0 siblings, 1 reply; 10+ messages in thread From: Alan Kelly @ 2022-07-15 15:03 UTC (permalink / raw) To: FFmpeg development discussions and patches Hi Michael, Thanks for looking at this. I fixed the test issue. Alan On Fri, Jul 15, 2022 at 4:59 PM Alan Kelly <alankelly@google.com> wrote: > ff_shuffle_filter_coefficients shuffles the tail as required. > --- > libswscale/utils.c | 19 ++++++++++++++++--- > libswscale/x86/swscale.c | 6 ++---- > tests/checkasm/sw_scale.c | 2 +- > 3 files changed, 19 insertions(+), 8 deletions(-) > > diff --git a/libswscale/utils.c b/libswscale/utils.c > index cb4f5b521c..544b7fee96 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > #if ARCH_X86_64 > int i, j, k; > int cpu_flags = av_get_cpu_flags(); > - // avx2 hscale filter processes 16 pixel blocks. > - if (!filter || dstW % 16 != 0) > + if (!filter) > return 0; > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > // Do not swap filterPos for pixels which won't be processed by > // the main loop. > - for (i = 0; i + 8 <= dstW; i += 8) { > + for (i = 0; i + 16 <= dstW; i += 16) { > FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); > FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); > + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); > + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); > } > if (filterSize > 4) { > // 16 pixels are processed at a time. > @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > } > } > + // 4 pixels are processed at a time in the tail. 
> + for (; i < dstW; i += 4) { > + // 4 filter coeffs are processed at a time. > + int rem = dstW - i >= 4 ? 4 : dstW - i; > + for (k = 0; k + 4 <= filterSize; k += 4) { > + for (j = 0; j < rem; ++j) { > + int from = (i + j) * filterSize + k; > + int to = i * filterSize + j * 4 + k * 4; > + memcpy(&filter[to], &filterCopy[from], 4 * > sizeof(int16_t)); > + } > + } > + } > } > av_free(filterCopy); > } > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 628f12137c..f628c71bd4 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ > > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > - if (c->chrDstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > - if (c->dstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > } > } > > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index b643a47c30..798990a6cf 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -223,7 +223,7 @@ static void check_hscale(void) > ff_sws_init_scale(ctx); > memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS > * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) > - ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, SRC_PIXELS); > + ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, ctx->dstW); > > if (check_func(ctx->hcScale, > "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, > ctx->dstW)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > -- > 2.37.0.170.g444d1eabd0-goog > > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org 
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-15 15:03 ` Alan Kelly @ 2022-07-16 11:14 ` Michael Niedermayer 2022-07-18 7:54 ` Alan Kelly 0 siblings, 1 reply; 10+ messages in thread From: Michael Niedermayer @ 2022-07-16 11:14 UTC (permalink / raw) To: FFmpeg development discussions and patches [-- Attachment #1.1: Type: text/plain, Size: 1455 bytes --] On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote: > Hi Michael, > > Thanks for looking at this. I fixed the test issue. seems to be still failing here: make distclean ; ./configure && make -j32 tests/checkasm/checkasm && tests/checkasm/checkasm --test=sw_scale checkasm: using random seed 1328711543 MMXEXT: - sw_scale.yuv2yuvX [OK] SSE2: - sw_scale.hscale [OK] SSE3: - sw_scale.yuv2yuvX [OK] SSSE3: - sw_scale.hscale [OK] SSE4.1: - sw_scale.hscale [OK] AVX2: hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) - sw_scale.hscale [FAILED] - sw_scale.yuv2yuvX [OK] checkasm: 12 of 504 tests have failed [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB It is a danger to trust the dream we wish for rather than the science we have, -- Dr. 
Kenneth Brown [-- Attachment #1.2: signature.asc --] [-- Type: application/pgp-signature, Size: 195 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-16 11:14 ` Michael Niedermayer @ 2022-07-18 7:54 ` Alan Kelly 2022-07-18 16:49 ` Michael Niedermayer 0 siblings, 1 reply; 10+ messages in thread From: Alan Kelly @ 2022-07-18 7:54 UTC (permalink / raw) To: FFmpeg development discussions and patches Hi Michael, I have tried to recreate this locally in a clean client applying the patches as sent in the email thread. I have tried gcc and mingw and this passes for me. Are you sure you applied both patches 3 & 4? If only patch 4 is applied, then I get the error you have. Thanks, Alan On Sat, Jul 16, 2022 at 1:14 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote: > > Hi Michael, > > > > Thanks for looking at this. I fixed the test issue. > > seems to be still failing here: > make distclean ; ./configure && make -j32 tests/checkasm/checkasm && > tests/checkasm/checkasm --test=sw_scale > checkasm: using random seed 1328711543 > MMXEXT: > - sw_scale.yuv2yuvX [OK] > SSE2: > - sw_scale.hscale [OK] > SSE3: > - sw_scale.yuv2yuvX [OK] > SSSE3: > - sw_scale.hscale [OK] > SSE4.1: > - sw_scale.hscale [OK] > AVX2: > hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) > hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) > - sw_scale.hscale [FAILED] > - sw_scale.yuv2yuvX [OK] > checkasm: 12 of 504 tests have failed > > > [...] 
> -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > It is a danger to trust the dream we wish for rather than > the science we have, -- Dr. Kenneth Brown > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-18 7:54 ` Alan Kelly @ 2022-07-18 16:49 ` Michael Niedermayer 2022-08-15 8:36 ` Alan Kelly 0 siblings, 1 reply; 10+ messages in thread From: Michael Niedermayer @ 2022-07-18 16:49 UTC (permalink / raw) To: FFmpeg development discussions and patches [-- Attachment #1.1: Type: text/plain, Size: 649 bytes --] On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote: > Hi Michael, > > I have tried to recreate this locally in a clean client applying the > patches as sent in the email thread. I have tried gcc and mingw and this > passes for me. Are you sure you applied both patches 3 & 4? If only patch 4 > is applied, then I get the error you have. I've retested and cannot reproduce the failure; I think I had patches #4 & #5 applied rather than #3 & #4. Thanks. [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Those who are too smart to engage in politics are punished by being governed by those who are dumber. -- Plato [-- Attachment #1.2: signature.asc --] [-- Type: application/pgp-signature, Size: 195 bytes --] [-- Attachment #2: Type: text/plain, Size: 251 bytes --] _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes. 2022-07-18 16:49 ` Michael Niedermayer @ 2022-08-15 8:36 ` Alan Kelly 0 siblings, 0 replies; 10+ messages in thread From: Alan Kelly @ 2022-08-15 8:36 UTC (permalink / raw) To: FFmpeg development discussions and patches Hi Michael, Is there anything blocking this change being applied? Is there anything I can do to help? Thanks, Alan On Mon, Jul 18, 2022 at 6:49 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote: > > Hi Michael, > > > > I have tried to recreate this locally in a clean client applying the > > patches as sent in the email thread. I have tried gcc and mingw and this > > passes for me. Are you sure you applied both patches 3 & 4? If only > patch 4 > > is applied, then I get the error you have. > > ive retested, and i cannot reproduce, i think i had #4 & #5 not #3 and #4 > applied > > thx > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Those who are too smart to engage in politics are punished by being > governed by those who are dumber. -- Plato > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-08-15 8:37 UTC | newest] Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-02-17 10:04 [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes Alan Kelly 2022-04-22 8:04 ` Alan Kelly 2022-07-13 8:05 ` Alan Kelly 2022-07-14 16:56 ` Michael Niedermayer 2022-07-15 14:59 ` Alan Kelly 2022-07-15 15:03 ` Alan Kelly 2022-07-16 11:14 ` Michael Niedermayer 2022-07-18 7:54 ` Alan Kelly 2022-07-18 16:49 ` Michael Niedermayer 2022-08-15 8:36 ` Alan Kelly
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git