* [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
@ 2023-02-09 14:25 aline.gondimsantos
2023-02-09 14:29 ` Nicolas George
2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
0 siblings, 2 replies; 15+ messages in thread
From: aline.gondimsantos @ 2023-02-09 14:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Aline Gondim Santos
From: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
---
libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
1 file changed, 39 insertions(+), 141 deletions(-)
diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
index 64a68ba497..05282911a9 100644
--- a/libavdevice/xcbgrab.c
+++ b/libavdevice/xcbgrab.c
@@ -29,11 +29,6 @@
#include <xcb/xfixes.h>
#endif
-#if CONFIG_LIBXCB_SHM
-#include <sys/shm.h>
-#include <xcb/shm.h>
-#endif
-
#if CONFIG_LIBXCB_SHAPE
#include <xcb/shape.h>
#endif
@@ -53,9 +48,6 @@ typedef struct XCBGrabContext {
xcb_connection_t *conn;
xcb_screen_t *screen;
xcb_window_t window;
-#if CONFIG_LIBXCB_SHM
- AVBufferPool *shm_pool;
-#endif
int64_t time_frame;
AVRational time_base;
int64_t frame_duration;
@@ -72,10 +64,9 @@ typedef struct XCBGrabContext {
int region_border;
int centered;
int select_region;
+ int is_area;
const char *framerate;
-
- int has_shm;
} XCBGrabContext;
#define FOLLOW_CENTER -1
@@ -97,6 +88,7 @@ static const AVOption options[] = {
{ "show_region", "Show the grabbing region.", OFFSET(show_region), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, D },
{ "region_border", "Set the region border thickness.", OFFSET(region_border), AV_OPT_TYPE_INT, { .i64 = 3 }, 1, 128, D },
{ "select_region", "Select the grabbing region graphically using the pointer.", OFFSET(select_region), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, D },
+ { "is_area", "Define if we are grabing a region of the display/window.", OFFSET(is_area), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, D },
{ NULL },
};
@@ -216,99 +208,6 @@ static int64_t wait_frame(AVFormatContext *s, AVPacket *pkt)
return curtime;
}
-#if CONFIG_LIBXCB_SHM
-static int check_shm(xcb_connection_t *conn)
-{
- xcb_shm_query_version_cookie_t cookie = xcb_shm_query_version(conn);
- xcb_shm_query_version_reply_t *reply;
-
- reply = xcb_shm_query_version_reply(conn, cookie, NULL);
- if (reply) {
- free(reply);
- return 1;
- }
-
- return 0;
-}
-
-static void free_shm_buffer(void *opaque, uint8_t *data)
-{
- shmdt(data);
-}
-
-static AVBufferRef *allocate_shm_buffer(void *opaque, size_t size)
-{
- xcb_connection_t *conn = opaque;
- xcb_shm_seg_t segment;
- AVBufferRef *ref;
- uint8_t *data;
- int id;
-
- id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
- if (id == -1)
- return NULL;
-
- segment = xcb_generate_id(conn);
- xcb_shm_attach(conn, segment, id, 0);
- data = shmat(id, NULL, 0);
- shmctl(id, IPC_RMID, 0);
- if ((intptr_t)data == -1 || !data)
- return NULL;
-
- ref = av_buffer_create(data, size, free_shm_buffer, (void *)(ptrdiff_t)segment, 0);
- if (!ref)
- shmdt(data);
-
- return ref;
-}
-
-static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
-{
- XCBGrabContext *c = s->priv_data;
- xcb_shm_get_image_cookie_t iq;
- xcb_shm_get_image_reply_t *img;
- xcb_drawable_t drawable = c->window_id;
- xcb_generic_error_t *e = NULL;
- AVBufferRef *buf;
- xcb_shm_seg_t segment;
-
- buf = av_buffer_pool_get(c->shm_pool);
- if (!buf) {
- av_log(s, AV_LOG_ERROR, "Could not get shared memory buffer.\n");
- return AVERROR(ENOMEM);
- }
- segment = (xcb_shm_seg_t)(uintptr_t)av_buffer_pool_buffer_get_opaque(buf);
-
- iq = xcb_shm_get_image(c->conn, drawable,
- c->x, c->y, c->width, c->height, ~0,
- XCB_IMAGE_FORMAT_Z_PIXMAP, segment, 0);
- img = xcb_shm_get_image_reply(c->conn, iq, &e);
-
- xcb_flush(c->conn);
-
- if (e) {
- av_log(s, AV_LOG_ERROR,
- "Cannot get the image data "
- "event_error: response_type:%u error_code:%u "
- "sequence:%u resource_id:%u minor_code:%u major_code:%u.\n",
- e->response_type, e->error_code,
- e->sequence, e->resource_id, e->minor_code, e->major_code);
-
- free(e);
- av_buffer_unref(&buf);
- return AVERROR(EACCES);
- }
-
- free(img);
-
- pkt->buf = buf;
- pkt->data = buf->data;
- pkt->size = c->frame_size;
-
- return 0;
-}
-#endif /* CONFIG_LIBXCB_SHM */
-
#if CONFIG_LIBXCB_XFIXES
static int check_xfixes(xcb_connection_t *conn)
{
@@ -462,14 +361,7 @@ static int xcbgrab_read_packet(AVFormatContext *s, AVPacket *pkt)
if (c->show_region)
xcbgrab_update_region(s, win_x, win_y);
-#if CONFIG_LIBXCB_SHM
- if (c->has_shm && xcbgrab_frame_shm(s, pkt) < 0) {
- av_log(s, AV_LOG_WARNING, "Continuing without shared memory.\n");
- c->has_shm = 0;
- }
-#endif
- if (!c->has_shm)
- ret = xcbgrab_frame(s, pkt);
+ ret = xcbgrab_frame(s, pkt);
pkt->dts = pkt->pts = pts;
pkt->duration = c->frame_duration;
@@ -488,11 +380,8 @@ static av_cold int xcbgrab_read_close(AVFormatContext *s)
{
XCBGrabContext *ctx = s->priv_data;
-#if CONFIG_LIBXCB_SHM
- av_buffer_pool_uninit(&ctx->shm_pool);
-#endif
-
xcb_disconnect(ctx->conn);
+ ctx->conn = NULL;
return 0;
}
@@ -572,7 +461,15 @@ static int pixfmt_from_pixmap_format(AVFormatContext *s, int depth,
static int create_stream(AVFormatContext *s)
{
XCBGrabContext *c = s->priv_data;
- AVStream *st = avformat_new_stream(s, NULL);
+
+ // If we try to open another stream to x11grab, there is no reason
+ // to keep more than one stream in the context.
+ AVStream *st;
+ if (!s->nb_streams) {
+ st = avformat_new_stream(s, NULL);
+ } else {
+ st = s->streams[0];
+ }
xcb_get_geometry_cookie_t gc;
xcb_get_geometry_reply_t *geo;
int64_t frame_size_bits;
@@ -594,7 +491,16 @@ static int create_stream(AVFormatContext *s)
return AVERROR_EXTERNAL;
}
+ // Width and Height are not 0 only when we set a window area to share
+ // This if may be valid only in the first call to create_stream
if (!c->width || !c->height) {
+ c->is_area = 0;
+ c->width = geo->width;
+ c->height = geo->height;
+ }
+ // If not a predefined area, then we should follow geometry changes
+ // This can be valid only on the second call onwards
+ if (!c->is_area && (c->width != geo->width || c->height != geo->height)) {
c->width = geo->width;
c->height = geo->height;
}
@@ -628,13 +534,6 @@ static int create_stream(AVFormatContext *s)
}
c->frame_size = frame_size_bits / 8;
-#if CONFIG_LIBXCB_SHM
- c->shm_pool = av_buffer_pool_init2(c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE,
- c->conn, allocate_shm_buffer, NULL);
- if (!c->shm_pool)
- return AVERROR(ENOMEM);
-#endif
-
st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
st->codecpar->codec_id = AV_CODEC_ID_RAWVIDEO;
st->codecpar->width = c->width;
@@ -829,23 +728,26 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
sscanf(s->url, "+%d,%d", &c->x, &c->y);
}
- c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
- av_freep(&display_name);
+ if (!c->conn || !c->screen) {
+ xcbgrab_read_close(s);
+ c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
+ av_freep(&display_name);
- if ((ret = xcb_connection_has_error(c->conn))) {
- av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
- s->url[0] ? s->url : "default", ret);
- return AVERROR(EIO);
- }
+ if ((ret = xcb_connection_has_error(c->conn))) {
+ av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
+ s->url[0] ? s->url : "default", ret);
+ return AVERROR(EIO);
+ }
- setup = xcb_get_setup(c->conn);
+ setup = xcb_get_setup(c->conn);
- c->screen = get_screen(setup, screen_num);
- if (!c->screen) {
- av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
- screen_num);
- xcbgrab_read_close(s);
- return AVERROR(EIO);
+ c->screen = get_screen(setup, screen_num);
+ if (!c->screen) {
+ av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
+ screen_num);
+ xcbgrab_read_close(s);
+ return AVERROR(EIO);
+ }
}
if (c->window_id == XCB_NONE)
@@ -876,10 +778,6 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
return ret;
}
-#if CONFIG_LIBXCB_SHM
- c->has_shm = check_shm(c->conn);
-#endif
-
#if CONFIG_LIBXCB_XFIXES
if (c->draw_mouse) {
if (!(c->draw_mouse = check_xfixes(c->conn))) {
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
2023-02-09 14:25 [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing aline.gondimsantos
@ 2023-02-09 14:29 ` Nicolas George
2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
1 sibling, 0 replies; 15+ messages in thread
From: Nicolas George @ 2023-02-09 14:29 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Aline Gondim Santos
aline.gondimsantos@savoirfairelinux.com (12023-02-09):
> From: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
>
> Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
> ---
> libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
> 1 file changed, 39 insertions(+), 141 deletions(-)
Hi. Thanks for the patch. Are you removing support for using shared
memory entirely to implement this feature? If so, can we see some
benchmarks comparisons?
Regards,
--
Nicolas George
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
2023-02-09 14:25 [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing aline.gondimsantos
2023-02-09 14:29 ` Nicolas George
@ 2023-02-09 18:19 ` Aline Gondim Santos
2023-02-09 18:19 ` [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing Aline Gondim Santos
1 sibling, 1 reply; 15+ messages in thread
From: Aline Gondim Santos @ 2023-02-09 18:19 UTC (permalink / raw)
To: ffmpeg-devel
Hello Nicolas,
Bellow you can find the bechmarks using `ffmpeg -benchmark` option.
1 - master
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1master.mp4
ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
[x11grab @ 0x564d03e165c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675963927.428661, bitrate: 3379200 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output1master.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 251 fps= 25 q=31.0 Lsize= 5720kB time=00:00:10.00 bitrate=4686.0kbits/s speed=0.996x
video:5718kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034719%
bench: utime=13.142s stime=0.307s rtime=10.039s
bench: maxrss=207576kB
2 - master
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2master.mp4
ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675963986.581500, bitrate: 472305 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 25 tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output2master.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 1274kB time=00:00:09.96 bitrate=1047.9kbits/s speed= 1x
video:1272kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.151768%
bench: utime=0.628s stime=1.465s rtime=9.920s
bench: maxrss=52268kB
3 - patch applied
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1.mp4
ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
[x11grab @ 0x55e86905b5c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675964519.431271, bitrate: 3379200 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output1.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 5723kB time=00:00:09.96 bitrate=4706.8kbits/s speed=0.996x
video:5721kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034227%
bench: utime=14.005s stime=0.168s rtime=9.998s
bench: maxrss=207828kB
4 - patch applied
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2.mp4
ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675964455.191272, bitrate: 472305 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 24.83 tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output2.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 24.83 fps, 19072 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 961kB time=00:00:10.02 bitrate= 785.0kbits/s speed=1.01x
video:959kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.199722%
bench: utime=1.624s stime=0.049s rtime=9.920s
bench: maxrss=51996kB
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
@ 2023-02-09 18:19 ` Aline Gondim Santos
2023-02-09 18:29 ` Aline Gondim Santos Gondim Santos
0 siblings, 1 reply; 15+ messages in thread
From: Aline Gondim Santos @ 2023-02-09 18:19 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Aline Gondim Santos
Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
---
libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
1 file changed, 39 insertions(+), 141 deletions(-)
diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
index 64a68ba497..05282911a9 100644
--- a/libavdevice/xcbgrab.c
+++ b/libavdevice/xcbgrab.c
@@ -29,11 +29,6 @@
#include <xcb/xfixes.h>
#endif
-#if CONFIG_LIBXCB_SHM
-#include <sys/shm.h>
-#include <xcb/shm.h>
-#endif
-
#if CONFIG_LIBXCB_SHAPE
#include <xcb/shape.h>
#endif
@@ -53,9 +48,6 @@ typedef struct XCBGrabContext {
xcb_connection_t *conn;
xcb_screen_t *screen;
xcb_window_t window;
-#if CONFIG_LIBXCB_SHM
- AVBufferPool *shm_pool;
-#endif
int64_t time_frame;
AVRational time_base;
int64_t frame_duration;
@@ -72,10 +64,9 @@ typedef struct XCBGrabContext {
int region_border;
int centered;
int select_region;
+ int is_area;
const char *framerate;
-
- int has_shm;
} XCBGrabContext;
#define FOLLOW_CENTER -1
@@ -97,6 +88,7 @@ static const AVOption options[] = {
{ "show_region", "Show the grabbing region.", OFFSET(show_region), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, D },
{ "region_border", "Set the region border thickness.", OFFSET(region_border), AV_OPT_TYPE_INT, { .i64 = 3 }, 1, 128, D },
{ "select_region", "Select the grabbing region graphically using the pointer.", OFFSET(select_region), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, D },
+ { "is_area", "Define if we are grabing a region of the display/window.", OFFSET(is_area), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, D },
{ NULL },
};
@@ -216,99 +208,6 @@ static int64_t wait_frame(AVFormatContext *s, AVPacket *pkt)
return curtime;
}
-#if CONFIG_LIBXCB_SHM
-static int check_shm(xcb_connection_t *conn)
-{
- xcb_shm_query_version_cookie_t cookie = xcb_shm_query_version(conn);
- xcb_shm_query_version_reply_t *reply;
-
- reply = xcb_shm_query_version_reply(conn, cookie, NULL);
- if (reply) {
- free(reply);
- return 1;
- }
-
- return 0;
-}
-
-static void free_shm_buffer(void *opaque, uint8_t *data)
-{
- shmdt(data);
-}
-
-static AVBufferRef *allocate_shm_buffer(void *opaque, size_t size)
-{
- xcb_connection_t *conn = opaque;
- xcb_shm_seg_t segment;
- AVBufferRef *ref;
- uint8_t *data;
- int id;
-
- id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
- if (id == -1)
- return NULL;
-
- segment = xcb_generate_id(conn);
- xcb_shm_attach(conn, segment, id, 0);
- data = shmat(id, NULL, 0);
- shmctl(id, IPC_RMID, 0);
- if ((intptr_t)data == -1 || !data)
- return NULL;
-
- ref = av_buffer_create(data, size, free_shm_buffer, (void *)(ptrdiff_t)segment, 0);
- if (!ref)
- shmdt(data);
-
- return ref;
-}
-
-static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
-{
- XCBGrabContext *c = s->priv_data;
- xcb_shm_get_image_cookie_t iq;
- xcb_shm_get_image_reply_t *img;
- xcb_drawable_t drawable = c->window_id;
- xcb_generic_error_t *e = NULL;
- AVBufferRef *buf;
- xcb_shm_seg_t segment;
-
- buf = av_buffer_pool_get(c->shm_pool);
- if (!buf) {
- av_log(s, AV_LOG_ERROR, "Could not get shared memory buffer.\n");
- return AVERROR(ENOMEM);
- }
- segment = (xcb_shm_seg_t)(uintptr_t)av_buffer_pool_buffer_get_opaque(buf);
-
- iq = xcb_shm_get_image(c->conn, drawable,
- c->x, c->y, c->width, c->height, ~0,
- XCB_IMAGE_FORMAT_Z_PIXMAP, segment, 0);
- img = xcb_shm_get_image_reply(c->conn, iq, &e);
-
- xcb_flush(c->conn);
-
- if (e) {
- av_log(s, AV_LOG_ERROR,
- "Cannot get the image data "
- "event_error: response_type:%u error_code:%u "
- "sequence:%u resource_id:%u minor_code:%u major_code:%u.\n",
- e->response_type, e->error_code,
- e->sequence, e->resource_id, e->minor_code, e->major_code);
-
- free(e);
- av_buffer_unref(&buf);
- return AVERROR(EACCES);
- }
-
- free(img);
-
- pkt->buf = buf;
- pkt->data = buf->data;
- pkt->size = c->frame_size;
-
- return 0;
-}
-#endif /* CONFIG_LIBXCB_SHM */
-
#if CONFIG_LIBXCB_XFIXES
static int check_xfixes(xcb_connection_t *conn)
{
@@ -462,14 +361,7 @@ static int xcbgrab_read_packet(AVFormatContext *s, AVPacket *pkt)
if (c->show_region)
xcbgrab_update_region(s, win_x, win_y);
-#if CONFIG_LIBXCB_SHM
- if (c->has_shm && xcbgrab_frame_shm(s, pkt) < 0) {
- av_log(s, AV_LOG_WARNING, "Continuing without shared memory.\n");
- c->has_shm = 0;
- }
-#endif
- if (!c->has_shm)
- ret = xcbgrab_frame(s, pkt);
+ ret = xcbgrab_frame(s, pkt);
pkt->dts = pkt->pts = pts;
pkt->duration = c->frame_duration;
@@ -488,11 +380,8 @@ static av_cold int xcbgrab_read_close(AVFormatContext *s)
{
XCBGrabContext *ctx = s->priv_data;
-#if CONFIG_LIBXCB_SHM
- av_buffer_pool_uninit(&ctx->shm_pool);
-#endif
-
xcb_disconnect(ctx->conn);
+ ctx->conn = NULL;
return 0;
}
@@ -572,7 +461,15 @@ static int pixfmt_from_pixmap_format(AVFormatContext *s, int depth,
static int create_stream(AVFormatContext *s)
{
XCBGrabContext *c = s->priv_data;
- AVStream *st = avformat_new_stream(s, NULL);
+
+ // If we try to open another stream to x11grab, there is no reason
+ // to keep more than one stream in the context.
+ AVStream *st;
+ if (!s->nb_streams) {
+ st = avformat_new_stream(s, NULL);
+ } else {
+ st = s->streams[0];
+ }
xcb_get_geometry_cookie_t gc;
xcb_get_geometry_reply_t *geo;
int64_t frame_size_bits;
@@ -594,7 +491,16 @@ static int create_stream(AVFormatContext *s)
return AVERROR_EXTERNAL;
}
+ // Width and Height are not 0 only when we set a window area to share
+ // This if may be valid only in the first call to create_stream
if (!c->width || !c->height) {
+ c->is_area = 0;
+ c->width = geo->width;
+ c->height = geo->height;
+ }
+ // If not a predefined area, then we should follow geometry changes
+ // This can be valid only on the second call onwards
+ if (!c->is_area && (c->width != geo->width || c->height != geo->height)) {
c->width = geo->width;
c->height = geo->height;
}
@@ -628,13 +534,6 @@ static int create_stream(AVFormatContext *s)
}
c->frame_size = frame_size_bits / 8;
-#if CONFIG_LIBXCB_SHM
- c->shm_pool = av_buffer_pool_init2(c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE,
- c->conn, allocate_shm_buffer, NULL);
- if (!c->shm_pool)
- return AVERROR(ENOMEM);
-#endif
-
st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
st->codecpar->codec_id = AV_CODEC_ID_RAWVIDEO;
st->codecpar->width = c->width;
@@ -829,23 +728,26 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
sscanf(s->url, "+%d,%d", &c->x, &c->y);
}
- c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
- av_freep(&display_name);
+ if (!c->conn || !c->screen) {
+ xcbgrab_read_close(s);
+ c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
+ av_freep(&display_name);
- if ((ret = xcb_connection_has_error(c->conn))) {
- av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
- s->url[0] ? s->url : "default", ret);
- return AVERROR(EIO);
- }
+ if ((ret = xcb_connection_has_error(c->conn))) {
+ av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
+ s->url[0] ? s->url : "default", ret);
+ return AVERROR(EIO);
+ }
- setup = xcb_get_setup(c->conn);
+ setup = xcb_get_setup(c->conn);
- c->screen = get_screen(setup, screen_num);
- if (!c->screen) {
- av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
- screen_num);
- xcbgrab_read_close(s);
- return AVERROR(EIO);
+ c->screen = get_screen(setup, screen_num);
+ if (!c->screen) {
+ av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
+ screen_num);
+ xcbgrab_read_close(s);
+ return AVERROR(EIO);
+ }
}
if (c->window_id == XCB_NONE)
@@ -876,10 +778,6 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
return ret;
}
-#if CONFIG_LIBXCB_SHM
- c->has_shm = check_shm(c->conn);
-#endif
-
#if CONFIG_LIBXCB_XFIXES
if (c->draw_mouse) {
if (!(c->draw_mouse = check_xfixes(c->conn))) {
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
2023-02-09 18:19 ` [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing Aline Gondim Santos
@ 2023-02-09 18:29 ` Aline Gondim Santos Gondim Santos
2023-02-09 20:09 ` Marton Balint
0 siblings, 1 reply; 15+ messages in thread
From: Aline Gondim Santos Gondim Santos @ 2023-02-09 18:29 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Hello Nicolas,
Bellow you can find the bechmarks using `ffmpeg -benchmark` option.
1 - master
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1master.mp4
ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
[x11grab @ 0x564d03e165c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675963927.428661, bitrate: 3379200 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output1master.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 251 fps= 25 q=31.0 Lsize= 5720kB time=00:00:10.00 bitrate=4686.0kbits/s speed=0.996x
video:5718kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034719%
bench: utime=13.142s stime=0.307s rtime=10.039s
bench: maxrss=207576kB
2 - master
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2master.mp4
ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675963986.581500, bitrate: 472305 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 25 tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output2master.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 1274kB time=00:00:09.96 bitrate=1047.9kbits/s speed= 1x
video:1272kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.151768%
bench: utime=0.628s stime=1.465s rtime=9.920s
bench: maxrss=52268kB
3 - patch applied
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1.mp4
ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
[x11grab @ 0x55e86905b5c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675964519.431271, bitrate: 3379200 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output1.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 5723kB time=00:00:09.96 bitrate=4706.8kbits/s speed=0.996x
video:5721kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034227%
bench: utime=14.005s stime=0.168s rtime=9.998s
bench: maxrss=207828kB
4 - patch applied
./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2.mp4
ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
configuration:
libavutil 57. 44.100 / 57. 44.100
libavcodec 59. 63.100 / 59. 63.100
libavformat 59. 38.100 / 59. 38.100
libavdevice 59. 8.101 / 59. 8.101
libavfilter 8. 56.100 / 8. 56.100
libswscale 6. 8.112 / 6. 8.112
libswresample 4. 9.100 / 4. 9.100
Input #0, x11grab, from ':1+0,0 1920x1080':
Duration: N/A, start: 1675964455.191272, bitrate: 472305 kb/s
Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 24.83 tbr, 1000k tbn
Stream mapping:
Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'output2.mp4':
Metadata:
encoder : Lavf59.38.100
Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 24.83 fps, 19072 tbn
Metadata:
encoder : Lavc59.63.100 mpeg4
Side data:
cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
frame= 250 fps= 25 q=31.0 Lsize= 961kB time=00:00:10.02 bitrate= 785.0kbits/s speed=1.01x
video:959kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.199722%
bench: utime=1.624s stime=0.049s rtime=9.920s
bench: maxrss=51996kB
Aline Gondim Santos,
Software Developer - Montreal, QC
Savoir-faire Linux inc.
Contactez moi sur [ https://jami.net/ | Jami ] : agsantos
----- Original Message -----
From: "Aline Gondim Santos Gondim Santos" <aline.gondimsantos@savoirfairelinux.com>
To: "ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
Cc: "Aline Gondim Santos Gondim Santos" <aline.gondimsantos@savoirfairelinux.com>
Sent: Thursday, 9 February, 2023 15:19:53
Subject: [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
---
libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
1 file changed, 39 insertions(+), 141 deletions(-)
diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
index 64a68ba497..05282911a9 100644
--- a/libavdevice/xcbgrab.c
+++ b/libavdevice/xcbgrab.c
@@ -29,11 +29,6 @@
#include <xcb/xfixes.h>
#endif
-#if CONFIG_LIBXCB_SHM
-#include <sys/shm.h>
-#include <xcb/shm.h>
-#endif
-
#if CONFIG_LIBXCB_SHAPE
#include <xcb/shape.h>
#endif
@@ -53,9 +48,6 @@ typedef struct XCBGrabContext {
xcb_connection_t *conn;
xcb_screen_t *screen;
xcb_window_t window;
-#if CONFIG_LIBXCB_SHM
- AVBufferPool *shm_pool;
-#endif
int64_t time_frame;
AVRational time_base;
int64_t frame_duration;
@@ -72,10 +64,9 @@ typedef struct XCBGrabContext {
int region_border;
int centered;
int select_region;
+ int is_area;
const char *framerate;
-
- int has_shm;
} XCBGrabContext;
#define FOLLOW_CENTER -1
@@ -97,6 +88,7 @@ static const AVOption options[] = {
{ "show_region", "Show the grabbing region.", OFFSET(show_region), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, D },
{ "region_border", "Set the region border thickness.", OFFSET(region_border), AV_OPT_TYPE_INT, { .i64 = 3 }, 1, 128, D },
{ "select_region", "Select the grabbing region graphically using the pointer.", OFFSET(select_region), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, D },
+ { "is_area", "Define if we are grabing a region of the display/window.", OFFSET(is_area), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, D },
{ NULL },
};
@@ -216,99 +208,6 @@ static int64_t wait_frame(AVFormatContext *s, AVPacket *pkt)
return curtime;
}
-#if CONFIG_LIBXCB_SHM
-static int check_shm(xcb_connection_t *conn)
-{
- xcb_shm_query_version_cookie_t cookie = xcb_shm_query_version(conn);
- xcb_shm_query_version_reply_t *reply;
-
- reply = xcb_shm_query_version_reply(conn, cookie, NULL);
- if (reply) {
- free(reply);
- return 1;
- }
-
- return 0;
-}
-
-static void free_shm_buffer(void *opaque, uint8_t *data)
-{
- shmdt(data);
-}
-
-static AVBufferRef *allocate_shm_buffer(void *opaque, size_t size)
-{
- xcb_connection_t *conn = opaque;
- xcb_shm_seg_t segment;
- AVBufferRef *ref;
- uint8_t *data;
- int id;
-
- id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
- if (id == -1)
- return NULL;
-
- segment = xcb_generate_id(conn);
- xcb_shm_attach(conn, segment, id, 0);
- data = shmat(id, NULL, 0);
- shmctl(id, IPC_RMID, 0);
- if ((intptr_t)data == -1 || !data)
- return NULL;
-
- ref = av_buffer_create(data, size, free_shm_buffer, (void *)(ptrdiff_t)segment, 0);
- if (!ref)
- shmdt(data);
-
- return ref;
-}
-
-static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
-{
- XCBGrabContext *c = s->priv_data;
- xcb_shm_get_image_cookie_t iq;
- xcb_shm_get_image_reply_t *img;
- xcb_drawable_t drawable = c->window_id;
- xcb_generic_error_t *e = NULL;
- AVBufferRef *buf;
- xcb_shm_seg_t segment;
-
- buf = av_buffer_pool_get(c->shm_pool);
- if (!buf) {
- av_log(s, AV_LOG_ERROR, "Could not get shared memory buffer.\n");
- return AVERROR(ENOMEM);
- }
- segment = (xcb_shm_seg_t)(uintptr_t)av_buffer_pool_buffer_get_opaque(buf);
-
- iq = xcb_shm_get_image(c->conn, drawable,
- c->x, c->y, c->width, c->height, ~0,
- XCB_IMAGE_FORMAT_Z_PIXMAP, segment, 0);
- img = xcb_shm_get_image_reply(c->conn, iq, &e);
-
- xcb_flush(c->conn);
-
- if (e) {
- av_log(s, AV_LOG_ERROR,
- "Cannot get the image data "
- "event_error: response_type:%u error_code:%u "
- "sequence:%u resource_id:%u minor_code:%u major_code:%u.\n",
- e->response_type, e->error_code,
- e->sequence, e->resource_id, e->minor_code, e->major_code);
-
- free(e);
- av_buffer_unref(&buf);
- return AVERROR(EACCES);
- }
-
- free(img);
-
- pkt->buf = buf;
- pkt->data = buf->data;
- pkt->size = c->frame_size;
-
- return 0;
-}
-#endif /* CONFIG_LIBXCB_SHM */
-
#if CONFIG_LIBXCB_XFIXES
static int check_xfixes(xcb_connection_t *conn)
{
@@ -462,14 +361,7 @@ static int xcbgrab_read_packet(AVFormatContext *s, AVPacket *pkt)
if (c->show_region)
xcbgrab_update_region(s, win_x, win_y);
-#if CONFIG_LIBXCB_SHM
- if (c->has_shm && xcbgrab_frame_shm(s, pkt) < 0) {
- av_log(s, AV_LOG_WARNING, "Continuing without shared memory.\n");
- c->has_shm = 0;
- }
-#endif
- if (!c->has_shm)
- ret = xcbgrab_frame(s, pkt);
+ ret = xcbgrab_frame(s, pkt);
pkt->dts = pkt->pts = pts;
pkt->duration = c->frame_duration;
@@ -488,11 +380,8 @@ static av_cold int xcbgrab_read_close(AVFormatContext *s)
{
XCBGrabContext *ctx = s->priv_data;
-#if CONFIG_LIBXCB_SHM
- av_buffer_pool_uninit(&ctx->shm_pool);
-#endif
-
xcb_disconnect(ctx->conn);
+ ctx->conn = NULL;
return 0;
}
@@ -572,7 +461,15 @@ static int pixfmt_from_pixmap_format(AVFormatContext *s, int depth,
static int create_stream(AVFormatContext *s)
{
XCBGrabContext *c = s->priv_data;
- AVStream *st = avformat_new_stream(s, NULL);
+
+ // If we try to open another stream to x11grab, there is no reason
+ // to keep more than one stream in the context.
+ AVStream *st;
+ if (!s->nb_streams) {
+ st = avformat_new_stream(s, NULL);
+ } else {
+ st = s->streams[0];
+ }
xcb_get_geometry_cookie_t gc;
xcb_get_geometry_reply_t *geo;
int64_t frame_size_bits;
@@ -594,7 +491,16 @@ static int create_stream(AVFormatContext *s)
return AVERROR_EXTERNAL;
}
+ // Width and Height are not 0 only when we set a window area to share
+ // This if may be valid only in the first call to create_stream
if (!c->width || !c->height) {
+ c->is_area = 0;
+ c->width = geo->width;
+ c->height = geo->height;
+ }
+ // If not a predefined area, then we should follow geometry changes
+ // This can be valid only on the second call onwards
+ if (!c->is_area && (c->width != geo->width || c->height != geo->height)) {
c->width = geo->width;
c->height = geo->height;
}
@@ -628,13 +534,6 @@ static int create_stream(AVFormatContext *s)
}
c->frame_size = frame_size_bits / 8;
-#if CONFIG_LIBXCB_SHM
- c->shm_pool = av_buffer_pool_init2(c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE,
- c->conn, allocate_shm_buffer, NULL);
- if (!c->shm_pool)
- return AVERROR(ENOMEM);
-#endif
-
st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
st->codecpar->codec_id = AV_CODEC_ID_RAWVIDEO;
st->codecpar->width = c->width;
@@ -829,23 +728,26 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
sscanf(s->url, "+%d,%d", &c->x, &c->y);
}
- c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
- av_freep(&display_name);
+ if (!c->conn || !c->screen) {
+ xcbgrab_read_close(s);
+ c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
+ av_freep(&display_name);
- if ((ret = xcb_connection_has_error(c->conn))) {
- av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
- s->url[0] ? s->url : "default", ret);
- return AVERROR(EIO);
- }
+ if ((ret = xcb_connection_has_error(c->conn))) {
+ av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
+ s->url[0] ? s->url : "default", ret);
+ return AVERROR(EIO);
+ }
- setup = xcb_get_setup(c->conn);
+ setup = xcb_get_setup(c->conn);
- c->screen = get_screen(setup, screen_num);
- if (!c->screen) {
- av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
- screen_num);
- xcbgrab_read_close(s);
- return AVERROR(EIO);
+ c->screen = get_screen(setup, screen_num);
+ if (!c->screen) {
+ av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
+ screen_num);
+ xcbgrab_read_close(s);
+ return AVERROR(EIO);
+ }
}
if (c->window_id == XCB_NONE)
@@ -876,10 +778,6 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
return ret;
}
-#if CONFIG_LIBXCB_SHM
- c->has_shm = check_shm(c->conn);
-#endif
-
#if CONFIG_LIBXCB_XFIXES
if (c->draw_mouse) {
if (!(c->draw_mouse = check_xfixes(c->conn))) {
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
2023-02-09 18:29 ` Aline Gondim Santos Gondim Santos
@ 2023-02-09 20:09 ` Marton Balint
0 siblings, 0 replies; 15+ messages in thread
From: Marton Balint @ 2023-02-09 20:09 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 9 Feb 2023, Aline Gondim Santos Gondim Santos wrote:
> Hello Nicolas,
> Bellow you can find the bechmarks using `ffmpeg -benchmark` option.
Utime change seems significant - as expected.
In general I don't like the idea of removing shared memory support, and I
don't quite see why window resizing cannot be supported with shm.
Regards,
Marton
>
> 1 - master
>
> ./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1master.mp4
> ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
> built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
> configuration:
> libavutil 57. 44.100 / 57. 44.100
> libavcodec 59. 63.100 / 59. 63.100
> libavformat 59. 38.100 / 59. 38.100
> libavdevice 59. 8.101 / 59. 8.101
> libavfilter 8. 56.100 / 8. 56.100
> libswscale 6. 8.112 / 6. 8.112
> libswresample 4. 9.100 / 4. 9.100
> [x11grab @ 0x564d03e165c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
> Input #0, x11grab, from ':1+0,0 1920x1080':
> Duration: N/A, start: 1675963927.428661, bitrate: 3379200 kb/s
> Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
> Stream mapping:
> Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
> Press [q] to stop, [?] for help
> Output #0, mp4, to 'output1master.mp4':
> Metadata:
> encoder : Lavf59.38.100
> Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
> Metadata:
> encoder : Lavc59.63.100 mpeg4
> Side data:
> cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
> frame= 251 fps= 25 q=31.0 Lsize= 5720kB time=00:00:10.00 bitrate=4686.0kbits/s speed=0.996x
> video:5718kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034719%
> bench: utime=13.142s stime=0.307s rtime=10.039s
> bench: maxrss=207576kB
>
> 2 - master
>
> ./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2master.mp4
> ffmpeg version N-109782-g458ae405ef Copyright (c) 2000-2023 the FFmpeg developers
> built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
> configuration:
> libavutil 57. 44.100 / 57. 44.100
> libavcodec 59. 63.100 / 59. 63.100
> libavformat 59. 38.100 / 59. 38.100
> libavdevice 59. 8.101 / 59. 8.101
> libavfilter 8. 56.100 / 8. 56.100
> libswscale 6. 8.112 / 6. 8.112
> libswresample 4. 9.100 / 4. 9.100
> Input #0, x11grab, from ':1+0,0 1920x1080':
> Duration: N/A, start: 1675963986.581500, bitrate: 472305 kb/s
> Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 25 tbr, 1000k tbn
> Stream mapping:
> Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
> Press [q] to stop, [?] for help
> Output #0, mp4, to 'output2master.mp4':
> Metadata:
> encoder : Lavf59.38.100
> Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 25 fps, 12800 tbn
> Metadata:
> encoder : Lavc59.63.100 mpeg4
> Side data:
> cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
> frame= 250 fps= 25 q=31.0 Lsize= 1274kB time=00:00:09.96 bitrate=1047.9kbits/s speed= 1x
> video:1272kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.151768%
> bench: utime=0.628s stime=1.465s rtime=9.920s
> bench: maxrss=52268kB
>
>
> 3 - patch applied
>
> ./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -i ":1+0,0 1920x1080" output1.mp4
> ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
> built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
> configuration:
> libavutil 57. 44.100 / 57. 44.100
> libavcodec 59. 63.100 / 59. 63.100
> libavformat 59. 38.100 / 59. 38.100
> libavdevice 59. 8.101 / 59. 8.101
> libavfilter 8. 56.100 / 8. 56.100
> libswscale 6. 8.112 / 6. 8.112
> libswresample 4. 9.100 / 4. 9.100
> [x11grab @ 0x55e86905b5c0] Stream #0: not enough frames to estimate rate; consider increasing probesize
> Input #0, x11grab, from ':1+0,0 1920x1080':
> Duration: N/A, start: 1675964519.431271, bitrate: 3379200 kb/s
> Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 3520x1200, 3379200 kb/s, 25 fps, 1000k tbr, 1000k tbn
> Stream mapping:
> Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
> Press [q] to stop, [?] for help
> Output #0, mp4, to 'output1.mp4':
> Metadata:
> encoder : Lavf59.38.100
> Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 3520x1200, q=2-31, 200 kb/s, 25 fps, 12800 tbn
> Metadata:
> encoder : Lavc59.63.100 mpeg4
> Side data:
> cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
> frame= 250 fps= 25 q=31.0 Lsize= 5723kB time=00:00:09.96 bitrate=4706.8kbits/s speed=0.996x
> video:5721kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.034227%
> bench: utime=14.005s stime=0.168s rtime=9.998s
> bench: maxrss=207828kB
>
>
> 4 - patch applied
>
>
> ./ffmpeg -benchmark -t 10 -framerate 25 -f x11grab -window_id 0x5600008 -i ":1+0,0 1920x1080" output2.mp4
> ffmpeg version N-109783-g2352934f8b Copyright (c) 2000-2023 the FFmpeg developers
> built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
> configuration:
> libavutil 57. 44.100 / 57. 44.100
> libavcodec 59. 63.100 / 59. 63.100
> libavformat 59. 38.100 / 59. 38.100
> libavdevice 59. 8.101 / 59. 8.101
> libavfilter 8. 56.100 / 8. 56.100
> libswscale 6. 8.112 / 6. 8.112
> libswresample 4. 9.100 / 4. 9.100
> Input #0, x11grab, from ':1+0,0 1920x1080':
> Duration: N/A, start: 1675964455.191272, bitrate: 472305 kb/s
> Stream #0:0: Video: rawvideo (BGR[0] / 0x524742), bgr0, 841x702, 472305 kb/s, 25 fps, 24.83 tbr, 1000k tbn
> Stream mapping:
> Stream #0:0 -> #0:0 (rawvideo (native) -> mpeg4 (native))
> Press [q] to stop, [?] for help
> Output #0, mp4, to 'output2.mp4':
> Metadata:
> encoder : Lavf59.38.100
> Stream #0:0: Video: mpeg4 (mp4v / 0x7634706D), yuv420p(tv, progressive), 841x702, q=2-31, 200 kb/s, 24.83 fps, 19072 tbn
> Metadata:
> encoder : Lavc59.63.100 mpeg4
> Side data:
> cpb: bitrate max/min/avg: 0/0/200000 buffer size: 0 vbv_delay: N/A
> frame= 250 fps= 25 q=31.0 Lsize= 961kB time=00:00:10.02 bitrate= 785.0kbits/s speed=1.01x
> video:959kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.199722%
> bench: utime=1.624s stime=0.049s rtime=9.920s
> bench: maxrss=51996kB
>
> Aline Gondim Santos,
> Software Developer - Montreal, QC
>
> Savoir-faire Linux inc.
> Contactez moi sur [ https://jami.net/ | Jami ] : agsantos
>
> ----- Original Message -----
> From: "Aline Gondim Santos Gondim Santos" <aline.gondimsantos@savoirfairelinux.com>
> To: "ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
> Cc: "Aline Gondim Santos Gondim Santos" <aline.gondimsantos@savoirfairelinux.com>
> Sent: Thursday, 9 February, 2023 15:19:53
> Subject: [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
>
> Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
> ---
> libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
> 1 file changed, 39 insertions(+), 141 deletions(-)
>
> diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
> index 64a68ba497..05282911a9 100644
> --- a/libavdevice/xcbgrab.c
> +++ b/libavdevice/xcbgrab.c
> @@ -29,11 +29,6 @@
> #include <xcb/xfixes.h>
> #endif
>
> -#if CONFIG_LIBXCB_SHM
> -#include <sys/shm.h>
> -#include <xcb/shm.h>
> -#endif
> -
> #if CONFIG_LIBXCB_SHAPE
> #include <xcb/shape.h>
> #endif
> @@ -53,9 +48,6 @@ typedef struct XCBGrabContext {
> xcb_connection_t *conn;
> xcb_screen_t *screen;
> xcb_window_t window;
> -#if CONFIG_LIBXCB_SHM
> - AVBufferPool *shm_pool;
> -#endif
> int64_t time_frame;
> AVRational time_base;
> int64_t frame_duration;
> @@ -72,10 +64,9 @@ typedef struct XCBGrabContext {
> int region_border;
> int centered;
> int select_region;
> + int is_area;
>
> const char *framerate;
> -
> - int has_shm;
> } XCBGrabContext;
>
> #define FOLLOW_CENTER -1
> @@ -97,6 +88,7 @@ static const AVOption options[] = {
> { "show_region", "Show the grabbing region.", OFFSET(show_region), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, D },
> { "region_border", "Set the region border thickness.", OFFSET(region_border), AV_OPT_TYPE_INT, { .i64 = 3 }, 1, 128, D },
> { "select_region", "Select the grabbing region graphically using the pointer.", OFFSET(select_region), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, D },
> + { "is_area", "Define if we are grabing a region of the display/window.", OFFSET(is_area), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, D },
> { NULL },
> };
>
> @@ -216,99 +208,6 @@ static int64_t wait_frame(AVFormatContext *s, AVPacket *pkt)
> return curtime;
> }
>
> -#if CONFIG_LIBXCB_SHM
> -static int check_shm(xcb_connection_t *conn)
> -{
> - xcb_shm_query_version_cookie_t cookie = xcb_shm_query_version(conn);
> - xcb_shm_query_version_reply_t *reply;
> -
> - reply = xcb_shm_query_version_reply(conn, cookie, NULL);
> - if (reply) {
> - free(reply);
> - return 1;
> - }
> -
> - return 0;
> -}
> -
> -static void free_shm_buffer(void *opaque, uint8_t *data)
> -{
> - shmdt(data);
> -}
> -
> -static AVBufferRef *allocate_shm_buffer(void *opaque, size_t size)
> -{
> - xcb_connection_t *conn = opaque;
> - xcb_shm_seg_t segment;
> - AVBufferRef *ref;
> - uint8_t *data;
> - int id;
> -
> - id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
> - if (id == -1)
> - return NULL;
> -
> - segment = xcb_generate_id(conn);
> - xcb_shm_attach(conn, segment, id, 0);
> - data = shmat(id, NULL, 0);
> - shmctl(id, IPC_RMID, 0);
> - if ((intptr_t)data == -1 || !data)
> - return NULL;
> -
> - ref = av_buffer_create(data, size, free_shm_buffer, (void *)(ptrdiff_t)segment, 0);
> - if (!ref)
> - shmdt(data);
> -
> - return ref;
> -}
> -
> -static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
> -{
> - XCBGrabContext *c = s->priv_data;
> - xcb_shm_get_image_cookie_t iq;
> - xcb_shm_get_image_reply_t *img;
> - xcb_drawable_t drawable = c->window_id;
> - xcb_generic_error_t *e = NULL;
> - AVBufferRef *buf;
> - xcb_shm_seg_t segment;
> -
> - buf = av_buffer_pool_get(c->shm_pool);
> - if (!buf) {
> - av_log(s, AV_LOG_ERROR, "Could not get shared memory buffer.\n");
> - return AVERROR(ENOMEM);
> - }
> - segment = (xcb_shm_seg_t)(uintptr_t)av_buffer_pool_buffer_get_opaque(buf);
> -
> - iq = xcb_shm_get_image(c->conn, drawable,
> - c->x, c->y, c->width, c->height, ~0,
> - XCB_IMAGE_FORMAT_Z_PIXMAP, segment, 0);
> - img = xcb_shm_get_image_reply(c->conn, iq, &e);
> -
> - xcb_flush(c->conn);
> -
> - if (e) {
> - av_log(s, AV_LOG_ERROR,
> - "Cannot get the image data "
> - "event_error: response_type:%u error_code:%u "
> - "sequence:%u resource_id:%u minor_code:%u major_code:%u.\n",
> - e->response_type, e->error_code,
> - e->sequence, e->resource_id, e->minor_code, e->major_code);
> -
> - free(e);
> - av_buffer_unref(&buf);
> - return AVERROR(EACCES);
> - }
> -
> - free(img);
> -
> - pkt->buf = buf;
> - pkt->data = buf->data;
> - pkt->size = c->frame_size;
> -
> - return 0;
> -}
> -#endif /* CONFIG_LIBXCB_SHM */
> -
> #if CONFIG_LIBXCB_XFIXES
> static int check_xfixes(xcb_connection_t *conn)
> {
> @@ -462,14 +361,7 @@ static int xcbgrab_read_packet(AVFormatContext *s, AVPacket *pkt)
> if (c->show_region)
> xcbgrab_update_region(s, win_x, win_y);
>
> -#if CONFIG_LIBXCB_SHM
> - if (c->has_shm && xcbgrab_frame_shm(s, pkt) < 0) {
> - av_log(s, AV_LOG_WARNING, "Continuing without shared memory.\n");
> - c->has_shm = 0;
> - }
> -#endif
> - if (!c->has_shm)
> - ret = xcbgrab_frame(s, pkt);
> + ret = xcbgrab_frame(s, pkt);
> pkt->dts = pkt->pts = pts;
> pkt->duration = c->frame_duration;
>
> @@ -488,11 +380,8 @@ static av_cold int xcbgrab_read_close(AVFormatContext *s)
> {
> XCBGrabContext *ctx = s->priv_data;
>
> -#if CONFIG_LIBXCB_SHM
> - av_buffer_pool_uninit(&ctx->shm_pool);
> -#endif
> -
> xcb_disconnect(ctx->conn);
> + ctx->conn = NULL;
>
> return 0;
> }
> @@ -572,7 +461,15 @@ static int pixfmt_from_pixmap_format(AVFormatContext *s, int depth,
> static int create_stream(AVFormatContext *s)
> {
> XCBGrabContext *c = s->priv_data;
> - AVStream *st = avformat_new_stream(s, NULL);
> +
> + // If we try to open another stream to x11grab, there is no reason
> + // to keep more than one stream in the context.
> + AVStream *st;
> + if (!s->nb_streams) {
> + st = avformat_new_stream(s, NULL);
> + } else {
> + st = s->streams[0];
> + }
> xcb_get_geometry_cookie_t gc;
> xcb_get_geometry_reply_t *geo;
> int64_t frame_size_bits;
> @@ -594,7 +491,16 @@ static int create_stream(AVFormatContext *s)
> return AVERROR_EXTERNAL;
> }
>
> + // Width and Height are not 0 only when we set a window area to share
> + // This if may be valid only in the first call to create_stream
> if (!c->width || !c->height) {
> + c->is_area = 0;
> + c->width = geo->width;
> + c->height = geo->height;
> + }
> + // If not a predefined area, then we should follow geometry changes
> + // This can be valid only on the second call onwards
> + if (!c->is_area && (c->width != geo->width || c->height != geo->height)) {
> c->width = geo->width;
> c->height = geo->height;
> }
> @@ -628,13 +534,6 @@ static int create_stream(AVFormatContext *s)
> }
> c->frame_size = frame_size_bits / 8;
>
> -#if CONFIG_LIBXCB_SHM
> - c->shm_pool = av_buffer_pool_init2(c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE,
> - c->conn, allocate_shm_buffer, NULL);
> - if (!c->shm_pool)
> - return AVERROR(ENOMEM);
> -#endif
> -
> st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
> st->codecpar->codec_id = AV_CODEC_ID_RAWVIDEO;
> st->codecpar->width = c->width;
> @@ -829,23 +728,26 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
> sscanf(s->url, "+%d,%d", &c->x, &c->y);
> }
>
> - c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
> - av_freep(&display_name);
> + if (!c->conn || !c->screen) {
> + xcbgrab_read_close(s);
> + c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
> + av_freep(&display_name);
>
> - if ((ret = xcb_connection_has_error(c->conn))) {
> - av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
> - s->url[0] ? s->url : "default", ret);
> - return AVERROR(EIO);
> - }
> + if ((ret = xcb_connection_has_error(c->conn))) {
> + av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
> + s->url[0] ? s->url : "default", ret);
> + return AVERROR(EIO);
> + }
>
> - setup = xcb_get_setup(c->conn);
> + setup = xcb_get_setup(c->conn);
>
> - c->screen = get_screen(setup, screen_num);
> - if (!c->screen) {
> - av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
> - screen_num);
> - xcbgrab_read_close(s);
> - return AVERROR(EIO);
> + c->screen = get_screen(setup, screen_num);
> + if (!c->screen) {
> + av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
> + screen_num);
> + xcbgrab_read_close(s);
> + return AVERROR(EIO);
> + }
> }
>
> if (c->window_id == XCB_NONE)
> @@ -876,10 +778,6 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
> return ret;
> }
>
> -#if CONFIG_LIBXCB_SHM
> - c->has_shm = check_shm(c->conn);
> -#endif
> -
> #if CONFIG_LIBXCB_XFIXES
> if (c->draw_mouse) {
> if (!(c->draw_mouse = check_xfixes(c->conn))) {
> --
> 2.34.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-08-07 15:58 cyfdel-at-hotmail.com
0 siblings, 0 replies; 15+ messages in thread
From: cyfdel-at-hotmail.com @ 2024-08-07 15:58 UTC (permalink / raw)
To: ffmpeg-devel
hat the patch does:
fix gdigrab capture a window with hwnd shows "Invalid window
handle x, must be a vlid integer", althought a valid integer is
input
why:
line 284 of libavdevice/gdigrab.c, one of the condition leads to
check failed is p[0]='\0'. if a integer only string is process,
the p[0] after strtoull process will be null which equal to
'\0', otherwise, a non-integer string will make p[0] not null to
pass the check
how:
change p[0]=='\0' to p[0]!='\0' will works. no any side effect
reproduce and verify:
a simple command: ffmpeg -f gdigrab -i hwnd=12345
* althought a workaround command will work currently:
* ffmpeg -f gdigrab -i hwnd=12345x. (x could be any char)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-04-18 9:42 pengxu
0 siblings, 0 replies; 15+ messages in thread
From: pengxu @ 2024-04-18 9:42 UTC (permalink / raw)
To: ffmpeg-devel
v2: Fixed fate errors in [Patch 2/2]
v3: Fixed fate errors in [Patch 2/2]
Subject:[PATCH V3][Loongarch]Optimize aac decode/encode for Loongarch by LSX
In-Reply-To:
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-04-18 7:36 pengxu
0 siblings, 0 replies; 15+ messages in thread
From: pengxu @ 2024-04-18 7:36 UTC (permalink / raw)
To: ffmpeg-devel
v2: Fixed build errors in [PATCH 2/2]
Subject: [PATCH V2][Loongarch]Optimize aac decode/encode for Loongarch by LSX
In-Reply-To:
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14 8:40 Logan.Lyu
0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14 8:40 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 21092 bytes --]
checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 397 ++++++++++++++++++++++
2 files changed, 402 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
*c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv,
_i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h
,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,
_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,
_i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1,
epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm,
export=1
ret
endfunc
+
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr d16, [sp]
+ ldr d17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d18, [sp]
+ ldr d19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d20, [sp]
+ ldr d21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x8, #120
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, #32
+ add w10, w3, #7
+ st1 {v12.8b-v15.8b}, [sp]
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h24_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ ld1 {v26.8h-v28.8h}, [sp], x7
+1: ld1 {v29.8h-v31.8h}, [sp], x7
+ calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+ calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29,
sqshrn2
+ calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+ calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30,
sqshrn2
+ calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+ calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+ calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8,
sqshrn2
+ calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+ calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9,
sqshrn2
+ calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+ calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+ calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11,
sqshrn2
+ calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+ calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12,
sqshrn2
+ calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+ calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+ calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14,
sqshrn2
+ calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+ calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15,
sqshrn2
+ calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+ calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+ calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17,
sqshrn2
+ calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+ calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18,
sqshrn2
+ calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+ calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+ calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20,
sqshrn2
+ calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+ calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21,
sqshrn2
+ calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+ calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+ calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23,
sqshrn2
+ calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+ calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24,
sqshrn2
+ calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+ calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v26.8h-v28.8h}, [sp], x7
+ calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+ calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26,
sqshrn2
+ calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+ calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27,
sqshrn2
+ calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+ calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28,
sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.hi 1b
+2: ld1 {v12.8b-v15.8b}, [sp], #32
+ ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
.macro QPEL_UNI_W_HV_HEADER width
ldp x14, x15, [sp] // mx, my
ldr w13, [sp, #16] // width
--
2.38.0.windows.1
[-- Attachment #2: 0004-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_hv.patch --]
[-- Type: text/plain, Size: 21203 bytes --]
From 6a7f049fd0382c04297fb9cefd9f5ce022abbe5f Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 22:40:51 +0800
Subject: [PATCH 4/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_hv
checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 397 ++++++++++++++++++++++
2 files changed, 402 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr d16, [sp]
+ ldr d17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d18, [sp]
+ ldr d19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d20, [sp]
+ ldr d21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x8, #120
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, #32
+ add w10, w3, #7
+ st1 {v12.8b-v15.8b}, [sp]
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h24_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ ld1 {v26.8h-v28.8h}, [sp], x7
+1: ld1 {v29.8h-v31.8h}, [sp], x7
+ calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+ calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+ calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+ calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+ calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+ calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+ calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+ calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+ calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+ calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+ calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+ calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+ calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+ calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+ calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+ calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+ calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+ calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+ calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+ calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+ calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+ calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+ calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+ calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+ calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+ calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+ calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+ calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+ calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+ calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+ calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+ calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+ calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+ calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+ calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+ calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v26.8h-v28.8h}, [sp], x7
+ calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+ calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+ calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+ calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+ calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+ calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.hi 1b
+2: ld1 {v12.8b-v15.8b}, [sp], #32
+ ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
.macro QPEL_UNI_W_HV_HEADER width
ldp x14, x15, [sp] // mx, my
ldr w13, [sp, #16] // width
--
2.38.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14 8:40 Logan.Lyu
0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14 8:40 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 17977 bytes --]
checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 347 +++++++++++++++++++---
2 files changed, 314 insertions(+), 38 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
*c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
.endif
.endm
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17,
v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19,
v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21,
v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23,
v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25,
v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27,
v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29,
v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31,
v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
.macro put_hevc type
.ifc \type, qpel
// void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr s16, [x1]
+ ldr s17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s18, [x1]
+ ldr s19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s20, [x1]
+ ldr s21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5,
\src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// todo: reads #32 bytes
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b, v9.8b, v10.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b, v9.8b, v10.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x2, x3, [sp, #-48]!
+ stp x0, x1, [sp, #16]
+ stp x5, x30, [sp, #32]
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp]
+ ldp x0, x1, [sp, #16]
+ ldr x5, [sp, #32]
+ add sp, sp, #32
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldr x30, [sp, #8]
+ add sp, sp, #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7,
src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4,
\src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12,
\src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1:
ldr s0, [x2]
@@ -663,25 +972,6 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon,
export=1
ret
endfunc
-.macro calc_all
- calc v23, v16, v17, v18, v19, v20, v21, v22, v23
- b.eq 2f
- calc v16, v17, v18, v19, v20, v21, v22, v23, v16
- b.eq 2f
- calc v17, v18, v19, v20, v21, v22, v23, v16, v17
- b.eq 2f
- calc v18, v19, v20, v21, v22, v23, v16, v17, v18
- b.eq 2f
- calc v19, v20, v21, v22, v23, v16, v17, v18, v19
- b.eq 2f
- calc v20, v21, v22, v23, v16, v17, v18, v19, v20
- b.eq 2f
- calc v21, v22, v23, v16, v17, v18, v19, v20, v21
- b.eq 2f
- calc v22, v23, v16, v17, v18, v19, v20, v21, v22
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
load_qpel_filterb x6, x5
sub x2, x2, x3, lsl #1
@@ -1559,25 +1849,6 @@ endfunc
#if HAVE_I8MM
-.macro calc_all2
- calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17,
v19, v21, v23, v25, v27, v29, v31
- b.eq 2f
- calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19,
v21, v23, v25, v27, v29, v31, v17
- b.eq 2f
- calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21,
v23, v25, v27, v29, v31, v17, v19
- b.eq 2f
- calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23,
v25, v27, v29, v31, v17, v19, v21
- b.eq 2f
- calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25,
v27, v29, v31, v17, v19, v21, v23
- b.eq 2f
- calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27,
v29, v31, v17, v19, v21, v23, v25
- b.eq 2f
- calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29,
v31, v17, v19, v21, v23, v25, v27
- b.eq 2f
- calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31,
v17, v19, v21, v23, v25, v27, v29
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #7
lsl x10, x10, #7
--
2.38.0.windows.1
[-- Attachment #2: 0003-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_v.patch --]
[-- Type: text/plain, Size: 18085 bytes --]
From 3cb075a5fcf0e696a55bcce8fa6415c1d2830fad Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 21:54:48 +0800
Subject: [PATCH 3/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_v
checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 347 +++++++++++++++++++---
2 files changed, 314 insertions(+), 38 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
.endif
.endm
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
.macro put_hevc type
.ifc \type, qpel
// void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr s16, [x1]
+ ldr s17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s18, [x1]
+ ldr s19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s20, [x1]
+ ldr s21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// todo: reads #32 bytes
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b, v9.8b, v10.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b, v9.8b, v10.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x2, x3, [sp, #-48]!
+ stp x0, x1, [sp, #16]
+ stp x5, x30, [sp, #32]
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp]
+ ldp x0, x1, [sp, #16]
+ ldr x5, [sp, #32]
+ add sp, sp, #32
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldr x30, [sp, #8]
+ add sp, sp, #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1:
ldr s0, [x2]
@@ -663,25 +972,6 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
ret
endfunc
-.macro calc_all
- calc v23, v16, v17, v18, v19, v20, v21, v22, v23
- b.eq 2f
- calc v16, v17, v18, v19, v20, v21, v22, v23, v16
- b.eq 2f
- calc v17, v18, v19, v20, v21, v22, v23, v16, v17
- b.eq 2f
- calc v18, v19, v20, v21, v22, v23, v16, v17, v18
- b.eq 2f
- calc v19, v20, v21, v22, v23, v16, v17, v18, v19
- b.eq 2f
- calc v20, v21, v22, v23, v16, v17, v18, v19, v20
- b.eq 2f
- calc v21, v22, v23, v16, v17, v18, v19, v20, v21
- b.eq 2f
- calc v22, v23, v16, v17, v18, v19, v20, v21, v22
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
load_qpel_filterb x6, x5
sub x2, x2, x3, lsl #1
@@ -1559,25 +1849,6 @@ endfunc
#if HAVE_I8MM
-.macro calc_all2
- calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
- b.eq 2f
- calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
- b.eq 2f
- calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
- b.eq 2f
- calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
- b.eq 2f
- calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
- b.eq 2f
- calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
- b.eq 2f
- calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
- b.eq 2f
- calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #7
lsl x10, x10, #7
--
2.38.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14 8:39 Logan.Lyu
0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14 8:39 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 10960 bytes --]
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 228 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr s16, [x1]
+ ldr s17, [x1 ,x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ str q4, [x0]
+ subs w3, w3, #1
+ str d5, [x0, #16]
+ add x0, x0, x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9,
src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9,
src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b},
[x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9,
src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
*c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] =
ff_hevc_put_hevc_qpel_bi_h16_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
--
2.38.0.windows.1
[-- Attachment #2: 0001-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_v.patch --]
[-- Type: text/plain, Size: 11109 bytes --]
From dfaaddf97b86817bc7adb50fdf0d29634b365bb1 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 16:50:29 +0800
Subject: [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 228 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr s16, [x1]
+ ldr s17, [x1 ,x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ str q4, [x0]
+ subs w3, w3, #1
+ str d5, [x0, #16]
+ add x0, x0, x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
--
2.38.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14 8:39 Logan.Lyu
0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14 8:39 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 13033 bytes --]
checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 265 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 270 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm,
export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [sp]
+ ldr d17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #120
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x5
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #112
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ st1 {v4.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v5.4h}, [x0], x5
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9,
src10, src11
+ ld1 {\src9\().8h-\src11\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv32_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #48
+ add x1, x1, #24
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #64
+ add x1, x1, #32
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #96
+ add x1, x1, #48
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #3
lsl x10, x10, #7
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 82e1623a67..e9a341ecb9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -191,6 +191,10 @@ NEON8_FNPROTO(epel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -322,6 +326,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
*c, const int bit_depth)
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv,
_i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h
,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
--
2.38.0.windows.1
[-- Attachment #2: 0002-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_hv.patch --]
[-- Type: text/plain, Size: 13185 bytes --]
From 83af7e79cf004c244ad3c771a0ca0e2357bbe944 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 21:29:51 +0800
Subject: [PATCH 2/4] lavc/aarch64: new optimization for 8-bit hevc_epel_hv
checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 265 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 270 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [sp]
+ ldr d17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #120
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x5
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #112
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ st1 {v4.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v5.4h}, [x0], x5
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8h-\src11\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv32_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #48
+ add x1, x1, #24
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #64
+ add x1, x1, #32
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #96
+ add x1, x1, #48
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #3
lsl x10, x10, #7
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 82e1623a67..e9a341ecb9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -191,6 +191,10 @@ NEON8_FNPROTO(epel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -322,6 +326,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
--
2.38.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-07-17 7:08 Водянников Александр
0 siblings, 0 replies; 15+ messages in thread
From: Водянников Александр @ 2023-07-17 7:08 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 1 bytes --]
[-- Attachment #2: 0001-Fixed-crash-when-using-hardware-acceleration-in-thir.txt --]
[-- Type: text/plain, Size: 1766 bytes --]
From 0fe666c4e3d10a689f4c6854a58eec3e7ff3c922 Mon Sep 17 00:00:00 2001
From: Aleksoid <Aleksoid1978@mail.ru>
Date: Mon, 17 Jul 2023 17:04:43 +1000
Subject: [PATCH] Fixed crash when using hardware acceleration in third party
projects without using hw_frames_ctx.
---
libavcodec/decode.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/libavcodec/decode.c b/libavcodec/decode.c
index a19cca1a7c..f34f169910 100644
--- a/libavcodec/decode.c
+++ b/libavcodec/decode.c
@@ -1802,18 +1802,21 @@ AVBufferRef *ff_hwaccel_frame_priv_alloc(AVCodecContext *avctx,
const AVHWAccel *hwaccel)
{
AVBufferRef *ref;
- AVHWFramesContext *frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
- uint8_t *data = av_mallocz(hwaccel->frame_priv_data_size);
- if (!data)
- return NULL;
-
- ref = av_buffer_create(data, hwaccel->frame_priv_data_size,
- hwaccel->free_frame_priv,
- frames_ctx->device_ctx, 0);
- if (!ref) {
- av_free(data);
- return NULL;
- }
+ if (avctx->hw_frames_ctx) {
+ AVHWFramesContext *frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+ uint8_t *data = av_mallocz(hwaccel->frame_priv_data_size);
+ if (!data)
+ return NULL;
+
+ ref = av_buffer_create(data, hwaccel->frame_priv_data_size,
+ hwaccel->free_frame_priv,
+ frames_ctx->device_ctx, 0);
+ if (!ref) {
+ av_free(data);
+ return NULL;
+ }
+ } else
+ ref = av_buffer_allocz(hwaccel->frame_priv_data_size);
return ref;
}
--
2.41.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2022-07-17 13:32 facefunk
0 siblings, 0 replies; 15+ messages in thread
From: facefunk @ 2022-07-17 13:32 UTC (permalink / raw)
To: ffmpeg-devel
Hi FFMDevs,
I've managed to get forced mov_text subtitles working in VLC Player. -disposition:s:0 +forced is honored but I'm not 100% sure about my approach.
The attached patch represents the best idea I came up with so far as the code is minimal and it doesn't require the user to set any extra parameters, however it does puncture an abstraction boundary ever so slightly by copying stream data to the codec, perhaps this isn't a problem.
If there's anybody who could look over my patch and let me know if there's a better way of going about this, that would be greatly appreciated.
Love your work!
Kind regards,
facefunk
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2024-08-07 15:59 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-09 14:25 [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing aline.gondimsantos
2023-02-09 14:29 ` Nicolas George
2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
2023-02-09 18:19 ` [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing Aline Gondim Santos
2023-02-09 18:29 ` Aline Gondim Santos Gondim Santos
2023-02-09 20:09 ` Marton Balint
-- strict thread matches above, loose matches on Subject: below --
2024-08-07 15:58 [FFmpeg-devel] (no subject) cyfdel-at-hotmail.com
2024-04-18 9:42 pengxu
2024-04-18 7:36 pengxu
2023-10-14 8:40 Logan.Lyu
2023-10-14 8:40 Logan.Lyu
2023-10-14 8:39 Logan.Lyu
2023-10-14 8:39 Logan.Lyu
2023-07-17 7:08 Водянников Александр
2022-07-17 13:32 facefunk
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git