Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing
@ 2023-02-09 14:25 aline.gondimsantos
  2023-02-09 14:29 ` Nicolas George
  2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
  0 siblings, 2 replies; 15+ messages in thread
From: aline.gondimsantos @ 2023-02-09 14:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Aline Gondim Santos

From: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>

Signed-off-by: Aline Gondim Santos <aline.gondimsantos@savoirfairelinux.com>
---
 libavdevice/xcbgrab.c | 180 +++++++++---------------------------------
 1 file changed, 39 insertions(+), 141 deletions(-)

diff --git a/libavdevice/xcbgrab.c b/libavdevice/xcbgrab.c
index 64a68ba497..05282911a9 100644
--- a/libavdevice/xcbgrab.c
+++ b/libavdevice/xcbgrab.c
@@ -29,11 +29,6 @@
 #include <xcb/xfixes.h>
 #endif
 
-#if CONFIG_LIBXCB_SHM
-#include <sys/shm.h>
-#include <xcb/shm.h>
-#endif
-
 #if CONFIG_LIBXCB_SHAPE
 #include <xcb/shape.h>
 #endif
@@ -53,9 +48,6 @@ typedef struct XCBGrabContext {
     xcb_connection_t *conn;
     xcb_screen_t *screen;
     xcb_window_t window;
-#if CONFIG_LIBXCB_SHM
-    AVBufferPool *shm_pool;
-#endif
     int64_t time_frame;
     AVRational time_base;
     int64_t frame_duration;
@@ -72,10 +64,9 @@ typedef struct XCBGrabContext {
     int region_border;
     int centered;
     int select_region;
+    int is_area;
 
     const char *framerate;
-
-    int has_shm;
 } XCBGrabContext;
 
 #define FOLLOW_CENTER -1
@@ -97,6 +88,7 @@ static const AVOption options[] = {
     { "show_region", "Show the grabbing region.", OFFSET(show_region), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, D },
     { "region_border", "Set the region border thickness.", OFFSET(region_border), AV_OPT_TYPE_INT, { .i64 = 3 }, 1, 128, D },
     { "select_region", "Select the grabbing region graphically using the pointer.", OFFSET(select_region), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, D },
+    { "is_area", "Define if we are grabing a region of the display/window.", OFFSET(is_area), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, D },
     { NULL },
 };
 
@@ -216,99 +208,6 @@ static int64_t wait_frame(AVFormatContext *s, AVPacket *pkt)
     return curtime;
 }
 
-#if CONFIG_LIBXCB_SHM
-static int check_shm(xcb_connection_t *conn)
-{
-    xcb_shm_query_version_cookie_t cookie = xcb_shm_query_version(conn);
-    xcb_shm_query_version_reply_t *reply;
-
-    reply = xcb_shm_query_version_reply(conn, cookie, NULL);
-    if (reply) {
-        free(reply);
-        return 1;
-    }
-
-    return 0;
-}
-
-static void free_shm_buffer(void *opaque, uint8_t *data)
-{
-    shmdt(data);
-}
-
-static AVBufferRef *allocate_shm_buffer(void *opaque, size_t size)
-{
-    xcb_connection_t *conn = opaque;
-    xcb_shm_seg_t segment;
-    AVBufferRef *ref;
-    uint8_t *data;
-    int id;
-
-    id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0777);
-    if (id == -1)
-        return NULL;
-
-    segment = xcb_generate_id(conn);
-    xcb_shm_attach(conn, segment, id, 0);
-    data = shmat(id, NULL, 0);
-    shmctl(id, IPC_RMID, 0);
-    if ((intptr_t)data == -1 || !data)
-        return NULL;
-
-    ref = av_buffer_create(data, size, free_shm_buffer, (void *)(ptrdiff_t)segment, 0);
-    if (!ref)
-        shmdt(data);
-
-    return ref;
-}
-
-static int xcbgrab_frame_shm(AVFormatContext *s, AVPacket *pkt)
-{
-    XCBGrabContext *c = s->priv_data;
-    xcb_shm_get_image_cookie_t iq;
-    xcb_shm_get_image_reply_t *img;
-    xcb_drawable_t drawable = c->window_id;
-    xcb_generic_error_t *e = NULL;
-    AVBufferRef *buf;
-    xcb_shm_seg_t segment;
-
-    buf = av_buffer_pool_get(c->shm_pool);
-    if (!buf) {
-        av_log(s, AV_LOG_ERROR, "Could not get shared memory buffer.\n");
-        return AVERROR(ENOMEM);
-    }
-    segment = (xcb_shm_seg_t)(uintptr_t)av_buffer_pool_buffer_get_opaque(buf);
-
-    iq = xcb_shm_get_image(c->conn, drawable,
-                           c->x, c->y, c->width, c->height, ~0,
-                           XCB_IMAGE_FORMAT_Z_PIXMAP, segment, 0);
-    img = xcb_shm_get_image_reply(c->conn, iq, &e);
-
-    xcb_flush(c->conn);
-
-    if (e) {
-        av_log(s, AV_LOG_ERROR,
-               "Cannot get the image data "
-               "event_error: response_type:%u error_code:%u "
-               "sequence:%u resource_id:%u minor_code:%u major_code:%u.\n",
-               e->response_type, e->error_code,
-               e->sequence, e->resource_id, e->minor_code, e->major_code);
-
-        free(e);
-        av_buffer_unref(&buf);
-        return AVERROR(EACCES);
-    }
-
-    free(img);
-
-    pkt->buf = buf;
-    pkt->data = buf->data;
-    pkt->size = c->frame_size;
-
-    return 0;
-}
-#endif /* CONFIG_LIBXCB_SHM */
-
 #if CONFIG_LIBXCB_XFIXES
 static int check_xfixes(xcb_connection_t *conn)
 {
@@ -462,14 +361,7 @@ static int xcbgrab_read_packet(AVFormatContext *s, AVPacket *pkt)
     if (c->show_region)
         xcbgrab_update_region(s, win_x, win_y);
 
-#if CONFIG_LIBXCB_SHM
-    if (c->has_shm && xcbgrab_frame_shm(s, pkt) < 0) {
-        av_log(s, AV_LOG_WARNING, "Continuing without shared memory.\n");
-        c->has_shm = 0;
-    }
-#endif
-    if (!c->has_shm)
-        ret = xcbgrab_frame(s, pkt);
+    ret = xcbgrab_frame(s, pkt);
     pkt->dts = pkt->pts = pts;
     pkt->duration = c->frame_duration;
 
@@ -488,11 +380,8 @@ static av_cold int xcbgrab_read_close(AVFormatContext *s)
 {
     XCBGrabContext *ctx = s->priv_data;
 
-#if CONFIG_LIBXCB_SHM
-    av_buffer_pool_uninit(&ctx->shm_pool);
-#endif
-
     xcb_disconnect(ctx->conn);
+    ctx->conn = NULL;
 
     return 0;
 }
@@ -572,7 +461,15 @@ static int pixfmt_from_pixmap_format(AVFormatContext *s, int depth,
 static int create_stream(AVFormatContext *s)
 {
     XCBGrabContext *c = s->priv_data;
-    AVStream *st      = avformat_new_stream(s, NULL);
+
+    // If we try to open another stream to x11grab, there is no reason
+    // to keep more than one stream in the context.
+    AVStream *st;
+    if (!s->nb_streams) {
+        st = avformat_new_stream(s, NULL);
+    } else {
+        st = s->streams[0];
+    }
     xcb_get_geometry_cookie_t gc;
     xcb_get_geometry_reply_t *geo;
     int64_t frame_size_bits;
@@ -594,7 +491,16 @@ static int create_stream(AVFormatContext *s)
         return AVERROR_EXTERNAL;
     }
 
+    // Width and Height are not 0 only when we set a window area to share
+    // This if may be valid only in  the first call to create_stream
     if (!c->width || !c->height) {
+        c->is_area = 0;
+        c->width = geo->width;
+        c->height = geo->height;
+    }
+    // If not a predefined area, then we should follow geometry changes
+    // This can be valid only on the second call onwards
+    if (!c->is_area && (c->width != geo->width || c->height != geo->height)) {
         c->width = geo->width;
         c->height = geo->height;
     }
@@ -628,13 +534,6 @@ static int create_stream(AVFormatContext *s)
     }
     c->frame_size = frame_size_bits / 8;
 
-#if CONFIG_LIBXCB_SHM
-    c->shm_pool = av_buffer_pool_init2(c->frame_size + AV_INPUT_BUFFER_PADDING_SIZE,
-                                           c->conn, allocate_shm_buffer, NULL);
-    if (!c->shm_pool)
-        return AVERROR(ENOMEM);
-#endif
-
     st->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
     st->codecpar->codec_id   = AV_CODEC_ID_RAWVIDEO;
     st->codecpar->width      = c->width;
@@ -829,23 +728,26 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
         sscanf(s->url, "+%d,%d", &c->x, &c->y);
     }
 
-    c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
-    av_freep(&display_name);
+    if (!c->conn || !c->screen) {
+        xcbgrab_read_close(s);
+        c->conn = xcb_connect(display_name[0] ? display_name : NULL, &screen_num);
+        av_freep(&display_name);
 
-    if ((ret = xcb_connection_has_error(c->conn))) {
-        av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
-               s->url[0] ? s->url : "default", ret);
-        return AVERROR(EIO);
-    }
+        if ((ret = xcb_connection_has_error(c->conn))) {
+            av_log(s, AV_LOG_ERROR, "Cannot open display %s, error %d.\n",
+                s->url[0] ? s->url : "default", ret);
+            return AVERROR(EIO);
+        }
 
-    setup = xcb_get_setup(c->conn);
+       setup = xcb_get_setup(c->conn);
 
-    c->screen = get_screen(setup, screen_num);
-    if (!c->screen) {
-        av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
-               screen_num);
-        xcbgrab_read_close(s);
-        return AVERROR(EIO);
+        c->screen = get_screen(setup, screen_num);
+        if (!c->screen) {
+            av_log(s, AV_LOG_ERROR, "The screen %d does not exist.\n",
+                screen_num);
+            xcbgrab_read_close(s);
+            return AVERROR(EIO);
+        }
     }
 
     if (c->window_id == XCB_NONE)
@@ -876,10 +778,6 @@ static av_cold int xcbgrab_read_header(AVFormatContext *s)
         return ret;
     }
 
-#if CONFIG_LIBXCB_SHM
-    c->has_shm = check_shm(c->conn);
-#endif
-
 #if CONFIG_LIBXCB_XFIXES
     if (c->draw_mouse) {
         if (!(c->draw_mouse = check_xfixes(c->conn))) {
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-08-07 15:58 cyfdel-at-hotmail.com
  0 siblings, 0 replies; 15+ messages in thread
From: cyfdel-at-hotmail.com @ 2024-08-07 15:58 UTC (permalink / raw)
  To: ffmpeg-devel


what the patch does:
        fix gdigrab failing to capture a window by hwnd with the error
        "Invalid window handle x, must be a valid integer", although a
        valid integer is given as input

why:
        at line 284 of libavdevice/gdigrab.c, one of the conditions that
        makes the check fail is p[0]=='\0'. if an integer-only string is
        processed, p[0] after the strtoull call is the terminating null,
        which equals '\0', so the check fails; a non-integer string, on
        the other hand, leaves p[0] non-null and passes the check

how:
        changing p[0]=='\0' to p[0]!='\0' fixes this, with no side effects

reproduce and verify:
        a simple command: ffmpeg -f gdigrab -i hwnd=12345
        * although a workaround command will work currently:
        *       ffmpeg -f gdigrab -i hwnd=12345x. (x could be any char)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-04-18  9:42 pengxu
  0 siblings, 0 replies; 15+ messages in thread
From: pengxu @ 2024-04-18  9:42 UTC (permalink / raw)
  To: ffmpeg-devel

v2: Fixed fate errors in [Patch 2/2]
v3: Fixed fate errors in [Patch 2/2] 
Subject:[PATCH V3][Loongarch]Optimize aac decode/encode for Loongarch by LSX
In-Reply-To: 


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2024-04-18  7:36 pengxu
  0 siblings, 0 replies; 15+ messages in thread
From: pengxu @ 2024-04-18  7:36 UTC (permalink / raw)
  To: ffmpeg-devel

v2: Fixed build errors in [PATCH 2/2]
 
Subject: [PATCH V2][Loongarch]Optimize aac decode/encode for Loongarch by LSX
In-Reply-To: 


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14  8:40 Logan.Lyu
  0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14  8:40 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 21092 bytes --]

checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  libavcodec/aarch64/hevcdsp_qpel_neon.S    | 397 ++++++++++++++++++++++
  2 files changed, 402 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
          const uint8_t *src, ptrdiff_t srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
  +NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
  NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
          const uint8_t *src, ptrdiff_t srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)
              NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, 
_i8mm);
              NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h 
,_i8mm);
              NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
              NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, 
_i8mm);
              NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, 
_i8mm);
              NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, 
epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, 
export=1
          ret
  endfunc
  +
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        mov             x7, #128
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        ldr             d16, [sp]
+        ldr             d17, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().4h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, 
\src6, \src7, sqshrn
+        subs            w3, w3, #1
+        st1             {v1.4h}, [x0], x7
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        mov             x7, #128
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x8, #120
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        ldr             q16, [sp]
+        ldr             q17, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, 
\src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, 
\src5, \src6, \src7, sqshrn2
+        st1             {v1.4h}, [x0], #8
+        subs            w3, w3, #1
+        st1             {v1.s}[2], [x0], x8
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        ldr             q16, [sp]
+        ldr             q17, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, 
\src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, 
\src5, \src6, \src7, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h}, [x0], x7
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        mov             x8, #112
+        ld1             {v16.8h, v17.8h}, [sp], x7
+        ld1             {v18.8h, v19.8h}, [sp], x7
+        ld1             {v20.8h, v21.8h}, [sp], x7
+        ld1             {v22.8h, v23.8h}, [sp], x7
+        ld1             {v24.8h, v25.8h}, [sp], x7
+        ld1             {v26.8h, v27.8h}, [sp], x7
+        ld1             {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, 
src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, 
\src13, \src14, \src15, sqshrn
+        st1             {v1.8h}, [x0], #16
+        subs            w3, w3, #1
+        st1             {v2.4h}, [x0], x8
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x3, x3, #7
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        ld1             {v16.8h, v17.8h}, [sp], x7
+        ld1             {v18.8h, v19.8h}, [sp], x7
+        ld1             {v20.8h, v21.8h}, [sp], x7
+        ld1             {v22.8h, v23.8h}, [sp], x7
+        ld1             {v24.8h, v25.8h}, [sp], x7
+        ld1             {v26.8h, v27.8h}, [sp], x7
+        ld1             {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, 
src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, 
\src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, 
\src13, \src14, \src15, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h, v2.8h}, [x0], x7
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, #32
+        add             w10, w3, #7
+        st1             {v12.8b-v15.8b}, [sp]
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+        ld1             {v8.8h-v10.8h}, [sp], x7
+        ld1             {v11.8h-v13.8h}, [sp], x7
+        ld1             {v14.8h-v16.8h}, [sp], x7
+        ld1             {v17.8h-v19.8h}, [sp], x7
+        ld1             {v20.8h-v22.8h}, [sp], x7
+        ld1             {v23.8h-v25.8h}, [sp], x7
+        ld1             {v26.8h-v28.8h}, [sp], x7
+1:      ld1             {v29.8h-v31.8h}, [sp], x7
+        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, 
sqshrn2
+        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, 
sqshrn2
+        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v8.8h-v10.8h}, [sp], x7
+        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, 
sqshrn2
+        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, 
sqshrn2
+        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v11.8h-v13.8h}, [sp], x7
+        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, 
sqshrn2
+        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, 
sqshrn2
+        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v14.8h-v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, 
sqshrn2
+        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, 
sqshrn2
+        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v17.8h-v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, 
sqshrn2
+        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, 
sqshrn2
+        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v20.8h-v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, 
sqshrn2
+        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, 
sqshrn2
+        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v23.8h-v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, 
sqshrn2
+        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, 
sqshrn2
+        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v26.8h-v28.8h}, [sp], x7
+        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, 
sqshrn2
+        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, 
sqshrn2
+        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, 
sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.hi            1b
+2:      ld1             {v12.8b-v15.8b}, [sp], #32
+        ld1             {v8.8b-v11.8b}, [sp], #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        sub             x1, x1, x2, lsl #1
+        lsl             x10, x10, #7
+        sub             x1, x1, x2
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x3, x3, #7
+        add             x0, sp, #32
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_qpel_filterh x5, x4
+0:      mov             x8, sp          // src
+        ld1             {v16.8h, v17.8h}, [x8], x7
+        mov             w9, w3          // height
+        ld1             {v18.8h, v19.8h}, [x8], x7
+        mov             x5, x0          // dst
+        ld1             {v20.8h, v21.8h}, [x8], x7
+        ld1             {v22.8h, v23.8h}, [x8], x7
+        ld1             {v24.8h, v25.8h}, [x8], x7
+        ld1             {v26.8h, v27.8h}, [x8], x7
+        ld1             {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, 
src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4, 
\src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, 
\src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, 
\src13, \src14, \src15, sqshrn2
+        subs            x9, x9, #1
+        st1             {v1.8h, v2.8h}, [x5], x7
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #32
+        add             sp, sp, #32
+        subs            w6, w6, #16
+        b.hi            0b
+        add             w10, w3, #6
+        add             sp, sp, #64          // discard rest of first line
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        add             x1, x1, #24
+        add             x0, x0, #48
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        add             x1, x1, #32
+        add             x0, x0, #64
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
  .macro QPEL_UNI_W_HV_HEADER width
          ldp             x14, x15, [sp]          // mx, my
          ldr             w13, [sp, #16]          // width
-- 
2.38.0.windows.1

[-- Attachment #2: 0004-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_hv.patch --]
[-- Type: text/plain, Size: 21203 bytes --]

From 6a7f049fd0382c04297fb9cefd9f5ce022abbe5f Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 22:40:51 +0800
Subject: [PATCH 4/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_hv

checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 397 ++++++++++++++++++++++
 2 files changed, 402 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        mov             x7, #128            // row stride in bytes, used for both tmp_array and dst
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        ldr             d16, [sp]               // preload rows 0..6 (4 x int16 each) into v16-v22;
+        ldr             d17, [sp, x7]           // sp itself is stepped through tmp_array as the read pointer
+        add             sp, sp, x7, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().4h}, [sp], x7      // fetch the next intermediate row
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+        st1             {v1.4h}, [x0], x7           // one output row of 4 x int16
+.endm
+1:      calc_all
+.purgem calc
+2:      ret                                 // 7 preloaded + height loop rows consumed: sp is back at its entry value
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        mov             x7, #128            // row stride in bytes of tmp_array
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x8, #120                // dst step after the first 8-byte store: 8 + 120 = 128
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        ldr             q16, [sp]               // preload rows 0..6 (8 x int16 each) into v16-v22;
+        ldr             q17, [sp, x7]           // sp itself is stepped through tmp_array as the read pointer
+        add             sp, sp, x7, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8h}, [sp], x7      // fetch the next intermediate row
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+        st1             {v1.4h}, [x0], #8           // output columns 0..3
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+        st1             {v1.s}[2], [x0], x8         // output columns 4..5, then advance to the next row
+.endm
+1:      calc_all
+.purgem calc
+2:      ret                                 // 7 preloaded + height loop rows consumed: sp is back at its entry value
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128                // row stride in bytes, used for both tmp_array and dst
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        ldr             q16, [sp]               // preload rows 0..6 (8 x int16 each) into v16-v22;
+        ldr             q17, [sp, x7]           // sp itself is stepped through tmp_array as the read pointer
+        add             sp, sp, x7, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x7       // fetch the next intermediate row
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+        st1            {v1.8h}, [x0], x7            // one output row of 8 x int16
+.endm
+1:      calc_all
+.purgem calc
+2:      ret                                 // 7 preloaded + height loop rows consumed: sp is back at its entry value
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128                // row stride in bytes of tmp_array
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        mov             x8, #112                // dst step after the first 16-byte store: 16 + 112 = 128
+        ld1             {v16.8h, v17.8h}, [sp], x7  // preload rows 0..6, two registers (16 x int16) per row;
+        ld1             {v18.8h, v19.8h}, [sp], x7  // sp itself is stepped through tmp_array as the read pointer
+        ld1             {v20.8h, v21.8h}, [sp], x7
+        ld1             {v22.8h, v23.8h}, [sp], x7
+        ld1             {v24.8h, v25.8h}, [sp], x7
+        ld1             {v26.8h, v27.8h}, [sp], x7
+        ld1             {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        st1             {v1.8h}, [x0], #16          // output columns 0..7
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all2
+        st1             {v2.4h}, [x0], x8           // output columns 8..11, then advance to the next row
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret                                 // 7 preloaded + height loop rows consumed: sp is back at its entry value
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128                // row stride in bytes, used for both tmp_array and dst
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        ld1             {v16.8h, v17.8h}, [sp], x7  // preload rows 0..6, two registers (16 x int16) per row;
+        ld1             {v18.8h, v19.8h}, [sp], x7  // sp itself is stepped through tmp_array as the read pointer
+        ld1             {v20.8h, v21.8h}, [sp], x7
+        ld1             {v22.8h, v23.8h}, [sp], x7
+        ld1             {v24.8h, v25.8h}, [sp], x7
+        ld1             {v26.8h, v27.8h}, [sp], x7
+        ld1             {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all2
+        st1             {v1.8h, v2.8h}, [x0], x7    // one output row of 16 x int16
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret                                 // 7 preloaded + height loop rows consumed: sp is back at its entry value
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]    // spill the low halves of v8-v11; they are reused as row registers below
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, #32
+        add             w10, w3, #7             // w3 = height; 7 extra rows feed the 8 vertical taps
+        st1             {v12.8b-v15.8b}, [sp]   // spill the low halves of v12-v15 as well
+        lsl             x10, x10, #7            // (height + 7) * 128 bytes
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128                // row stride in bytes, used for both tmp_array and dst
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+        ld1             {v8.8h-v10.8h}, [sp], x7    // preload rows 0..6, three registers (24 x int16) per row;
+        ld1             {v11.8h-v13.8h}, [sp], x7   // sp itself is stepped through tmp_array as the read pointer
+        ld1             {v14.8h-v16.8h}, [sp], x7
+        ld1             {v17.8h-v19.8h}, [sp], x7
+        ld1             {v20.8h-v22.8h}, [sp], x7
+        ld1             {v23.8h-v25.8h}, [sp], x7
+        ld1             {v26.8h-v28.8h}, [sp], x7
+1:      ld1             {v29.8h-v31.8h}, [sp], x7   // loop body is unrolled 8x, rotating which register triple holds the newest row
+        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7     // one output row of 24 x int16
+        b.eq            2f                          // all rows done
+
+        ld1             {v8.8h-v10.8h}, [sp], x7
+        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v11.8h-v13.8h}, [sp], x7
+        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v14.8h-v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v17.8h-v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v20.8h-v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v23.8h-v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1             {v26.8h-v28.8h}, [sp], x7
+        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+        subs            w3, w3, #1
+        st1             {v1.8h-v3.8h}, [x0], x7
+        b.hi            1b                          // rows remain: start the next 8-row rotation
+2:      ld1             {v12.8b-v15.8b}, [sp], #32  // tmp_array fully consumed: sp now points at the spilled v12-v15
+        ld1             {v8.8b-v11.8b}, [sp], #32   // restore v8-v11 and release the last of the frame
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
+        add             w10, w3, #7         // w3 = height; 7 extra rows feed the 8 vertical taps
+        sub             x1, x1, x2, lsl #1
+        lsl             x10, x10, #7        // (height + 7) * 128 bytes
+        sub             x1, x1, x2          // src -= 3 * srcstride in total
+        sub             sp, sp, x10         // tmp_array
+        stp             x5, x30, [sp, #-32]!    // keep x5 (my) and the return address across the call
+        stp             x0, x3, [sp, #16]       // keep dst and height
+        add             x3, x3, #7              // the horizontal pass covers height + 7 rows
+        add             x0, sp, #32             // the horizontal pass writes into tmp_array
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        mov             x7, #128                // row stride in bytes, used for both tmp_array and dst
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32             // sp -> first row of tmp_array
+        load_qpel_filterh x5, x4
+0:      mov             x8, sp          // src
+        ld1             {v16.8h, v17.8h}, [x8], x7  // preload rows 0..6 of the current 16-column strip
+        mov             w9, w3          // height
+        ld1             {v18.8h, v19.8h}, [x8], x7
+        mov             x5, x0          // dst
+        ld1             {v20.8h, v21.8h}, [x8], x7
+        ld1             {v22.8h, v23.8h}, [x8], x7
+        ld1             {v24.8h, v25.8h}, [x8], x7
+        ld1             {v26.8h, v27.8h}, [x8], x7
+        ld1             {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+        subs            x9, x9, #1                  // per-strip row counter; flags are consumed by calc_all2
+        st1             {v1.8h, v2.8h}, [x5], x7
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #32             // next strip: 16 int16 columns further into dst
+        add             sp, sp, #32             // and 16 int16 columns further into tmp_array
+        subs            w6, w6, #16             // w6 = remaining width
+        b.hi            0b
+        add             w10, w3, #6
+        add             sp, sp, #64          // discard rest of first line
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret                                     // NOTE(review): sp only balances if the strip loop ran twice (width 32): 2 * 32 + 64 = 128
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!     // 64-byte frame: save the arguments the first call may clobber
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]          // return address lives in the top slot of the frame
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48             // drop the argument slots; the saved x30 is now at [sp]
+        add             x1, x1, #24             // src: right 24-pixel half
+        add             x0, x0, #48             // dst: 24 int16 further
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+        ldr             x30, [sp], #16          // pop the return address and the rest of the frame
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!     // 64-byte frame: save the arguments the first call may clobber
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]          // return address lives in the top slot of the frame
+        mov             x6, #32                 // the hv32 routine uses w6 as its remaining-width counter
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48             // drop the argument slots; the saved x30 is now at [sp]
+        add             x1, x1, #32             // src: right 32-pixel half
+        add             x0, x0, #64             // dst: 32 int16 further
+        mov             x6, #32                 // w6 was counted down by the first call, so set it again
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+        ldr             x30, [sp], #16          // pop the return address and the rest of the frame
+        ret
+endfunc
+
 .macro QPEL_UNI_W_HV_HEADER width
         ldp             x14, x15, [sp]          // mx, my
         ldr             w13, [sp, #16]          // width
-- 
2.38.0.windows.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14  8:40 Logan.Lyu
  0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14  8:40 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 17977 bytes --]

checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  libavcodec/aarch64/hevcdsp_qpel_neon.S    | 347 +++++++++++++++++++---
  2 files changed, 314 insertions(+), 38 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
          const uint8_t *_src, ptrdiff_t _srcstride,
          int height, intptr_t mx, intptr_t my, int width), _i8mm);
  +NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
  NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
          const uint8_t *src, ptrdiff_t srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
          NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
          NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
          NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
  .endif
  .endm
  +.macro calc_all
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f                  // the user's calc macro sets flags via subs; 2: is the user's exit label
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f                  // each step loads into the oldest register and shifts the 8-row window by one
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b                  // rotation is back at its start: loop to the user's 1: label while rows remain
+.endm
+
+.macro calc_all2
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        b.eq            2f                  // the user's calc macro sets flags via subs; 2: is the user's exit label
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        b.eq            2f                  // each step loads into the oldest register pair and shifts the 8-row window by one
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b                  // rotation is back at its start: loop to the user's 1: label while rows remain
+.endm
+
  .macro put_hevc type
  .ifc \type, qpel
          // void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
  put_hevc qpel_uni
  put_hevc qpel_bi
  +function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)  // dst row stride in bytes
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        ldr             s16, [x1]               // preload source rows 0..6 (4 bytes each) into v16-v22
+        ldr             s17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s18, [x1]
+        ldr             s19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s20, [x1]
+        ldr             s21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s22, [x1]
+        add             x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().s}[0], [x1], x2    // fetch the next source row
+        movi            v24.8h, #0                  // presumably calc_qpelb accumulates into v24 (macro defined outside this hunk)
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1             {v24.4h}, [x0], x9          // one output row of 4 x int16
+        subs            w3, w3, #1                  // w3 = remaining height
+        b.eq            2f                          // NOTE(review): calc_all already branches after every calc, so this looks redundant
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 8)  // dst row stride minus the 8 bytes the first store advances
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        ldr             d16, [x1]               // preload source rows 0..6 (8 bytes each) into v16-v22
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d18, [x1]
+        ldr             d19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d20, [x1]
+        ldr             d21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d22, [x1]
+        add             x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x1], x2      // fetch the next source row
+        movi            v24.8h, #0                  // presumably calc_qpelb accumulates into v24 (macro defined outside this hunk)
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1             {v24.4h}, [x0], #8          // output columns 0..3
+        st1             {v24.s}[2], [x0], x9        // output columns 4..5, then advance to the next row
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)  // dst row stride in bytes
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        ldr             d16, [x1]               // preload source rows 0..6 (8 bytes each) into v16-v22
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d18, [x1]
+        ldr             d19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d20, [x1]
+        ldr             d21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d22, [x1]
+        add             x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x1], x2       // fetch the next source row
+        movi            v24.8h, #0                  // presumably calc_qpelb accumulates into v24 (macro defined outside this hunk)
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.8h}, [x0], x9           // one output row of 8 x int16
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 16) // dst row stride minus the 16 bytes the first store advances
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        ldr             q16, [x1]               // preload source rows 0..6 (16 bytes each) into v16-v22
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q18, [x1]
+        ldr             q19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q20, [x1]
+        ldr             q21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q22, [x1]
+        add             x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x1], x2     // fetch the next source row
+        movi            v24.8h, #0                  // presumably calc_qpelb/calc_qpelb2 accumulate into their first operand
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1             {v24.8h}, [x0], #16         // output columns 0..7
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+        st1             {v25.4h}, [x0], x9          // output columns 8..11, then advance to the next row
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)  // dst row stride in bytes
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        ldr             q16, [x1]               // preload source rows 0..6 (16 bytes each) into v16-v22
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q18, [x1]
+        ldr             q19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q20, [x1]
+        ldr             q21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q22, [x1]
+        add             x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x1], x2     // fetch the next source row
+        movi            v24.8h, #0                  // presumably calc_qpelb/calc_qpelb2 accumulate into their first operand
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all
+        st1             {v24.8h, v25.8h}, [x0], x9  // one output row of 16 x int16
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+// todo: reads #32 bytes
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+        sub             sp, sp, #32
+        st1             {v8.8b, v9.8b, v10.8b}, [sp]    // spill the low halves of v8-v10; they hold the results below
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2              // src -= 3 * srcstride in total
+        mov             x9, #(MAX_PB_SIZE * 2)  // dst row stride in bytes
+        ld1             {v16.16b, v17.16b}, [x1], x2    // preload source rows 0..6; each load fetches 32 bytes per row
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b}, [x1], x2
+        ld1             {v26.16b, v27.16b}, [x1], x2
+        ld1             {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+        movi            v8.8h, #0                   // presumably calc_qpelb/calc_qpelb2 accumulate into their first operand
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1                  // flags are consumed by the branches in calc_all2
+        st1             {v8.8h, v9.8h, v10.8h}, [x0], x9    // one output row of 24 x int16
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1             {v8.8b, v9.8b, v10.8b}, [sp]    // restore the spilled low halves
+        add             sp, sp, #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1      // vertical qpel, 32 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]             // spill the low 64 bits of v8-v11 (callee-saved under AAPCS64)
+        load_qpel_filterb x5, x4                         // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)           // dst post-increment per row, in bytes
+        sub             x1, x1, x2                       // src -= 3 * srcstride in total
+        ld1             {v16.16b, v17.16b}, [x1], x2     // preload 7 rows: even regs = bytes 0-15, odd regs = bytes 16-31
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b}, [x1], x2
+        ld1             {v26.16b, v27.16b}, [x1], x2
+        ld1             {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2    // next row into tmp0/tmp1, src += srcstride
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1                       // --height; calc_all2 branches on these flags
+        st1             {v8.8h-v11.8h}, [x0], x9         // store 32 int16 results, dst += x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp], #32        // restore v8-v11 and release the stack
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1     // 48 px wide: runs the 24 px routine on the left half, then the right half
+        stp             x2, x3, [sp, #-48]!             // reserve 48 bytes; keep srcstride/height
+        stp             x0, x1, [sp, #16]               // keep dst/src
+        stp             x5, x30, [sp, #32]              // keep my and the return address
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             x2, x3, [sp]                    // reload the arguments for the second half
+        ldp             x0, x1, [sp, #16]
+        ldr             x5, [sp, #32]
+        add             sp, sp, #32                     // drop the first two pairs; x5/x30 now sit at [sp]
+        add             x0, x0, #48                     // dst += 48 bytes (24 int16 columns)
+        add             x1, x1, #24                     // src += 24 bytes
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldr             x30, [sp, #8]                   // recover the return address
+        add             sp, sp, #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1     // vertical qpel in 32-px-wide strips, repeated while width (w6) remains
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]            // spill the low 64 bits of v8-v11 (callee-saved under AAPCS64)
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        mov             x9, #(MAX_PB_SIZE * 2)          // dst post-increment per row, in bytes
+0:      mov             x8, x1          // src
+        ld1             {v16.16b, v17.16b}, [x8], x2
+        mov             w11, w3         // height
+        ld1             {v18.16b, v19.16b}, [x8], x2
+        mov             x10, x0         // dst
+        ld1             {v20.16b, v21.16b}, [x8], x2
+        ld1             {v22.16b, v23.16b}, [x8], x2
+        ld1             {v24.16b, v25.16b}, [x8], x2
+        ld1             {v26.16b, v27.16b}, [x8], x2
+        ld1             {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x8], x2    // next row of this strip
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w11, w11, #1                    // per-strip row counter, same 32-bit width as the mov that set it
+        st1             {v8.8h-v11.8h}, [x10], x9       // store 32 int16 results, strip dst += x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #64                     // next strip: dst += 64 bytes (32 int16 columns)
+        add             x1, x1, #32                     // src += 32 bytes
+        subs            w6, w6, #32                     // width -= 32
+        b.hi            0b
+        ld1             {v8.8b-v11.8b}, [sp], #32       // restore v8-v11 and release the stack
+        ret
+endfunc
+
+
 function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
 1:
         ldr             s0, [x2]
@@ -663,25 +972,6 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
         ret
 endfunc
 
-.macro calc_all
-        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
-        b.eq            2f
-        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
-        b.eq            2f
-        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
-        b.eq            2f
-        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
-        b.eq            2f
-        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
-        b.eq            2f
-        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
-        b.eq            2f
-        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
-        b.eq            2f
-        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
-        b.hi            1b
-.endm
-
 function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
         load_qpel_filterb x6, x5
         sub             x2, x2, x3, lsl #1
@@ -1559,25 +1849,6 @@ endfunc
 
 #if HAVE_I8MM
 
-.macro calc_all2
-        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
-        b.eq            2f
-        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
-        b.eq            2f
-        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
-        b.eq            2f
-        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
-        b.eq            2f
-        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
-        b.eq            2f
-        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
-        b.eq            2f
-        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
-        b.eq            2f
-        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
-        b.hi            1b
-.endm
-
 function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
         add             w10, w4, #7
         lsl             x10, x10, #7
-- 
2.38.0.windows.1

[-- Attachment #2: 0003-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_v.patch --]
[-- Type: text/plain, Size: 18085 bytes --]

From 3cb075a5fcf0e696a55bcce8fa6415c1d2830fad Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 21:54:48 +0800
Subject: [PATCH 3/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_v

checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 347 +++++++++++++++++++---
 2 files changed, 314 insertions(+), 38 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
 .endif
 .endm
 
+.macro calc_all
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23    // operand 1 receives the new row; operands 2-9 are the 8-row filter window
+        b.eq            2f                                              // flags are left by calc (its subs on the row counter); zero -> exit at 2
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16    // window slides by one register per step, v16..v23 used as a ring
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b                                              // rows remain after a full 8-step rotation: loop back to 1
+.endm
+
+.macro calc_all2
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31    // operands 1-2 receive the new row; then 8 even and 8 odd window registers
+        b.eq            2f                                      // flags are left by calc (its subs on the row counter); zero -> exit at 2
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17    // window slides by one register pair per step, v16..v31 used as a ring
+        b.eq            2f
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b                                      // rows remain after a full 8-step rotation: loop back to 1
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
         // void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1      // vertical qpel, 4 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)          // dst post-increment per row, in bytes
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        ldr             s16, [x1]                       // preload 7 rows of 4 bytes into v16..v22
+        ldr             s17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s18, [x1]
+        ldr             s19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s20, [x1]
+        ldr             s21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s22, [x1]
+        add             x1, x1, x2                      // x1 -> 8th row, fetched by the first calc
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().s}[0], [x1], x2        // next row into tmp, src += srcstride
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        subs            w3, w3, #1                      // --height; calc_all branches on these flags
+        st1             {v24.4h}, [x0], x9              // store 4 int16 results, dst += x9 (st1 leaves the flags intact)
+        // no exit branch here: calc_all already emits b.eq 2f / b.hi 1b after every calc
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1      // vertical qpel, 6 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 8)      // row advance minus the 8 bytes already added by the first store
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        ldr             d16, [x1]                       // preload 7 rows of 8 bytes into v16..v22
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d18, [x1]
+        ldr             d19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d20, [x1]
+        ldr             d21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d22, [x1]
+        add             x1, x1, x2                      // x1 -> 8th row, fetched by the first calc
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x1], x2          // next row into tmp, src += srcstride
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1             {v24.4h}, [x0], #8              // outputs 0-3
+        st1             {v24.s}[2], [x0], x9            // outputs 4-5, then step dst to the next row
+        subs            w3, w3, #1                      // --height; calc_all branches on these flags
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1      // vertical qpel, 8 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)          // dst post-increment per row, in bytes
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        ldr             d16, [x1]                       // preload 7 rows of 8 bytes into v16..v22
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d18, [x1]
+        ldr             d19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d20, [x1]
+        ldr             d21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             d22, [x1]
+        add             x1, x1, x2                      // x1 -> 8th row, fetched by the first calc
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x1], x2           // next row into tmp, src += srcstride
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.8h}, [x0], x9               // store 8 int16 results, dst += x9
+        subs            w3, w3, #1                      // --height; calc_all branches on these flags
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1     // vertical qpel, 12 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 16)     // row advance minus the 16 bytes already added by the first store
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        ldr             q16, [x1]                       // preload 7 rows of 16 bytes into v16..v22
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q18, [x1]
+        ldr             q19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q20, [x1]
+        ldr             q21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q22, [x1]
+        add             x1, x1, x2                      // x1 -> 8th row, fetched by the first calc
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x1], x2         // next row into tmp, src += srcstride
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1             {v24.8h}, [x0], #16             // outputs 0-7
+        subs            w3, w3, #1                      // --height; calc_all branches on these flags
+        st1             {v25.4h}, [x0], x9              // outputs 8-11, then step dst to the next row
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1     // vertical qpel, 16 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)          // dst post-increment per row, in bytes
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        ldr             q16, [x1]                       // preload 7 rows of 16 bytes into v16..v22
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q18, [x1]
+        ldr             q19, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q20, [x1]
+        ldr             q21, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             q22, [x1]
+        add             x1, x1, x2                      // x1 -> 8th row, fetched by the first calc
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x1], x2         // next row into tmp, src += srcstride
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        subs            w3, w3, #1                      // --height; calc_all branches on these flags
+        st1             {v24.8h, v25.8h}, [x0], x9      // store 16 int16 results, dst += x9
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+// TODO: every row load below fetches 32 bytes although only 24 output columns are stored
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1      // vertical qpel, 24 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        sub             sp, sp, #32
+        st1             {v8.8b, v9.8b, v10.8b}, [sp]     // spill the low 64 bits of v8-v10 (callee-saved under AAPCS64)
+        load_qpel_filterb x5, x4                         // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2                       // src -= 3 * srcstride in total
+        mov             x9, #(MAX_PB_SIZE * 2)           // dst post-increment per row, in bytes
+        ld1             {v16.16b, v17.16b}, [x1], x2     // preload 7 rows: even regs = bytes 0-15, odd regs = bytes 16-31
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b}, [x1], x2
+        ld1             {v26.16b, v27.16b}, [x1], x2
+        ld1             {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2    // next row into tmp0/tmp1, src += srcstride
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1                       // --height; calc_all2 branches on these flags
+        st1             {v8.8h, v9.8h, v10.8h}, [x0], x9 // store 24 int16 results, dst += x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1             {v8.8b, v9.8b, v10.8b}, [sp]     // restore v8-v10
+        add             sp, sp, #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1      // vertical qpel, 32 px wide; qpel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]             // spill the low 64 bits of v8-v11 (callee-saved under AAPCS64)
+        load_qpel_filterb x5, x4                         // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)           // dst post-increment per row, in bytes
+        sub             x1, x1, x2                       // src -= 3 * srcstride in total
+        ld1             {v16.16b, v17.16b}, [x1], x2     // preload 7 rows: even regs = bytes 0-15, odd regs = bytes 16-31
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b}, [x1], x2
+        ld1             {v26.16b, v27.16b}, [x1], x2
+        ld1             {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2    // next row into tmp0/tmp1, src += srcstride
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1                       // --height; calc_all2 branches on these flags
+        st1             {v8.8h-v11.8h}, [x0], x9         // store 32 int16 results, dst += x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp], #32        // restore v8-v11 and release the stack
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1     // 48 px wide: runs the 24 px routine on the left half, then the right half
+        stp             x2, x3, [sp, #-48]!             // reserve 48 bytes; keep srcstride/height
+        stp             x0, x1, [sp, #16]               // keep dst/src
+        stp             x5, x30, [sp, #32]              // keep my and the return address
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             x2, x3, [sp]                    // reload the arguments for the second half
+        ldp             x0, x1, [sp, #16]
+        ldr             x5, [sp, #32]
+        add             sp, sp, #32                     // drop the first two pairs; x5/x30 now sit at [sp]
+        add             x0, x0, #48                     // dst += 48 bytes (24 int16 columns)
+        add             x1, x1, #24                     // src += 24 bytes
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldr             x30, [sp, #8]                   // recover the return address
+        add             sp, sp, #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1     // vertical qpel in 32-px-wide strips, repeated while width (w6) remains
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]            // spill the low 64 bits of v8-v11 (callee-saved under AAPCS64)
+        load_qpel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2                      // src -= 3 * srcstride in total
+        mov             x9, #(MAX_PB_SIZE * 2)          // dst post-increment per row, in bytes
+0:      mov             x8, x1          // src
+        ld1             {v16.16b, v17.16b}, [x8], x2
+        mov             w11, w3         // height
+        ld1             {v18.16b, v19.16b}, [x8], x2
+        mov             x10, x0         // dst
+        ld1             {v20.16b, v21.16b}, [x8], x2
+        ld1             {v22.16b, v23.16b}, [x8], x2
+        ld1             {v24.16b, v25.16b}, [x8], x2
+        ld1             {v26.16b, v27.16b}, [x8], x2
+        ld1             {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().16b, \tmp1\().16b}, [x8], x2    // next row of this strip
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w11, w11, #1                    // per-strip row counter, same 32-bit width as the mov that set it
+        st1             {v8.8h-v11.8h}, [x10], x9       // store 32 int16 results, strip dst += x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #64                     // next strip: dst += 64 bytes (32 int16 columns)
+        add             x1, x1, #32                     // src += 32 bytes
+        subs            w6, w6, #32                     // width -= 32
+        b.hi            0b
+        ld1             {v8.8b-v11.8b}, [sp], #32       // restore v8-v11 and release the stack
+        ret
+endfunc
+
+
 function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
 1:
         ldr             s0, [x2]
@@ -663,25 +972,6 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
         ret
 endfunc
 
-.macro calc_all
-        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
-        b.eq            2f
-        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
-        b.eq            2f
-        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
-        b.eq            2f
-        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
-        b.eq            2f
-        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
-        b.eq            2f
-        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
-        b.eq            2f
-        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
-        b.eq            2f
-        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
-        b.hi            1b
-.endm
-
 function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
         load_qpel_filterb x6, x5
         sub             x2, x2, x3, lsl #1
@@ -1559,25 +1849,6 @@ endfunc
 
 #if HAVE_I8MM
 
-.macro calc_all2
-        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
-        b.eq            2f
-        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
-        b.eq            2f
-        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
-        b.eq            2f
-        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
-        b.eq            2f
-        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
-        b.eq            2f
-        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
-        b.eq            2f
-        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
-        b.eq            2f
-        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
-        b.hi            1b
-.endm
-
 function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
         add             w10, w4, #7
         lsl             x10, x10, #7
-- 
2.38.0.windows.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14  8:39 Logan.Lyu
  0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14  8:39 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 10960 bytes --]

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
         ret
 endfunc
 
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1      // vertical epel, 4 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // dst post-increment per row, in bytes
+        ldr             s16, [x1]                       // preload 3 rows of 4 bytes into v16..v18
+        ldr             s17, [x1 ,x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2       // next row into src3, src += srcstride
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all4
+        st1             {v4.4h}, [x0], x10              // store 4 int16 results, dst += x10
+.endm
+1:      calc_all4                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1      // vertical epel, 6 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)     // row advance minus the 8 bytes already added by the first store
+        ldr             d16, [x1]                       // preload 3 rows of 8 bytes into v16..v18
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2         // next row into src3, src += srcstride
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8             // outputs 0-3
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all4
+        st1             {v4.s}[2], [x0], x10            // outputs 4-5, then step dst to the next row
+.endm
+1:      calc_all4                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1      // vertical epel, 8 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // dst post-increment per row, in bytes
+        ldr             d16, [x1]                       // preload 3 rows of 8 bytes into v16..v18
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2         // next row into src3, src += srcstride
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all4
+        st1             {v4.8h}, [x0], x10              // store 8 int16 results, dst += x10
+.endm
+1:      calc_all4                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1     // vertical epel, 12 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // full row advance; the stores below do not post-increment
+        ldr             q16, [x1]                       // preload 3 rows of 16 bytes into v16..v18
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2        // next row into src3, src += srcstride
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        str             q4, [x0]                        // outputs 0-7
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all4
+        str             d5, [x0, #16]                   // outputs 8-11
+        add             x0, x0, x10                     // dst += one row
+.endm
+1:      calc_all4                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1     // vertical epel, 16 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // dst post-increment per row, in bytes
+        ldr             q16, [x1]                       // preload 3 rows of 16 bytes into v16..v18
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1            {\src3\().16b}, [x1], x2         // next row into src3, src += srcstride
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all4
+        st1             {v4.8h, v5.8h}, [x0], x10       // store 16 int16 results, dst += x10
+.endm
+1:      calc_all4                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1     // vertical epel, 24 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // dst post-increment per row, in bytes
+        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2     // preload 3 rows, each as three 8-byte column groups
+        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2
+        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2    // next row into src9..src11
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9          // columns 0-7 across the four rows
+        calc_epelb      v5, \src1, \src4, \src7, \src10         // columns 8-15
+        calc_epelb      v6, \src2, \src5, \src8, \src11         // columns 16-23
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all12
+        st1             {v4.8h-v6.8h}, [x0], x10        // store 24 int16 results, dst += x10
+.endm
+1:      calc_all12                                      // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1     // vertical epel, 32 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2)         // dst post-increment per row, in bytes
+        ld1             {v16.16b, v17.16b}, [x1], x2    // preload 3 rows, each as two 16-byte column groups
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x1], x2  // next row into src6/src7
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6
+        calc_epelb2     v5, \src0, \src2, \src4, \src6
+        calc_epelb      v6, \src1, \src3, \src5, \src7
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all8
+        st1             {v4.8h-v7.8h}, [x0], x10        // store 32 int16 results, dst += x10
+.endm
+1:      calc_all8                                       // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1     // vertical epel, 48 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             x1, x1, x2                      // src -= srcstride
+        mov             x10, #(MAX_PB_SIZE * 2 - 64)    // row advance minus the 64 bytes already added by the first store
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2   // preload 3 rows, each as three 16-byte column groups
+        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  \src0, \src3, \src6, \src9
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9
+        calc_epelb      v6,  \src1, \src4, \src7, \src10
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        st1             {v4.8h-v7.8h}, [x0], #64        // outputs 0-31
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all12
+        st1             {v28.8h-v29.8h}, [x0], x10      // outputs 32-47, then step dst to the next row
+.endm
+1:      calc_all12                                      // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1     // vertical epel, 64 px wide; epel_v prototype: x0=dst x1=src x2=srcstride w3=height x5=my
+        load_epel_filterb x5, x4                        // defined outside this hunk; presumably loads the taps selected by my (x5)
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]            // spill the low 64 bits of v8-v11 (callee-saved under AAPCS64)
+        sub             x1, x1, x2                      // src -= srcstride
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2  // preload 3 rows, each as four 16-byte column groups
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b-\src15\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
+        calc_epelb      v8,  \src2, \src6, \src10, \src14
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14
+        calc_epelb      v10, \src3, \src7, \src11, \src15
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        st1             {v4.8h-v7.8h}, [x0], #64        // outputs 0-31
+        subs            w3, w3, #1                      // --height; presumably tested by branches inside calc_all16
+        st1             {v8.8h-v11.8h}, [x0], #64       // outputs 32-63; the two stores advance dst by 128 bytes per row
+.endm
+1:      calc_all16                                      // defined outside this hunk; expected to target labels 1 and 2
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp]            // restore v8-v11
+        add             sp, sp, #32
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
         load_epel_filterb x6, x5
         sub             x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
-- 
2.38.0.windows.1

[-- Attachment #2: 0001-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_v.patch --]
[-- Type: text/plain, Size: 11109 bytes --]

From dfaaddf97b86817bc7adb50fdf0d29634b365bb1 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 16:50:29 +0800
Subject: [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
         ret
 endfunc
 
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes, used as the post-increment for x0
+        ldr             s16, [x1] // row 0: 4 source bytes
+        ldr             s17, [x1 ,x2] // row 1
+        add             x1, x1, x2, lsl #1 // src += 2 * srcstride
+        ld1             {v18.s}[0], [x1], x2 // row 2, then step src to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2 // fetch the newest row and advance src
+        movi            v4.8h, #0 // clear the accumulator
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // presumably accumulates the 4-tap filter over the four rows (macro not shown)
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.4h}, [x0], x10 // store 4 int16 results, dst += x10
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2 - 8) // rest of the dst row pitch after the 8-byte first store
+        ldr             d16, [x1] // row 0: 8 source bytes (6 are used for output)
+        ldr             d17, [x1, x2] // row 1
+        add             x1, x1, x2, lsl #1 // src += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2 // row 2, then step src to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2 // fetch the newest row and advance src
+        movi            v4.8h, #0 // clear the accumulator
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // presumably accumulates the 4-tap filter over the four rows (macro not shown)
+        st1             {v4.d}[0], [x0], #8 // store results 0-3, dst += 8
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.s}[2], [x0], x10 // store results 4-5, dst moves to the next row
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes, used as the post-increment for x0
+        ldr             d16, [x1] // row 0: 8 source bytes
+        ldr             d17, [x1, x2] // row 1
+        add             x1, x1, x2, lsl #1 // src += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2 // row 2, then step src to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2 // fetch the newest row and advance src
+        movi            v4.8h, #0 // clear the accumulator
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // presumably accumulates the 4-tap filter over the four rows (macro not shown)
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.8h}, [x0], x10 // store 8 int16 results, dst += x10
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes
+        ldr             q16, [x1] // row 0: 16 source bytes (12 are used for output)
+        ldr             q17, [x1, x2] // row 1
+        add             x1, x1, x2, lsl #1 // src += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2 // row 2, then step src to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2 // fetch the newest row and advance src
+        movi            v4.8h, #0 // clear both accumulators
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // presumably filters the low 8 bytes of each row (macro not shown)
+        calc_epelb2     v5, \src0, \src1, \src2, \src3 // presumably filters the high 8 bytes of each row
+        str             q4, [x0] // results 0-7
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        str             d5, [x0, #16] // results 8-11 (low half of v5 only)
+        add             x0, x0, x10 // dst moves to the next row
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes, used as the post-increment for x0
+        ldr             q16, [x1] // row 0: 16 source bytes
+        ldr             q17, [x1, x2] // row 1
+        add             x1, x1, x2, lsl #1 // src += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2 // row 2, then step src to row 3
+.macro calc src0, src1, src2, src3
+        ld1            {\src3\().16b}, [x1], x2 // fetch the newest row and advance src
+        movi            v4.8h, #0 // clear both accumulators
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // presumably filters the low 8 bytes of each row (macro not shown)
+        calc_epelb2     v5, \src0, \src1, \src2, \src3 // presumably filters the high 8 bytes of each row
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.8h, v5.8h}, [x0], x10 // store 16 int16 results, dst += x10
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes, used as the post-increment for x0
+        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2 // row 0: three 8-byte column groups, advance src
+        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2 // row 1
+        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2 // newest row into the last three registers, advance src
+        movi            v4.8h, #0 // clear the three accumulators
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9 // column group 0 across the four rows; presumably a 4-tap filter (macro not shown)
+        calc_epelb      v5, \src1, \src4, \src7, \src10 // column group 1
+        calc_epelb      v6, \src2, \src5, \src8, \src11 // column group 2
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all12
+        st1             {v4.8h-v6.8h}, [x0], x10 // store 24 int16 results, dst += x10
+.endm
+1:      calc_all12 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row pitch in bytes, used as the post-increment for x0
+        ld1             {v16.16b, v17.16b}, [x1], x2 // row 0: two 16-byte column groups, advance src
+        ld1             {v18.16b, v19.16b}, [x1], x2 // row 1
+        ld1             {v20.16b, v21.16b}, [x1], x2 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x1], x2 // newest row into the last two registers, advance src
+        movi            v4.8h, #0 // clear the four accumulators
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6 // group 0; presumably filters the low 8 bytes (macro not shown)
+        calc_epelb2     v5, \src0, \src2, \src4, \src6 // group 0; presumably the high 8 bytes
+        calc_epelb      v6, \src1, \src3, \src5, \src7 // group 1, low
+        calc_epelb2     v7, \src1, \src3, \src5, \src7 // group 1, high
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all8
+        st1             {v4.8h-v7.8h}, [x0], x10 // store 32 int16 results, dst += x10
+.endm
+1:      calc_all8 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        mov             x10, #64 // post-increment for the second store: 64 + 64 = 128 bytes per dst row
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2 // row 0: three 16-byte column groups, advance src
+        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2 // row 1
+        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2 // newest row into the last three registers, advance src
+        movi            v4.8h, #0 // clear the six accumulators v4-v7, v28, v29
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  \src0, \src3, \src6, \src9 // group 0; presumably filters the low 8 bytes (macro not shown)
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9 // group 0; presumably the high 8 bytes
+        calc_epelb      v6,  \src1, \src4, \src7, \src10 // group 1, low
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10 // group 1, high
+        calc_epelb      v28, \src2, \src5, \src8, \src11 // group 2, low
+        calc_epelb2     v29, \src2, \src5, \src8, \src11 // group 2, high
+        st1             {v4.8h-v7.8h}, [x0], #64 // first 32 int16 results
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all12
+        st1             {v28.8h-v29.8h}, [x0], x10 // last 16 results (32 bytes), dst moves to the next row
+.endm
+1:      calc_all12 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x5=my per the epel_v prototype
+        load_epel_filterb x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        sub             sp, sp, #32 // room for four 8-byte register halves
+        st1             {v8.8b-v11.8b}, [sp] // spill the low 64 bits of v8-v11, which are overwritten below
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2 // row 0 (64 bytes), advance src
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2 // row 1
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b-\src15\().16b}, [x1], x2 // newest row into the last four registers, advance src
+        movi            v4.8h, #0 // clear the eight accumulators v4-v11
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12 // group 0; presumably filters the low 8 bytes (macro not shown)
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12 // group 0; presumably the high 8 bytes
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13 // group 1, low
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13 // group 1, high
+        calc_epelb      v8,  \src2, \src6, \src10, \src14 // group 2, low
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14 // group 2, high
+        calc_epelb      v10, \src3, \src7, \src11, \src15 // group 3, low
+        calc_epelb2     v11, \src3, \src7, \src11, \src15 // group 3, high
+        st1             {v4.8h-v7.8h}, [x0], #64 // first 32 int16 results
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all16
+        st1             {v8.8h-v11.8h}, [x0], #64 // remaining 32 results; dst has advanced 128 bytes in total
+.endm
+1:      calc_all16 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:     	ld1             {v8.8b-v11.8b}, [sp] // restore the spilled halves of v8-v11
+        add             sp, sp, #32 // release the spill area
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
         load_epel_filterb x6, x5
         sub             x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
-- 
2.38.0.windows.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-10-14  8:39 Logan.Lyu
  0 siblings, 0 replies; 15+ messages in thread
From: Logan.Lyu @ 2023-10-14  8:39 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 13033 bytes --]

checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_epel_neon.S    | 265 ++++++++++++++++++++++
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  2 files changed, 270 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, 
export=1
          ret
  endfunc
  +
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0, x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for both tmp_array and dst
+        ldr             d16, [sp] // intermediate row 0: 4 int16 values
+        ldr             d17, [sp, x10] // row 1
+        add             sp, sp, x10, lsl #1 // sp += 2 rows
+        ld1             {v18.4h}, [sp], x10 // row 2, sp steps to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().4h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4, \src0, \src1, \src2, \src3 // presumably the 4-tap filter over the four rows into v4 (macro not shown)
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.4h}, [x0], x10 // store 4 int16 results, dst += x10
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0,  x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x5, #120 // x5 reused: rest of the dst row after the 8-byte first store (8 + 120 = 128)
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for tmp_array
+        ldr             q16, [sp] // intermediate row 0: 8 int16 values (6 are used for output)
+        ldr             q17, [sp, x10] // row 1
+        add             sp, sp, x10, lsl #1 // sp += 2 rows
+        ld1             {v18.8h}, [sp], x10 // row 2, sp steps to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4,     \src0, \src1, \src2, \src3 // presumably filters the low four lanes into v4 (macro not shown)
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3 // presumably filters the high four lanes into v4 with v5 as temporary - confirm
+        st1             {v4.d}[0], [x0], #8 // store results 0-3, dst += 8
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.s}[2], [x0], x5 // store results 4-5, dst moves to the next row
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0, x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for both tmp_array and dst
+        ldr             q16, [sp] // intermediate row 0: 8 int16 values
+        ldr             q17, [sp, x10] // row 1
+        add             sp, sp, x10, lsl #1 // sp += 2 rows
+        ld1             {v18.8h}, [sp], x10 // row 2, sp steps to row 3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4,     \src0, \src1, \src2, \src3 // presumably filters the low four lanes into v4 (macro not shown)
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3 // presumably filters the high four lanes into v4 with v5 as temporary - confirm
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all4
+        st1             {v4.8h}, [x0], x10 // store 8 int16 results, dst += x10
+.endm
+1:      calc_all4 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0, x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x5, #112 // x5 reused: rest of the dst row after the 16-byte first store (16 + 112 = 128)
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for tmp_array
+        ld1             {v16.8h, v17.8h}, [sp], x10 // intermediate row 0: 16 int16 values (12 are used for output)
+        ld1             {v18.8h, v19.8h}, [sp], x10 // row 1
+        ld1             {v20.8h, v21.8h}, [sp], x10 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4,     \src0, \src2, \src4, \src6 // first register of each row; presumably the low four lanes (macro not shown)
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6 // presumably the high four lanes into v4 with v5 as temporary - confirm
+        calc_epelh      v5,     \src1, \src3, \src5, \src7 // second register of each row, low four lanes only
+        st1             {v4.8h}, [x0], #16 // store results 0-7, dst += 16
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all8
+        st1             {v5.4h}, [x0], x5 // store results 8-11, dst moves to the next row
+.endm
+1:      calc_all8 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0, x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for both tmp_array and dst
+        ld1             {v16.8h, v17.8h}, [sp], x10 // intermediate row 0: 16 int16 values
+        ld1             {v18.8h, v19.8h}, [sp], x10 // row 1
+        ld1             {v20.8h, v21.8h}, [sp], x10 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4,     \src0, \src2, \src4, \src6 // first register of each row; presumably the low four lanes (macro not shown)
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6 // presumably the high four lanes into v4 with v5 as temporary - confirm
+        calc_epelh      v5,     \src1, \src3, \src5, \src7 // second register of each row, low lanes
+        calc_epelh2     v5, v6, \src1, \src3, \src5, \src7 // second register of each row, high lanes
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all8
+        st1             {v4.8h, v5.8h}, [x0], x10 // store 16 int16 results, dst += x10
+.endm
+1:      calc_all8 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon_i8mm, export=1 // x0=dst (int16_t *), x1=src, x2=srcstride, w3=height, x4=mx, x5=my per the epel_hv prototype
+        add             w10, w3, #3 // height + 3 intermediate rows
+        lsl             x10, x10, #7 // times 128 bytes per intermediate row
+        sub             sp, sp, x10 // tmp_array: (height + 3) * 128 bytes on the stack
+        stp             x5, x30, [sp, #-32]! // save my and the return address below tmp_array
+        stp             x0, x3, [sp, #16] // save dst and height
+        add             x0, sp, #32 // the horizontal pass writes into tmp_array
+        sub             x1, x1, x2 // src -= srcstride: start one row above the block
+        add             w3, w3, #3 // the horizontal pass covers height + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             x5, x30, [sp] // restore my and the return address
+        ldp             x0, x3, [sp, #16] // restore dst and height
+        add             sp, sp, #32 // sp now points at tmp_array row 0
+        load_epel_filterh x5, x4 // macro defined outside this hunk; presumably loads the filter taps selected by x5 with x4 as scratch - confirm
+        mov             x10, #(MAX_PB_SIZE * 2) // row pitch in bytes for both tmp_array and dst
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10 // intermediate row 0: 24 int16 values
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10 // row 1
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10 // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, 
src10, src11
+        ld1             {\src9\().8h-\src11\().8h}, [sp], x10 // newest intermediate row; sp walks upward through tmp_array
+        calc_epelh      v4,     \src0, \src3, \src6, \src9 // first register of each row; presumably the low four lanes (macro not shown)
+        calc_epelh2     v4, v5, \src0, \src3, \src6, \src9 // presumably the high four lanes into v4 with v5 as temporary - confirm
+        calc_epelh      v5,     \src1, \src4, \src7, \src10 // second register of each row, low lanes
+        calc_epelh2     v5, v6, \src1, \src4, \src7, \src10 // second register of each row, high lanes
+        calc_epelh      v6,     \src2, \src5, \src8, \src11 // third register of each row, low lanes
+        calc_epelh2     v6, v7, \src2, \src5, \src8, \src11 // third register of each row, high lanes
+        subs            w3, w3, #1 // one row done; flags presumably consumed inside calc_all12
+        st1             {v4.8h-v6.8h}, [x0], x10 // store 24 int16 results, dst += x10
+.endm
+1:      calc_all12 // macro defined elsewhere; presumably expands calc with rotating row registers and leaves via label 2 - confirm
+.purgem calc // drop the macro so the next function can define its own calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv32_8_neon_i8mm, export=1 // 32-wide block handled as two calls to the 16-wide hv function
+        stp             x4, x5, [sp, #-64]! // save mx, my (64-byte frame)
+        stp             x2, x3, [sp, #16] // save srcstride, height
+        stp             x0, x1, [sp, #32] // save dst, src
+        str             x30, [sp, #48] // save the return address
+        mov             x6, #16 // x6 is the width argument slot of the prototype
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 0-15
+        ldp             x4, x5, [sp] // reload the arguments for the second call
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48 // release everything except the x30 slot
+        add             x0, x0, #32 // dst += 16 int16 columns
+        add             x1, x1, #16 // src += 16 bytes
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 16-31
+        ldr             x30, [sp], #16 // restore the return address and pop the last 16 bytes
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon_i8mm, export=1 // 48-wide block handled as two calls to the 24-wide hv function
+        stp             x4, x5, [sp, #-64]! // save mx, my (64-byte frame)
+        stp             x2, x3, [sp, #16] // save srcstride, height
+        stp             x0, x1, [sp, #32] // save dst, src
+        str             x30, [sp, #48] // save the return address
+        mov             x6, #24 // x6 is the width argument slot of the prototype
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm) // columns 0-23
+        ldp             x4, x5, [sp] // reload the arguments for the second call
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48 // release everything except the x30 slot
+        add             x0, x0, #48 // dst += 24 int16 columns
+        add             x1, x1, #24 // src += 24 bytes
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm) // columns 24-47
+        ldr             x30, [sp], #16 // restore the return address and pop the last 16 bytes
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon_i8mm, export=1 // 64-wide block handled as four calls to the 16-wide hv function
+        stp             x4, x5, [sp, #-64]! // save mx, my (64-byte frame)
+        stp             x2, x3, [sp, #16] // save srcstride, height
+        stp             x0, x1, [sp, #32] // save dst, src
+        str             x30, [sp, #48] // save the return address
+        mov             x6, #16 // x6 is the width argument slot of the prototype
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 0-15
+        ldp             x4, x5, [sp] // reload the original arguments before every further call
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #32 // dst += 16 int16 columns
+        add             x1, x1, #16 // src += 16 bytes
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 16-31
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #64 // dst += 32 int16 columns
+        add             x1, x1, #32 // src += 32 bytes
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 32-47
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48 // last reload done: release everything except the x30 slot
+        add             x0, x0, #96 // dst += 48 int16 columns
+        add             x1, x1, #48 // src += 48 bytes
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm) // columns 48-63
+        ldr             x30, [sp], #16 // restore the return address and pop the last 16 bytes
+        ret
+endfunc
+
  function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
          add             w10, w4, #3
          lsl             x10, x10, #7
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 82e1623a67..e9a341ecb9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -191,6 +191,10 @@ NEON8_FNPROTO(epel_h, (int16_t *dst,
          const uint8_t *_src, ptrdiff_t _srcstride,
          int height, intptr_t mx, intptr_t my, int width), _i8mm);
  +NEON8_FNPROTO(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
  NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
          const uint8_t *_src, ptrdiff_t _srcstride,
          int height, int denom, int wx, int ox,
@@ -322,6 +326,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)
           if (have_i8mm(cpu_flags)) {
              NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
              NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, 
_i8mm);
              NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h 
,_i8mm);
              NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
-- 
2.38.0.windows.1

[-- Attachment #2: 0002-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_hv.patch --]
[-- Type: text/plain, Size: 13185 bytes --]

From 83af7e79cf004c244ad3c771a0ca0e2357bbe944 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 21:29:51 +0800
Subject: [PATCH 2/4] lavc/aarch64: new optimization for 8-bit hevc_epel_hv

checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 265 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 270 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().4h}, [sp], x10
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1
+        st1             {v4.4h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0,  x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x5, #120
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8
+        subs            w3, w3, #1
+        st1             {v4.s}[2], [x0], x5
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1
+        st1             {v4.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x5, #112
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src2, \src4, \src6
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6
+        calc_epelh      v5,     \src1, \src3, \src5, \src7
+        st1             {v4.8h}, [x0], #16
+        subs            w3, w3, #1
+        st1             {v5.4h}, [x0], x5
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src2, \src4, \src6
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6
+        calc_epelh      v5,     \src1, \src3, \src5, \src7
+        calc_epelh2     v5, v6, \src1, \src3, \src5, \src7
+        subs            w3, w3, #1
+        st1             {v4.8h, v5.8h}, [x0], x10
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!
+        stp             x0, x3, [sp, #16]
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             w3, w3, #3
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8h-\src11\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src3, \src6, \src9
+        calc_epelh2     v4, v5, \src0, \src3, \src6, \src9
+        calc_epelh      v5,     \src1, \src4, \src7, \src10
+        calc_epelh2     v5, v6, \src1, \src4, \src7, \src10
+        calc_epelh      v6,     \src2, \src5, \src8, \src11
+        calc_epelh2     v6, v7, \src2, \src5, \src8, \src11
+        subs            w3, w3, #1
+        st1             {v4.8h-v6.8h}, [x0], x10
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv32_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        add             x0, x0, #32
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        add             x0, x0, #48
+        add             x1, x1, #24
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        str             x30, [sp, #48]
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #32
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #64
+        add             x1, x1, #32
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        add             x0, x0, #96
+        add             x1, x1, #48
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+        ldr             x30, [sp], #16
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
         add             w10, w4, #3
         lsl             x10, x10, #7
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 82e1623a67..e9a341ecb9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -191,6 +191,10 @@ NEON8_FNPROTO(epel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -322,6 +326,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
         if (have_i8mm(cpu_flags)) {
             NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
-- 
2.38.0.windows.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2023-07-17  7:08 Водянников Александр
  0 siblings, 0 replies; 15+ messages in thread
From: Водянников Александр @ 2023-07-17  7:08 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 0001-Fixed-crash-when-using-hardware-acceleration-in-thir.txt --]
[-- Type: text/plain, Size: 1766 bytes --]

From 0fe666c4e3d10a689f4c6854a58eec3e7ff3c922 Mon Sep 17 00:00:00 2001
From: Aleksoid <Aleksoid1978@mail.ru>
Date: Mon, 17 Jul 2023 17:04:43 +1000
Subject: [PATCH] Fixed crash when using hardware acceleration in third party
 projects without using hw_frames_ctx.

---
 libavcodec/decode.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/libavcodec/decode.c b/libavcodec/decode.c
index a19cca1a7c..f34f169910 100644
--- a/libavcodec/decode.c
+++ b/libavcodec/decode.c
@@ -1802,18 +1802,21 @@ AVBufferRef *ff_hwaccel_frame_priv_alloc(AVCodecContext *avctx,
                                          const AVHWAccel *hwaccel)
 {
     AVBufferRef *ref;
-    AVHWFramesContext *frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
-    uint8_t *data = av_mallocz(hwaccel->frame_priv_data_size);
-    if (!data)
-        return NULL;
-
-    ref = av_buffer_create(data, hwaccel->frame_priv_data_size,
-                           hwaccel->free_frame_priv,
-                           frames_ctx->device_ctx, 0);
-    if (!ref) {
-        av_free(data);
-        return NULL;
-    }
+    if (avctx->hw_frames_ctx) {
+        AVHWFramesContext *frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+        uint8_t *data = av_mallocz(hwaccel->frame_priv_data_size);
+        if (!data)
+            return NULL;
+
+        ref = av_buffer_create(data, hwaccel->frame_priv_data_size,
+                               hwaccel->free_frame_priv,
+                               frames_ctx->device_ctx, 0);
+        if (!ref) {
+            av_free(data);
+            return NULL;
+        }
+    } else
+        ref = av_buffer_allocz(hwaccel->frame_priv_data_size);
 
     return ref;
 }
-- 
2.41.0.windows.1


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread
* [FFmpeg-devel] (no subject)
@ 2022-07-17 13:32 facefunk
  0 siblings, 0 replies; 15+ messages in thread
From: facefunk @ 2022-07-17 13:32 UTC (permalink / raw)
  To: ffmpeg-devel

Hi FFMDevs,

I've managed to get forced mov_text subtitles working in VLC Player: -disposition:s:0 +forced is honored, but I'm not 100% sure about my approach.

The attached patch represents the best idea I have come up with so far: the code is minimal and it doesn't require the user to set any extra parameters. However, it does puncture an abstraction boundary ever so slightly by copying stream data to the codec; perhaps this isn't a problem.

If there's anybody who could look over my patch and let me know if there's a better way of going about this, that would be greatly appreciated. 

Love your work!

Kind regards,

facefunk

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2024-08-07 15:59 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-09 14:25 [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing aline.gondimsantos
2023-02-09 14:29 ` Nicolas George
2023-02-09 18:19 ` [FFmpeg-devel] (no subject) Aline Gondim Santos
2023-02-09 18:19   ` [FFmpeg-devel] [PATCH] avdevice/xcbgrab: enable window resizing Aline Gondim Santos
2023-02-09 18:29     ` Aline Gondim Santos Gondim Santos
2023-02-09 20:09       ` Marton Balint
  -- strict thread matches above, loose matches on Subject: below --
2024-08-07 15:58 [FFmpeg-devel] (no subject) cyfdel-at-hotmail.com
2024-04-18  9:42 pengxu
2024-04-18  7:36 pengxu
2023-10-14  8:40 Logan.Lyu
2023-10-14  8:40 Logan.Lyu
2023-10-14  8:39 Logan.Lyu
2023-10-14  8:39 Logan.Lyu
2023-07-17  7:08 Водянников Александр
2022-07-17 13:32 facefunk

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git