Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL
@ 2024-05-16 16:48 Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
  To: ffmpeg-devel

vtype_vli computes the VTYPE value with the optimal LMUL for a given
element width, tail and mask policies and a run-time vector length.

vtype_ivli does the same, but with a compile-time constant vector
length.

vwtypei and vntypei can be used to widen or narrow a VTYPE value for
use in mixed-width vector-optimised functions.
---
 libavutil/riscv/asm.S | 166 +++++++++++++++++++++++++++++-------------
 1 file changed, 117 insertions(+), 49 deletions(-)

diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S
index 14be5055f5..1e6358dcb5 100644
--- a/libavutil/riscv/asm.S
+++ b/libavutil/riscv/asm.S
@@ -96,77 +96,145 @@
         .endm
 #endif
 
-        /* Convenience macro to load a Vector type (vtype) as immediate */
-        .macro  lvtypei rd, e, m=m1, tp=tu, mp=mu
+#if defined (__riscv_v_elen)
+# define RV_V_ELEN __riscv_v_elen
+#else
+/* Run-time detection of the V extension implies ELEN >= 64. */
+# define RV_V_ELEN 64
+#endif
+#if RV_V_ELEN == 32
+# define VSEW_MAX 2
+#else
+# define VSEW_MAX 3
+#endif
 
-        .ifc \e,e8
-        .equ ei, 0
+        .macro  parse_vtype ew, tp, mp
+        .ifc    \ew,e8
+        .equ    vsew, 0
         .else
-        .ifc \e,e16
-        .equ ei, 8
+        .ifc    \ew,e16
+        .equ    vsew, 1
         .else
-        .ifc \e,e32
-        .equ ei, 16
+        .ifc    \ew,e32
+        .equ    vsew, 2
         .else
-        .ifc \e,e64
-        .equ ei, 24
+        .ifc    \ew,e64
+        .equ    vsew, 3
         .else
-        .error "Unknown element type"
+        .error  "Unknown element width \ew"
         .endif
         .endif
         .endif
         .endif
 
-        .ifc \m,m1
-        .equ mi, 0
-        .else
-        .ifc \m,m2
-        .equ mi, 1
-        .else
-        .ifc \m,m4
-        .equ mi, 2
+        .ifc    \tp,tu
+        .equ    tp, 0
         .else
-        .ifc \m,m8
-        .equ mi, 3
+        .ifc    \tp,ta
+        .equ    tp, 1
         .else
-        .ifc \m,mf8
-        .equ mi, 5
-        .else
-        .ifc \m,mf4
-        .equ mi, 6
-        .else
-        .ifc \m,mf2
-        .equ mi, 7
-        .else
-        .error "Unknown multiplier"
-        .equ mi, 3
-        .endif
-        .endif
-        .endif
-        .endif
-        .endif
+        .error  "Unknown tail policy \tp"
         .endif
         .endif
 
-        .ifc \tp,tu
-        .equ tpi, 0
+        .ifc    \mp,mu
+        .equ    mp, 0
         .else
-        .ifc \tp,ta
-        .equ tpi, 64
+        .ifc    \mp,ma
+        .equ    mp, 1
         .else
-        .error "Unknown tail policy"
+        .error  "Unknown mask policy \mp"
         .endif
         .endif
+        .endm
 
-        .ifc \mp,mu
-        .equ mpi, 0
-        .else
-        .ifc \mp,ma
-        .equ mpi, 128
+        /**
+         * Gets the vector type with the smallest suitable LMUL value.
+         * @param[out] rd vector type destination register
+         * @param avl vector length constant
+         * @param ew element width: e8, e16, e32 or e64
+         * @param tp tail policy: tu or ta
+         * @param mp mask policy: mu or ma
+         */
+        .macro  vtype_ivli rd, avl, ew, tp=tu, mp=mu
+        .if     \avl <= 1
+        .equ    log2vl, 0
+        .elseif \avl <= 2
+        .equ    log2vl, 1
+        .elseif \avl <= 4
+        .equ    log2vl, 2
+        .elseif \avl <= 8
+        .equ    log2vl, 3
+        .elseif \avl <= 16
+        .equ    log2vl, 4
+        .elseif \avl <= 32
+        .equ    log2vl, 5
+        .elseif \avl <= 64
+        .equ    log2vl, 6
+        .elseif \avl <= 128
+        .equ    log2vl, 7
         .else
-        .error "Unknown mask policy"
+        .error  "Vector length \avl out of range"
         .endif
+        parse_vtype \ew, \tp, \mp
+        csrr    \rd, vlenb
+        clz     \rd, \rd
+        addi    \rd, \rd, log2vl + 1 + VSEW_MAX - __riscv_xlen
+        max     \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+        .if     vsew < VSEW_MAX
+        addi    \rd, \rd, vsew - VSEW_MAX
+        andi    \rd, \rd, 7
         .endif
+        ori     \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+        .endm
+
+        /**
+         * Gets the vector type with the smallest suitable LMUL value.
+         * @param[out] rd vector type destination register
+         * @param rs vector length source register
+         * @param[out] tmp temporary register to be clobbered
+         * @param ew element width: e8, e16, e32 or e64
+         * @param tp tail policy: tu or ta
+         * @param mp mask policy: mu or ma
+         */
+        .macro  vtype_vli rd, rs, tmp, ew, tp=tu, mp=mu
+        parse_vtype \ew, \tp, \mp
+        /*
+         * The difference between the CLZ's notionally equals the VLMUL value
+         * for 4-bit elements. But we want the value for SEW_MAX-bit elements.
+         */
+        slli    \tmp, \rs, 1 + VSEW_MAX
+        csrr    \rd, vlenb
+        addi    \tmp, \tmp, -1
+        clz     \rd, \rd
+        clz     \tmp, \tmp
+        sub     \rd, \rd, \tmp
+        max     \rd, \rd, zero // VLMUL must be >= VSEW - VSEW_MAX
+        .if     vsew < VSEW_MAX
+        addi    \rd, \rd, vsew - VSEW_MAX
+        andi    \rd, \rd, 7
+        .endif
+        ori     \rd, \rd, (vsew << 3) | (tp << 6) | (mp << 7)
+        .endm
+
+        /**
+         * Widens a vector type.
+         * @param[out] rd widened vector type destination register
+         * @param rs vector type source register
+         * @param n number of times to widen (once by default)
+         */
+        .macro  vwtypei rd, rs, n=1
+        xori    \rd, \rs, 4
+        addi    \rd, \rd, (\n) * 011
+        xori    \rd, \rd, 4
+        .endm
 
-        li      \rd, (ei | mi | tpi | mpi)
+        /**
+         * Narrows a vector type.
+         * @param[out] rd narrowed vector type destination register
+         * @param rs vector type source register
+         * @param n number of times to narrow (once by default)
+         */
+        .macro  vntypei rd, rs, n=1
+        vwtypei \rd, \rs, -(\n)
         .endm
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16
  2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
  2 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
  To: ffmpeg-devel

This calculates the optimal vector type value at run-time based on the
hardware vector length and the FLAC LPC prediction order. In this
particular case, the additional computation is easily amortised over
the loop iterations:

T-Head C908:
    C     V before  V after
 1   48.0    214.7     95.2
 2   64.7    214.2     94.7
 3   79.7    213.5     94.5
 4   96.2    196.5     94.2 #
 5  111.0    195.7    118.5
 6  127.0    211.2    102.0
 7  143.7    194.2    101.5
 8  175.7    193.2    101.2 #
 9  176.2    224.2    126.0
10  191.5    192.0    125.5
11  224.5    191.2    124.7
12  223.0    190.2    124.2
13  239.2    189.5    123.7
14  253.7    188.7    139.5
15  286.2    188.0    122.7
16  284.0    187.0    122.5 #
17  300.2    186.5    186.5
18  314.0    185.5    185.7
19  329.7    184.7    185.0
20  343.0    184.2    184.2
21  358.7    199.2    183.7
22  371.7    182.7    182.7
23  387.5    181.7    182.0
24  400.7    181.0    181.2
25  431.5    180.2    196.5
26  443.7    195.5    196.0
27  459.0    178.7    196.2
28  470.7    177.7    194.2
29  470.0    177.0    193.5
30  481.2    176.2    176.5
31  496.2    175.5    175.7
32  507.2    174.7    191.0 #

 # Power of two boundary.

With 128-bit vectors, improvements are expected for the first two
test cases only. For the other two, there is overhead but below noise.
Improvements should be better observable with prediction order of 8
and less, or on hardware with larger vector sizes.

The same optimisation strategy should be applicable to LPC32
(and work-in-progress LPC33), but is left as a future exercise.

flac

lpc16
---
 libavcodec/riscv/flacdsp_init.c | 2 +-
 libavcodec/riscv/flacdsp_rvv.S  | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 4f1652dbe7..735aec0691 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -71,7 +71,7 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
     if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         int vlenb = ff_get_rv_vlenb();
 
-        if (vlenb >= 16)
+        if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
             c->lpc16 = ff_flac_lpc16_rvv;
 
 # if (__riscv_xlen >= 64)
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index 6287faa260..e1a20ce8e1 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -20,8 +20,9 @@
 
 #include "libavutil/riscv/asm.S"
 
-func ff_flac_lpc16_rvv, zve32x
-        vsetvli zero, a2, e32, m8, ta, ma
+func ff_flac_lpc16_rvv, zve32x, zbb
+        vtype_vli t0, a2, t2, e32, ta, ma
+        vsetvl  zero, a2, t0
         vle32.v v8, (a1)
         sub     a4, a4, a2
         vle32.v v16, (a0)
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32
  2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
  2 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
  To: ffmpeg-devel

This is pretty much the same as for lpc16, though it only improves
prediction orders half as large. With 128-bit vectors, this gives:

   C      V old  V new
1   69.2  181.5   95.5
2  107.7  180.7   95.2
3  145.5  180.0  103.5
4  183.0  179.2  102.7
5  220.7  178.5  128.0
6  257.7  194.0  127.5
7  294.5  193.7  126.7
8  331.0  193.0  126.5

Larger prediction orders see no significant changes at that size.
---
 libavcodec/riscv/flacdsp_init.c | 15 ++++++++-------
 libavcodec/riscv/flacdsp_rvv.S  | 12 +++++++-----
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 735aec0691..830ae36534 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -71,17 +71,18 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
     if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         int vlenb = ff_get_rv_vlenb();
 
-        if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
+        if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16) {
             c->lpc16 = ff_flac_lpc16_rvv;
 
 # if (__riscv_xlen >= 64)
-        if (flags & AV_CPU_FLAG_RVV_I64) {
-            if (vlenb > 16)
-                c->lpc32 = ff_flac_lpc32_rvv_simple;
-            else
-                c->lpc32 = ff_flac_lpc32_rvv;
-        }
+            if (flags & AV_CPU_FLAG_RVV_I64) {
+                if (vlenb > 16)
+                    c->lpc32 = ff_flac_lpc32_rvv_simple;
+                else
+                    c->lpc32 = ff_flac_lpc32_rvv;
+            }
 # endif
+        }
 
         c->wasted32 = ff_flac_wasted32_rvv;
 
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index e1a20ce8e1..2941928465 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -76,22 +76,24 @@ func ff_flac_lpc32_rvv, zve64x
         ret
 endfunc
 
-func ff_flac_lpc32_rvv_simple, zve64x
-        vsetivli zero, 1, e64, m1, ta, ma
+func ff_flac_lpc32_rvv_simple, zve64x, zbb
+        vtype_vli t3, a2, t1, e64, ta, ma
+        vntypei t2, t3
+        vsetvl  zero, a2, t3 // e64
         vmv.s.x v0, zero
-        vsetvli zero, a2, e32, m4, ta, ma
+        vsetvl  zero, zero, t2 // e32
         vle32.v v8, (a1)
         sub     a4, a4, a2
         vle32.v v16, (a0)
         sh2add  a0, a2, a0
 1:
         vwmul.vv v24, v8, v16
-        vsetvli zero, zero, e64, m8, ta, ma
+        vsetvl  zero, zero, t3 // e64
         vredsum.vs v24, v24, v0
         lw      t0, (a0)
         addi    a4, a4, -1
         vmv.x.s t1, v24
-        vsetvli zero, zero, e32, m4, ta, ma
+        vsetvl  zero, zero, t2 // e32
         sra     t1, t1, a3
         add     t0, t0, t1
         vslide1down.vx v16, v16, t0
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32
  2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
@ 2024-05-16 16:48 ` Rémi Denis-Courmont
  2024-05-16 17:14   ` Rémi Denis-Courmont
  2 siblings, 1 reply; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 16:48 UTC (permalink / raw)
  To: ffmpeg-devel

T-Head C908:
add_hfyu_left_pred_bgr32_c:       237.5
add_hfyu_left_pred_bgr32_rvv_i32: 173.5 (before)
add_hfyu_left_pred_bgr32_rvv_i32: 110.0 (after)
---
 libavcodec/riscv/huffyuvdsp_rvv.S | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/riscv/huffyuvdsp_rvv.S b/libavcodec/riscv/huffyuvdsp_rvv.S
index 9c4434907d..d334f5c6d0 100644
--- a/libavcodec/riscv/huffyuvdsp_rvv.S
+++ b/libavcodec/riscv/huffyuvdsp_rvv.S
@@ -36,8 +36,10 @@ func ff_add_int16_rvv, zve32x
         ret
 endfunc
 
-func ff_add_hfyu_left_pred_bgr32_rvv, zve32x
-        vsetivli zero, 4, e8, m1, ta, ma
+func ff_add_hfyu_left_pred_bgr32_rvv, zve32x, zbb
+        vtype_ivli t1, 4, e8, ta, ma
+        li      t0, 4
+        vsetvl  zero, t0, t1
         vle8.v  v8, (a3)
         sh2add  a2, a2, a1
 1:
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32
  2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
@ 2024-05-16 17:14   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 5+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-16 17:14 UTC (permalink / raw)
  To: ffmpeg-devel

Le torstaina 16. toukokuuta 2024, 19.48.40 EEST Rémi Denis-Courmont a écrit :
> T-Head C908:
> add_hfyu_left_pred_bgr32_c:       237.5
> add_hfyu_left_pred_bgr32_rvv_i32: 173.5 (before)
> add_hfyu_left_pred_bgr32_rvv_i32: 110.0 (after)
> ---
>  libavcodec/riscv/huffyuvdsp_rvv.S | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/riscv/huffyuvdsp_rvv.S
> b/libavcodec/riscv/huffyuvdsp_rvv.S index 9c4434907d..d334f5c6d0 100644
> --- a/libavcodec/riscv/huffyuvdsp_rvv.S
> +++ b/libavcodec/riscv/huffyuvdsp_rvv.S
> @@ -36,8 +36,10 @@ func ff_add_int16_rvv, zve32x
>          ret
>  endfunc
> 
> -func ff_add_hfyu_left_pred_bgr32_rvv, zve32x
> -        vsetivli zero, 4, e8, m1, ta, ma
> +func ff_add_hfyu_left_pred_bgr32_rvv, zve32x, zbb
> +        vtype_ivli t1, 4, e8, ta, ma
> +        li      t0, 4
> +        vsetvl  zero, t0, t1
>          vle8.v  v8, (a3)
>          sh2add  a2, a2, a1
>  1:

Missing Zbb check in init.c will be added.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-05-16 17:14 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-16 16:48 [FFmpeg-devel] [PATCHv4 1/4] lavu/riscv: add assembler macros for adjusting vector LMUL Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 2/4] lavc/flacdsp: optimise RVV vector type for lpc16 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 3/4] lavc/flacdsp: optimise RVV vector type for lpc32 Rémi Denis-Courmont
2024-05-16 16:48 ` [FFmpeg-devel] [PATCHv4 4/4] lavc/huffyuvdsp: optimise RVV vtype for add_hfyu_left_pred_bgr32 Rémi Denis-Courmont
2024-05-16 17:14   ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git