1 |
A shallow git clone is a clone of the repository which keeps only |
2 |
the latest commit in the branch. Despite many disadvantages |
3 |
and potential issues, they are very popular as they allow our users to |
4 |
save both space and bandwidth. Therefore, they are supported natively |
5 |
and used by default in git-r3. |
6 |
|
7 |
While it is straightforward to perform the first shallow clone, updating |
8 |
shallow clones is a more problematic matter. Shallow clones can be |
9 |
either updated using plain 'git fetch' or 'git fetch --depth 1'. |
10 |
|
11 |
Using 'git fetch' is the recommended way. In this case, new commits are |
12 |
fetched just like in a regular git repo. However, in a very outdated clone |
13 |
this may involve fetching a number of objects that are no longer |
14 |
relevant. Worse than that, if there are no common ancestors between |
15 |
the clone and upstream (this may happen due to branch switch or force |
16 |
push upstream), 'git fetch' starts fetching all the commits like with |
17 |
a regular clone. |
18 |
|
19 |
'git fetch --depth 1' re-creates a shallow clone starting with the newest |
20 |
commit. That is, it not only fetches new commits but also discards all |
21 |
the old commits. However, either due to protocol limitations or |
22 |
intentional behavior, 'git fetch --depth 1' can re-fetch objects that |
23 |
were fetched already, resulting in a major bandwidth loss and slowdown. |
24 |
|
25 |
Due to this issue, git-r3 uses plain 'git fetch' on subsequent updates |
26 |
to the repository. However, as noted above, this may cause |
27 |
re-downloading the repository history unintentionally. This patch aims |
28 |
to solve this. |
29 |
|
30 |
The patch introduces a 'smart fetch' concept that chooses between 'git |
31 |
fetch' and 'git fetch --depth 1' based on transferred object count. |
32 |
Since the object count is transferred as progress information by |
33 |
the remote server and is not really obligatory, the smart fetch function |
34 |
works the following way: |
35 |
|
36 |
1. plain 'git fetch' is launched to fetch new commits, |
37 |
|
38 |
2. the progress output is teed to a sed pipe that looks for the object |
39 |
count and writes it to a state file, |
40 |
|
41 |
3. a parallel process waits for the state file to be written. When it is |
42 |
written, it launches 'git fetch --dry-run --depth 1' to obtain |
43 |
the respective object count. |
44 |
|
45 |
4. if the count of (--depth 1) fetch <= 0.75*count of plain fetch, |
46 |
the plain fetch is interrupted and shallow fetch is started instead. |
47 |
|
48 |
The usual overhead of this is an extra connection on the client side and commit |
49 |
counting and possibly some compression on server side (git is killed |
50 |
via SIGPIPE when sed gets the commit count). |
51 |
|
52 |
If '--depth 1' seems beneficial, the first fetch is usually killed |
53 |
before the compression finishes on server side, therefore avoiding waste |
54 |
of bandwidth on client. |
55 |
|
56 |
The factor of 0.75 was chosen arbitrarily and may change (note that the code below actually tests main >= 3/2 * sub, i.e. a factor of about 0.67). It should be |
57 |
noted that we operate purely on object counts and not sizes. Therefore, |
58 |
we need to assume that objects fetched by '--depth 1' may be |
59 |
significantly larger than those by incremental fetch. |
60 |
|
61 |
What are your thoughts? |
62 |
--- |
63 |
gx86/eclass/git-r3.eclass | 98 +++++++++++++++++++++++++++++++++++++++++++++-- |
64 |
1 file changed, 95 insertions(+), 3 deletions(-) |
65 |
|
66 |
diff --git a/gx86/eclass/git-r3.eclass b/gx86/eclass/git-r3.eclass |
67 |
index 8585252..a076da7 100644 |
68 |
--- a/gx86/eclass/git-r3.eclass |
69 |
+++ b/gx86/eclass/git-r3.eclass |
70 |
@@ -247,6 +247,93 @@ _git-r3_set_submodules() { |
71 |
done < <(echo "${data}" | git config -f /dev/fd/0 -l) |
72 |
} |
73 |
|
74 |
+# @FUNCTION: _git-r3_smart_fetch |
75 |
+# @USAGE: <git-fetch-args>... |
76 |
+# @DESCRIPTION: |
77 |
+# Try fetching without '--depth' and switch to '--depth 1' if that |
78 |
+# will involve fewer objects fetched. |
79 |
+_git-r3_smart_fetch() { |
80 |
+ debug-print-function ${FUNCNAME} "$@" |
81 |
+ |
82 |
+ local sed_regexp='.*Counting objects: \([0-9]*\), done\..*' |
83 |
+ |
84 |
+ # start the main fetch |
85 |
+ local cmd=( git fetch --progress "${@}" ) |
86 |
+ echo "${cmd[@]}" >&2 |
87 |
+ |
88 |
+ # we copy the output to the 'sed' pipe for parsing. whenever sed finds |
89 |
+ # the object count, it quits quickly to avoid delays in writing it. |
90 |
+ # then, we start a dummy 'cat' to keep the pipe alive |
91 |
+ |
92 |
+ "${cmd[@]}" 2>&1 \ |
93 |
+ | tee >( |
94 |
+ sed -n -e "/${sed_regexp}/{s/${sed_regexp}/\1/p;q}" \ |
95 |
+ > "${T}"/git-r3_main.count |
96 |
+ exec cat >/dev/null |
97 |
+ ) & |
98 |
+ local main_pid=${!} |
99 |
+ |
100 |
+ # start the helper process |
101 |
+ _git-r3_sub_fetch() { |
102 |
+ # wait for main fetch to get object count; if the server doesn't |
103 |
+ # output it, we won't even launch the parallel process |
104 |
+ while [[ ! -s ${T}/git-r3_main.count ]]; do |
105 |
+ sleep 0.25 |
106 |
+ done |
107 |
+ |
108 |
+ # ok, let's see if parallel fetch gives us smaller count |
109 |
+ # --dry-run will prevent it from writing to the local clone |
110 |
+ # and sed should terminate git with SIGPIPE |
111 |
+ local sub_count=$(git fetch --progress --dry-run --depth 1 "${@}" 2>&1 \ |
112 |
+ | sed -n -e "/${sed_regexp}/{s/${sed_regexp}/\1/p;q}") |
113 |
+ local main_count=$(<"${T}"/git-r3_main.count) |
114 |
+ |
115 |
+ # let's be real sure that '--depth 1' will be good for us. |
116 |
+ # note that we have purely objects counts, and '--depth 1' |
117 |
+ # may involve much bigger objects |
118 |
+ if [[ ${main_count} && ${main_count} -ge $(( sub_count * 3/2 )) ]] |
119 |
+ then |
120 |
+ # signal that we want shallow fetch instead, |
121 |
+ # and terminate the non-shallow fetch process |
122 |
+ touch "${T}"/git-r3_want_shallow || die |
123 |
+ kill ${main_pid} &>/dev/null |
124 |
+ exit 0 |
125 |
+ fi |
126 |
+ |
127 |
+ exit 1 |
128 |
+ } |
129 |
+ _git-r3_sub_fetch "${@}" & |
130 |
+ local sub_pid=${!} |
131 |
+ |
132 |
+ # wait for main process to terminate, either of its own |
133 |
+ # or by signal from subprocess |
134 |
+ wait ${main_pid} |
135 |
+ local main_ret=${?} |
136 |
+ |
137 |
+ # wait for subprocess to terminate, killing it if necessary. |
138 |
+ # if main fetch finished before it, there's no point in keeping |
139 |
+ # it alive. if main fetch was killed by it, it's done anyway |
140 |
+ kill ${sub_pid} &>/dev/null |
141 |
+ wait ${sub_pid} |
142 |
+ |
143 |
+ # now see if subprocess wanted to tell us something... |
144 |
+ if [[ -f ${T}/git-r3_want_shallow ]]; then |
145 |
+ rm "${T}"/git-r3_want_shallow || die |
146 |
+ |
147 |
+ # if fetch finished already (wasn't killed), ignore it |
148 |
+ [[ ${main_ret} -eq 0 ]] && return 0 |
149 |
+ |
150 |
+ # otherwise, restart as shallow fetch |
151 |
+ einfo "Restarting fetch using --depth 1 to save bandwidth ..." |
152 |
+ local cmd=( git fetch --progress --depth 1 "${@}" ) |
153 |
+ echo "${cmd[@]}" >&2 |
154 |
+ "${cmd[@]}" |
155 |
+ main_ret=${?} |
156 |
+ fi |
157 |
+ |
158 |
+ return ${main_ret} |
159 |
+} |
160 |
+ |
161 |
# @FUNCTION: git-r3_fetch |
162 |
# @USAGE: [<repo-uri> [<remote-ref> [<local-id>]]] |
163 |
# @DESCRIPTION: |
164 |
@@ -325,9 +412,12 @@ git-r3_fetch() { |
165 |
# to the first fetch in the repo. passing '--depth' |
166 |
# to further requests usually results in more data being |
167 |
# downloaded than without it. |
168 |
- # 3. in any other case, we just do plain 'git fetch' and let |
169 |
- # git to do its best (on top of shallow or non-shallow repo). |
170 |
+ # 3. if we update a shallow clone, we try without '--depth' |
171 |
+ # first since that usually transfers less data. however, |
172 |
+ # we use _git-r3_smart_fetch that can switch into '--depth 1' |
173 |
+ # if that looks beneficial. |
174 |
|
175 |
+ local fetch_command=( git fetch ) |
176 |
if [[ ${EGIT_NONSHALLOW} ]]; then |
177 |
if [[ -f ${GIT_DIR}/shallow ]]; then |
178 |
ref_param+=( --unshallow ) |
179 |
@@ -336,6 +426,8 @@ git-r3_fetch() { |
180 |
# 'git show-ref --heads' returns 1 when there are no branches |
181 |
if ! git show-ref --heads -q; then |
182 |
ref_param+=( --depth 1 ) |
183 |
+ else |
184 |
+ fetch_command=( _git-r3_smart_fetch ) |
185 |
fi |
186 |
fi |
187 |
|
188 |
@@ -354,7 +446,7 @@ git-r3_fetch() { |
189 |
# if ${remote_ref} is branch or tag, ${ref[@]} will contain |
190 |
# the respective commit id. otherwise, it will be an empty |
191 |
# array, so the following won't evaluate to a parameter. |
192 |
- set -- git fetch --no-tags "${r}" "${ref_param[@]}" |
193 |
+ set -- "${fetch_command[@]}" --no-tags "${r}" "${ref_param[@]}" |
194 |
echo "${@}" >&2 |
195 |
if "${@}"; then |
196 |
if [[ ! ${is_branch} ]]; then |
197 |
-- |
198 |
1.8.3.2 |