1 |
From: Daniel Robbins <drobbins@××××××.org> |
2 |
|
3 |
The content-hash layout is identical to the filename-hash layout, |
4 |
except for these two differences: |
5 |
|
6 |
1) A content digest is used instead of a filename digest. |
7 |
2) The final element of the path returned from the get_path method |
8 |
corresponds to the complete content digest. The path is a |
9 |
function of the content digest alone. |
10 |
|
11 |
Motivations to use the content-hash layout instead of the |
12 |
filename-hash layout may include: |
13 |
|
14 |
1) Since the file path is independent of the file name, file |
15 |
name collisions cannot occur. This makes the content-hash |
16 |
layout suitable for storage of multiple types of files (not |
17 |
only Gentoo distfiles). For example, it can be used to store |
18 |
distfiles for multiple Linux distros within the same tree, |
19 |
with automatic deduplication based on content digest. This |
20 |
layout can be used to store and distribute practically anything |
21 |
(including binary packages for example). |
22 |
|
23 |
2) Allows multiple revisions for the same distfile name. An |
24 |
existing distfile can be updated, and if a user still has an |
25 |
older copy of an ebuild repository (or an overlay), then a user |
26 |
can successfully fetch a desired revision of the distfile as |
27 |
long as it has not been purged from the mirror. |
28 |
|
29 |
3) File integrity data is integrated into the layout itself, |
30 |
making it very simple to verify the integrity of any file that |
31 |
it contains. The only tool required is an implementation of |
32 |
the chosen hash algorithm. |
33 |
|
34 |
Bug: https://bugs.gentoo.org/756778 |
35 |
Signed-off-by: Zac Medico <zmedico@g.o> |
36 |
--- |
37 |
lib/portage/package/ebuild/fetch.py | 160 +++++++++++++++++++++++-- |
38 |
lib/portage/tests/ebuild/test_fetch.py | 40 ++++++- |
39 |
2 files changed, 184 insertions(+), 16 deletions(-) |
40 |
|
41 |
diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py |
42 |
index e0fecaf23..7d2ef93bf 100644 |
43 |
--- a/lib/portage/package/ebuild/fetch.py |
44 |
+++ b/lib/portage/package/ebuild/fetch.py |
45 |
@@ -344,6 +344,31 @@ _size_suffix_map = { |
46 |
} |
47 |
|
48 |
|
49 |
+class DistfileName(str): |
50 |
+ def __new__(cls, s, digests=None): |
51 |
+ return str.__new__(cls, s) |
52 |
+ |
53 |
+ def __init__(self, s, digests=None): |
54 |
+ super().__init__() |
55 |
+ self.digests = {} if digests is None else digests |
56 |
+ |
57 |
+ def digests_equal(self, other): |
58 |
+ """ |
59 |
+ Test if digests compare equal to those of another instance. |
60 |
+ """ |
61 |
+ if not isinstance(other, DistfileName): |
62 |
+ return False |
63 |
+ matches = [] |
64 |
+ for algo, digest in self.digests.items(): |
65 |
+ other_digest = other.digests.get(algo) |
66 |
+ if other_digest is not None: |
67 |
+ if other_digest == digest: |
68 |
+ matches.append(algo) |
69 |
+ else: |
70 |
+ return False |
71 |
+ return bool(matches) |
72 |
+ |
73 |
+ |
74 |
class FlatLayout: |
75 |
def get_path(self, filename): |
76 |
return filename |
77 |
@@ -413,6 +438,90 @@ class FilenameHashLayout: |
78 |
return False |
79 |
|
80 |
|
81 |
+class ContentHashLayout(FilenameHashLayout): |
82 |
+ """ |
83 |
+ The content-hash layout is identical to the filename-hash layout, |
84 |
+ except for these two differences: |
85 |
+ |
86 |
+ 1) A content digest is used instead of a filename digest. |
87 |
+ 2) The final element of the path returned from the get_path method |
88 |
+ corresponds to the complete content digest. The path is a |
89 |
+ function of the content digest alone. |
90 |
+ |
91 |
+ Motivations to use the content-hash layout instead of the |
92 |
+ filename-hash layout may include: |
93 |
+ |
94 |
+ 1) Since the file path is independent of the file name, file |
95 |
+ name collisions cannot occur. This makes the content-hash |
96 |
+ layout suitable for storage of multiple types of files (not |
97 |
+ only Gentoo distfiles). For example, it can be used to store |
98 |
+ distfiles for multiple Linux distros within the same tree, |
99 |
+ with automatic deduplication based on content digest. This |
100 |
+ layout can be used to store and distribute practically anything |
101 |
+ (including binary packages for example). |
102 |
+ |
103 |
+ 2) Allows multiple revisions for the same distfile name. An |
104 |
+ existing distfile can be updated, and if a user still has an |
105 |
+ older copy of an ebuild repository (or an overlay), then a user |
106 |
+ can successfully fetch a desired revision of the distfile as |
107 |
+ long as it has not been purged from the mirror. |
108 |
+ |
109 |
+ 3) File integrity data is integrated into the layout itself, |
110 |
+ making it very simple to verify the integrity of any file that |
111 |
+ it contains. The only tool required is an implementation of |
112 |
+ the chosen hash algorithm. |
113 |
+ """ |
114 |
+ |
115 |
+ def get_path(self, filename): |
116 |
+ """ |
117 |
+ For content-hash, the path is a function of the content digest alone. |
118 |
+ The final element of the path returned from the get_path method |
119 |
+ corresponds to the complete content digest. |
120 |
+ """ |
121 |
+ fnhash = remaining = filename.digests[self.algo] |
122 |
+ ret = "" |
123 |
+ for c in self.cutoffs: |
124 |
+ assert c % 4 == 0 |
125 |
+ c = c // 4 |
126 |
+ ret += remaining[:c] + "/" |
127 |
+ remaining = remaining[c:] |
128 |
+ return ret + fnhash |
129 |
+ |
130 |
+ def get_filenames(self, distdir): |
131 |
+ """ |
132 |
+ Yields DistfileName instances each with filename corresponding |
133 |
+ to a digest value for self.algo. These can be compared to other |
134 |
+ DistfileName instances with their digests_equal method. |
135 |
+ """ |
136 |
+ for filename in super(ContentHashLayout, self).get_filenames(distdir): |
137 |
+ yield DistfileName( |
138 |
+ filename, digests=dict([(self.algo, os.path.basename(filename))]) |
139 |
+ ) |
140 |
+ |
141 |
+ @staticmethod |
142 |
+ def verify_args(args, filename=None): |
143 |
+ """ |
144 |
+ If the filename argument is given, then supported hash |
145 |
+ algorithms are constrained by digests available in the filename |
146 |
+ digests attribute. |
147 |
+ |
148 |
+ @param args: layout.conf entry args |
149 |
+ @param filename: filename with digests attribute |
150 |
+ @return: True if args are valid for available digest algorithms, |
151 |
+ and False otherwise |
152 |
+ """ |
153 |
+ if len(args) != 3: |
154 |
+ return False |
155 |
+ if filename is None: |
156 |
+ supported_algos = get_valid_checksum_keys() |
157 |
+ else: |
158 |
+ supported_algos = filename.digests |
159 |
+ algo = args[1].upper() |
160 |
+ if algo not in supported_algos: |
161 |
+ return False |
162 |
+ return FilenameHashLayout.verify_args(args) |
163 |
+ |
164 |
+ |
165 |
class MirrorLayoutConfig: |
166 |
""" |
167 |
Class to read layout.conf from a mirror. |
168 |
@@ -439,20 +548,41 @@ class MirrorLayoutConfig: |
169 |
self.structure = data |
170 |
|
171 |
@staticmethod |
172 |
- def validate_structure(val): |
173 |
+ def validate_structure(val, filename=None): |
174 |
+ """ |
175 |
+ If the filename argument is given, then supported hash |
176 |
+ algorithms are constrained by digests available in the filename |
177 |
+ digests attribute. |
178 |
+ |
179 |
+ @param val: layout.conf entry args |
180 |
+ @param filename: filename with digests attribute |
181 |
+ @return: True if args are valid for available digest algorithms, |
182 |
+ and False otherwise |
183 |
+ """ |
184 |
if val[0] == 'flat': |
185 |
return FlatLayout.verify_args(val) |
186 |
- if val[0] == 'filename-hash': |
187 |
+ elif val[0] == 'filename-hash': |
188 |
return FilenameHashLayout.verify_args(val) |
189 |
+ elif val[0] == 'content-hash': |
190 |
+ return ContentHashLayout.verify_args(val, filename=filename) |
191 |
return False |
192 |
|
193 |
- def get_best_supported_layout(self): |
194 |
+ def get_best_supported_layout(self, filename=None): |
195 |
+ """ |
196 |
+ If the filename argument is given, then acceptable hash |
197 |
+ algorithms are constrained by digests available in the filename |
198 |
+ digests attribute. |
199 |
+ |
200 |
+ @param filename: filename with digests attribute |
201 |
+ """ |
202 |
for val in self.structure: |
203 |
- if self.validate_structure(val): |
204 |
+ if self.validate_structure(val, filename=filename): |
205 |
if val[0] == 'flat': |
206 |
return FlatLayout(*val[1:]) |
207 |
- if val[0] == 'filename-hash': |
208 |
+ elif val[0] == 'filename-hash': |
209 |
return FilenameHashLayout(*val[1:]) |
210 |
+ elif val[0] == 'content-hash': |
211 |
+ return ContentHashLayout(*val[1:]) |
212 |
# fallback |
213 |
return FlatLayout() |
214 |
|
215 |
@@ -465,6 +595,8 @@ class MirrorLayoutConfig: |
216 |
ret.append(FlatLayout(*val[1:])) |
217 |
elif val[0] == 'filename-hash': |
218 |
ret.append(FilenameHashLayout(*val[1:])) |
219 |
+ elif val[0] == 'content-hash': |
220 |
+ ret.append(ContentHashLayout(*val[1:])) |
221 |
if not ret: |
222 |
ret.append(FlatLayout()) |
223 |
return ret |
224 |
@@ -515,7 +647,7 @@ def get_mirror_url(mirror_url, filename, mysettings, cache_path=None): |
225 |
|
226 |
# For some protocols, urlquote is required for correct behavior, |
227 |
# and it must not be used for other protocols like rsync and sftp. |
228 |
- path = mirror_conf.get_best_supported_layout().get_path(filename) |
229 |
+ path = mirror_conf.get_best_supported_layout(filename=filename).get_path(filename) |
230 |
if urlparse(mirror_url).scheme in ('ftp', 'http', 'https'): |
231 |
path = urlquote(path) |
232 |
return mirror_url + "/distfiles/" + path |
233 |
@@ -722,15 +854,23 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0, |
234 |
if hasattr(myuris, 'items'): |
235 |
for myfile, uri_set in myuris.items(): |
236 |
for myuri in uri_set: |
237 |
- file_uri_tuples.append((myfile, myuri)) |
238 |
+ file_uri_tuples.append( |
239 |
+ (DistfileName(myfile, digests=mydigests.get(myfile)), myuri) |
240 |
+ ) |
241 |
if not uri_set: |
242 |
- file_uri_tuples.append((myfile, None)) |
243 |
+ file_uri_tuples.append( |
244 |
+ (DistfileName(myfile, digests=mydigests.get(myfile)), None) |
245 |
+ ) |
246 |
else: |
247 |
for myuri in myuris: |
248 |
if urlparse(myuri).scheme: |
249 |
- file_uri_tuples.append((os.path.basename(myuri), myuri)) |
250 |
+ file_uri_tuples.append( |
251 |
+ (DistfileName(os.path.basename(myuri), digests=mydigests.get(os.path.basename(myuri))), myuri) |
252 |
+ ) |
253 |
else: |
254 |
- file_uri_tuples.append((os.path.basename(myuri), None)) |
255 |
+ file_uri_tuples.append( |
256 |
+ (DistfileName(os.path.basename(myuri), digests=mydigests.get(os.path.basename(myuri))), None) |
257 |
+ ) |
258 |
|
259 |
filedict = OrderedDict() |
260 |
primaryuri_dict = {} |
261 |
diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py |
262 |
index c5ea8253b..73ae45ebf 100644 |
263 |
--- a/lib/portage/tests/ebuild/test_fetch.py |
264 |
+++ b/lib/portage/tests/ebuild/test_fetch.py |
265 |
@@ -7,7 +7,8 @@ import tempfile |
266 |
|
267 |
import portage |
268 |
from portage import shutil, os |
269 |
-from portage.const import BASH_BINARY, PORTAGE_PYM_PATH |
270 |
+from portage.checksum import checksum_str |
271 |
+from portage.const import BASH_BINARY, MANIFEST2_HASH_DEFAULTS, PORTAGE_PYM_PATH |
272 |
from portage.tests import TestCase |
273 |
from portage.tests.resolver.ResolverPlayground import ResolverPlayground |
274 |
from portage.tests.util.test_socks5 import AsyncHTTPServer |
275 |
@@ -18,8 +19,15 @@ from portage.util._async.SchedulerInterface import SchedulerInterface |
276 |
from portage.util._eventloop.global_event_loop import global_event_loop |
277 |
from portage.package.ebuild.config import config |
278 |
from portage.package.ebuild.digestgen import digestgen |
279 |
-from portage.package.ebuild.fetch import (_download_suffix, fetch, FlatLayout, |
280 |
- FilenameHashLayout, MirrorLayoutConfig) |
281 |
+from portage.package.ebuild.fetch import ( |
282 |
+ ContentHashLayout, |
283 |
+ DistfileName, |
284 |
+ _download_suffix, |
285 |
+ fetch, |
286 |
+ FilenameHashLayout, |
287 |
+ FlatLayout, |
288 |
+ MirrorLayoutConfig, |
289 |
+) |
290 |
from _emerge.EbuildFetcher import EbuildFetcher |
291 |
from _emerge.Package import Package |
292 |
|
293 |
@@ -102,6 +110,11 @@ class EbuildFetchTestCase(TestCase): |
294 |
"1=filename-hash BLAKE2B 8", |
295 |
"0=flat", |
296 |
), |
297 |
+ ( |
298 |
+ "[structure]", |
299 |
+ "0=content-hash SHA512 8:8:8", |
300 |
+ "1=flat", |
301 |
+ ), |
302 |
) |
303 |
|
304 |
fetchcommand = portage.util.shlex_split(playground.settings["FETCHCOMMAND"]) |
305 |
@@ -142,9 +155,14 @@ class EbuildFetchTestCase(TestCase): |
306 |
content["/distfiles/layout.conf"] = layout_data.encode("utf8") |
307 |
|
308 |
for k, v in distfiles.items(): |
309 |
+ filename = DistfileName( |
310 |
+ k, |
311 |
+ digests=dict((algo, checksum_str(v, hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS), |
312 |
+ ) |
313 |
+ |
314 |
# mirror path |
315 |
for layout in layouts: |
316 |
- content["/distfiles/" + layout.get_path(k)] = v |
317 |
+ content["/distfiles/" + layout.get_path(filename)] = v |
318 |
# upstream path |
319 |
content["/distfiles/{}.txt".format(k)] = v |
320 |
|
321 |
@@ -499,14 +517,18 @@ class EbuildFetchTestCase(TestCase): |
322 |
io.StringIO(conf)) |
323 |
|
324 |
def test_filename_hash_layout_get_filenames(self): |
325 |
+ filename = DistfileName( |
326 |
+ 'foo-1.tar.gz', |
327 |
+ digests=dict((algo, checksum_str(b'', hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS), |
328 |
+ ) |
329 |
layouts = ( |
330 |
FlatLayout(), |
331 |
FilenameHashLayout('SHA1', '4'), |
332 |
FilenameHashLayout('SHA1', '8'), |
333 |
FilenameHashLayout('SHA1', '8:16'), |
334 |
FilenameHashLayout('SHA1', '8:16:24'), |
335 |
+ ContentHashLayout('SHA512', '8:8:8'), |
336 |
) |
337 |
- filename = 'foo-1.tar.gz' |
338 |
|
339 |
for layout in layouts: |
340 |
distdir = tempfile.mkdtemp() |
341 |
@@ -520,6 +542,12 @@ class EbuildFetchTestCase(TestCase): |
342 |
with open(path, 'wb') as f: |
343 |
pass |
344 |
|
345 |
- self.assertEqual([filename], list(layout.get_filenames(distdir))) |
346 |
+ file_list = list(layout.get_filenames(distdir)) |
347 |
+ self.assertTrue(len(file_list) > 0) |
348 |
+ for filename_result in file_list: |
349 |
+ if isinstance(filename_result, DistfileName): |
350 |
+ self.assertTrue(filename_result.digests_equal(filename)) |
351 |
+ else: |
352 |
+ self.assertEqual(filename_result, str(filename)) |
353 |
finally: |
354 |
shutil.rmtree(distdir) |
355 |
-- |
356 |
2.26.2 |