Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Daniel Robbins <drobbins@××××××.org>, Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH] Add content-hash distfiles layout (bug 756778)
Date: Sun, 21 Feb 2021 13:51:03
Message-Id: 20210221135027.467678-1-zmedico@gentoo.org
1 From: Daniel Robbins <drobbins@××××××.org>
2
3 The content-hash layout is identical to the filename-hash layout,
4 except for these two differences:
5
6 1) A content digest is used instead of a filename digest.
7 2) The final element of the path returned from the get_path method
8 corresponds to the complete content digest. The path is a
9 function of the content digest alone.
10
11 Motivations to use the content-hash layout instead of the
12 filename-hash layout may include:
13
14 1) Since the file path is independent of the file name, file
15 name collisions cannot occur. This makes the content-hash
16 layout suitable for storage of multiple types of files (not
17 only gentoo distfiles). For example, it can be used to store
18 distfiles for multiple linux distros within the same tree,
19 with automatic deduplication based on content digest. This
20 layout can be used to store and distribute practically anything
21 (including binary packages for example).
22
23 2) Allows multiple revisions for the same distfile name. An
24 existing distfile can be updated, and if a user still has an
25 older copy of an ebuild repository (or an overlay), then a user
26 can successfully fetch a desired revision of the distfile as
27 long as it has not been purged from the mirror.
28
29 3) File integrity data is integrated into the layout itself,
30 making it very simple to verify the integrity of any file that
31 it contains. The only tool required is an implementation of
32 the chosen hash algorithm.
33
34 Bug: https://bugs.gentoo.org/756778
35 Signed-off-by: Zac Medico <zmedico@g.o>
36 ---
37 lib/portage/package/ebuild/fetch.py | 160 +++++++++++++++++++++++--
38 lib/portage/tests/ebuild/test_fetch.py | 40 ++++++-
39 2 files changed, 184 insertions(+), 16 deletions(-)
40
41 diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
42 index e0fecaf23..7d2ef93bf 100644
43 --- a/lib/portage/package/ebuild/fetch.py
44 +++ b/lib/portage/package/ebuild/fetch.py
45 @@ -344,6 +344,31 @@ _size_suffix_map = {
46 }
47
48
49 +class DistfileName(str):
50 + def __new__(cls, s, digests=None):
51 + return str.__new__(cls, s)
52 +
53 + def __init__(self, s, digests=None):
54 + super().__init__()
55 + self.digests = {} if digests is None else digests
56 +
57 + def digests_equal(self, other):
58 + """
59 + Test if digests compare equal to those of another instance.
60 + """
61 + if not isinstance(other, DistfileName):
62 + return False
63 + matches = []
64 + for algo, digest in self.digests.items():
65 + other_digest = other.digests.get(algo)
66 + if other_digest is not None:
67 + if other_digest == digest:
68 + matches.append(algo)
69 + else:
70 + return False
71 + return bool(matches)
72 +
73 +
74 class FlatLayout:
75 def get_path(self, filename):
76 return filename
77 @@ -413,6 +438,90 @@ class FilenameHashLayout:
78 return False
79
80
81 +class ContentHashLayout(FilenameHashLayout):
82 + """
83 + The content-hash layout is identical to the filename-hash layout,
84 + except for these two differences:
85 +
86 + 1) A content digest is used instead of a filename digest.
87 + 2) The final element of the path returned from the get_path method
88 + corresponds to the complete content digest. The path is a
89 + function of the content digest alone.
90 +
91 + Motivations to use the content-hash layout instead of the
92 + filename-hash layout may include:
93 +
94 + 1) Since the file path is independent of the file name, file
95 + name collisions cannot occur. This makes the content-hash
96 + layout suitable for storage of multiple types of files (not
97 + only gentoo distfiles). For example, it can be used to store
98 + distfiles for multiple linux distros within the same tree,
99 + with automatic deduplication based on content digest. This
100 + layout can be used to store and distribute practically anything
101 + (including binary packages for example).
102 +
103 + 2) Allows multiple revisions for the same distfile name. An
104 + existing distfile can be updated, and if a user still has an
105 + older copy of an ebuild repository (or an overlay), then a user
106 + can successfully fetch a desired revision of the distfile as
107 + long as it has not been purged from the mirror.
108 +
109 + 3) File integrity data is integrated into the layout itself,
110 + making it very simple to verify the integrity of any file that
111 + it contains. The only tool required is an implementation of
112 + the chosen hash algorithm.
113 + """
114 +
115 + def get_path(self, filename):
116 + """
117 + For content-hash, the path is a function of the content digest alone.
118 + The final element of the path returned from the get_path method
119 + corresponds to the complete content digest.
120 + """
121 + fnhash = remaining = filename.digests[self.algo]
122 + ret = ""
123 + for c in self.cutoffs:
124 + assert c % 4 == 0
125 + c = c // 4
126 + ret += remaining[:c] + "/"
127 + remaining = remaining[c:]
128 + return ret + fnhash
129 +
130 + def get_filenames(self, distdir):
131 + """
132 + Yields DistfileName instances each with filename corresponding
133 + to a digest value for self.algo. These can be compared to other
134 + DistfileName instances with their digests_equal method.
135 + """
136 + for filename in super(ContentHashLayout, self).get_filenames(distdir):
137 + yield DistfileName(
138 + filename, digests=dict([(self.algo, os.path.basename(filename))])
139 + )
140 +
141 + @staticmethod
142 + def verify_args(args, filename=None):
143 + """
144 + If the filename argument is given, then supported hash
145 + algorithms are constrained by digests available in the filename
146 + digests attribute.
147 +
148 + @param args: layout.conf entry args
149 + @param filename: filename with digests attribute
150 + @return: True if args are valid for available digest algorithms,
151 + and False otherwise
152 + """
153 + if len(args) != 3:
154 + return False
155 + if filename is None:
156 + supported_algos = get_valid_checksum_keys()
157 + else:
158 + supported_algos = filename.digests
159 + algo = args[1].upper()
160 + if algo not in supported_algos:
161 + return False
162 + return FilenameHashLayout.verify_args(args)
163 +
164 +
165 class MirrorLayoutConfig:
166 """
167 Class to read layout.conf from a mirror.
168 @@ -439,20 +548,41 @@ class MirrorLayoutConfig:
169 self.structure = data
170
171 @staticmethod
172 - def validate_structure(val):
173 + def validate_structure(val, filename=None):
174 + """
175 + If the filename argument is given, then supported hash
176 + algorithms are constrained by digests available in the filename
177 + digests attribute.
178 +
179 + @param val: layout.conf entry args
180 + @param filename: filename with digests attribute
181 + @return: True if args are valid for available digest algorithms,
182 + and False otherwise
183 + """
184 if val[0] == 'flat':
185 return FlatLayout.verify_args(val)
186 - if val[0] == 'filename-hash':
187 + elif val[0] == 'filename-hash':
188 return FilenameHashLayout.verify_args(val)
189 + elif val[0] == 'content-hash':
190 + return ContentHashLayout.verify_args(val, filename=filename)
191 return False
192
193 - def get_best_supported_layout(self):
194 + def get_best_supported_layout(self, filename=None):
195 + """
196 + If the filename argument is given, then acceptable hash
197 + algorithms are constrained by digests available in the filename
198 + digests attribute.
199 +
200 + @param filename: filename with digests attribute
201 + """
202 for val in self.structure:
203 - if self.validate_structure(val):
204 + if self.validate_structure(val, filename=filename):
205 if val[0] == 'flat':
206 return FlatLayout(*val[1:])
207 - if val[0] == 'filename-hash':
208 + elif val[0] == 'filename-hash':
209 return FilenameHashLayout(*val[1:])
210 + elif val[0] == 'content-hash':
211 + return ContentHashLayout(*val[1:])
212 # fallback
213 return FlatLayout()
214
215 @@ -465,6 +595,8 @@ class MirrorLayoutConfig:
216 ret.append(FlatLayout(*val[1:]))
217 elif val[0] == 'filename-hash':
218 ret.append(FilenameHashLayout(*val[1:]))
219 + elif val[0] == 'content-hash':
220 + ret.append(ContentHashLayout(*val[1:]))
221 if not ret:
222 ret.append(FlatLayout())
223 return ret
224 @@ -515,7 +647,7 @@ def get_mirror_url(mirror_url, filename, mysettings, cache_path=None):
225
226 # For some protocols, urlquote is required for correct behavior,
227 # and it must not be used for other protocols like rsync and sftp.
228 - path = mirror_conf.get_best_supported_layout().get_path(filename)
229 + path = mirror_conf.get_best_supported_layout(filename=filename).get_path(filename)
230 if urlparse(mirror_url).scheme in ('ftp', 'http', 'https'):
231 path = urlquote(path)
232 return mirror_url + "/distfiles/" + path
233 @@ -722,15 +854,23 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
234 if hasattr(myuris, 'items'):
235 for myfile, uri_set in myuris.items():
236 for myuri in uri_set:
237 - file_uri_tuples.append((myfile, myuri))
238 + file_uri_tuples.append(
239 + (DistfileName(myfile, digests=mydigests.get(myfile)), myuri)
240 + )
241 if not uri_set:
242 - file_uri_tuples.append((myfile, None))
243 + file_uri_tuples.append(
244 + (DistfileName(myfile, digests=mydigests.get(myfile)), None)
245 + )
246 else:
247 for myuri in myuris:
248 if urlparse(myuri).scheme:
249 - file_uri_tuples.append((os.path.basename(myuri), myuri))
250 + file_uri_tuples.append(
251 + (DistfileName(os.path.basename(myuri), digests=mydigests.get(os.path.basename(myuri))), myuri)
252 + )
253 else:
254 - file_uri_tuples.append((os.path.basename(myuri), None))
255 + file_uri_tuples.append(
256 + (DistfileName(os.path.basename(myuri), digests=mydigests.get(os.path.basename(myuri))), None)
257 + )
258
259 filedict = OrderedDict()
260 primaryuri_dict = {}
261 diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
262 index c5ea8253b..73ae45ebf 100644
263 --- a/lib/portage/tests/ebuild/test_fetch.py
264 +++ b/lib/portage/tests/ebuild/test_fetch.py
265 @@ -7,7 +7,8 @@ import tempfile
266
267 import portage
268 from portage import shutil, os
269 -from portage.const import BASH_BINARY, PORTAGE_PYM_PATH
270 +from portage.checksum import checksum_str
271 +from portage.const import BASH_BINARY, MANIFEST2_HASH_DEFAULTS, PORTAGE_PYM_PATH
272 from portage.tests import TestCase
273 from portage.tests.resolver.ResolverPlayground import ResolverPlayground
274 from portage.tests.util.test_socks5 import AsyncHTTPServer
275 @@ -18,8 +19,15 @@ from portage.util._async.SchedulerInterface import SchedulerInterface
276 from portage.util._eventloop.global_event_loop import global_event_loop
277 from portage.package.ebuild.config import config
278 from portage.package.ebuild.digestgen import digestgen
279 -from portage.package.ebuild.fetch import (_download_suffix, fetch, FlatLayout,
280 - FilenameHashLayout, MirrorLayoutConfig)
281 +from portage.package.ebuild.fetch import (
282 + ContentHashLayout,
283 + DistfileName,
284 + _download_suffix,
285 + fetch,
286 + FilenameHashLayout,
287 + FlatLayout,
288 + MirrorLayoutConfig,
289 +)
290 from _emerge.EbuildFetcher import EbuildFetcher
291 from _emerge.Package import Package
292
293 @@ -102,6 +110,11 @@ class EbuildFetchTestCase(TestCase):
294 "1=filename-hash BLAKE2B 8",
295 "0=flat",
296 ),
297 + (
298 + "[structure]",
299 + "0=content-hash SHA512 8:8:8",
300 + "1=flat",
301 + ),
302 )
303
304 fetchcommand = portage.util.shlex_split(playground.settings["FETCHCOMMAND"])
305 @@ -142,9 +155,14 @@ class EbuildFetchTestCase(TestCase):
306 content["/distfiles/layout.conf"] = layout_data.encode("utf8")
307
308 for k, v in distfiles.items():
309 + filename = DistfileName(
310 + k,
311 + digests=dict((algo, checksum_str(v, hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS),
312 + )
313 +
314 # mirror path
315 for layout in layouts:
316 - content["/distfiles/" + layout.get_path(k)] = v
317 + content["/distfiles/" + layout.get_path(filename)] = v
318 # upstream path
319 content["/distfiles/{}.txt".format(k)] = v
320
321 @@ -499,14 +517,18 @@ class EbuildFetchTestCase(TestCase):
322 io.StringIO(conf))
323
324 def test_filename_hash_layout_get_filenames(self):
325 + filename = DistfileName(
326 + 'foo-1.tar.gz',
327 + digests=dict((algo, checksum_str(b'', hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS),
328 + )
329 layouts = (
330 FlatLayout(),
331 FilenameHashLayout('SHA1', '4'),
332 FilenameHashLayout('SHA1', '8'),
333 FilenameHashLayout('SHA1', '8:16'),
334 FilenameHashLayout('SHA1', '8:16:24'),
335 + ContentHashLayout('SHA512', '8:8:8'),
336 )
337 - filename = 'foo-1.tar.gz'
338
339 for layout in layouts:
340 distdir = tempfile.mkdtemp()
341 @@ -520,6 +542,12 @@ class EbuildFetchTestCase(TestCase):
342 with open(path, 'wb') as f:
343 pass
344
345 - self.assertEqual([filename], list(layout.get_filenames(distdir)))
346 + file_list = list(layout.get_filenames(distdir))
347 + self.assertTrue(len(file_list) > 0)
348 + for filename_result in file_list:
349 + if isinstance(filename_result, DistfileName):
350 + self.assertTrue(filename_result.digests_equal(filename))
351 + else:
352 + self.assertEqual(filename_result, str(filename))
353 finally:
354 shutil.rmtree(distdir)
355 --
356 2.26.2