Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH v3] emirrordist: add --content-db option required for content-hash layout (bug 756778)
Date: Sat, 27 Feb 2021 02:05:29
Message-Id: 20210227020511.1201641-1-zmedico@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH v2] emirrordist: add --content-db option required for content-hash layout (bug 756778) by Zac Medico
1 Add a --content-db option which is required for the content-hash
2 layout because its file listings return content digests instead of
3 distfile names.
4
5 The content db serves to translate content digests to distfile
6 names, and distfile names to content digests. All keys have a
7 prefix separated by a colon. For digest keys, the prefix is the
8 hash algorithm name. For filename keys, the prefix is "filename".
9
10 The value associated with a digest key is a set of file names. The
11 value associated with a distfile key is a set of content revisions.
12 Each content revision is expressed as a dictionary of digests which
13 is suitable for construction of a DistfileName instance.
14
15 Bug: https://bugs.gentoo.org/756778
16 Signed-off-by: Zac Medico <zmedico@g.o>
17 ---
18 [PATCH v3] changed the value associated with a digest key to a set
19 of file names, and fixed ContentDB.remove to preserve independent
20 references to identical content (like removing one of multiple
21 hardlinks).
22
23 lib/portage/_emirrordist/Config.py | 8 +-
24 lib/portage/_emirrordist/ContentDB.py | 178 +++++++++++++++++++
25 lib/portage/_emirrordist/DeletionIterator.py | 25 ++-
26 lib/portage/_emirrordist/DeletionTask.py | 8 +
27 lib/portage/_emirrordist/FetchTask.py | 5 +-
28 lib/portage/_emirrordist/main.py | 15 +-
29 lib/portage/package/ebuild/fetch.py | 8 +-
30 lib/portage/tests/ebuild/test_fetch.py | 14 ++
31 man/emirrordist.1 | 6 +-
32 9 files changed, 256 insertions(+), 11 deletions(-)
33 create mode 100644 lib/portage/_emirrordist/ContentDB.py
34
35 diff --git a/lib/portage/_emirrordist/Config.py b/lib/portage/_emirrordist/Config.py
36 index 4bee4f45e..cfe944040 100644
37 --- a/lib/portage/_emirrordist/Config.py
38 +++ b/lib/portage/_emirrordist/Config.py
39 @@ -1,4 +1,4 @@
40 -# Copyright 2013-2020 Gentoo Authors
41 +# Copyright 2013-2021 Gentoo Authors
42 # Distributed under the terms of the GNU General Public License v2
43
44 import copy
45 @@ -10,6 +10,7 @@ import time
46 from portage import os
47 from portage.package.ebuild.fetch import MirrorLayoutConfig
48 from portage.util import grabdict, grablines
49 +from .ContentDB import ContentDB
50
51 class Config:
52 def __init__(self, options, portdb, event_loop):
53 @@ -65,6 +66,11 @@ class Config:
54 self.distfiles_db = self._open_shelve(
55 options.distfiles_db, 'distfiles')
56
57 + self.content_db = None
58 + if options.content_db is not None:
59 + self.content_db = ContentDB(self._open_shelve(
60 + options.content_db, 'content'))
61 +
62 self.deletion_db = None
63 if options.deletion_db is not None:
64 self.deletion_db = self._open_shelve(
65 diff --git a/lib/portage/_emirrordist/ContentDB.py b/lib/portage/_emirrordist/ContentDB.py
66 new file mode 100644
67 index 000000000..7084cecff
68 --- /dev/null
69 +++ b/lib/portage/_emirrordist/ContentDB.py
70 @@ -0,0 +1,178 @@
71 +# Copyright 2021 Gentoo Authors
72 +# Distributed under the terms of the GNU General Public License v2
73 +
74 +import logging
75 +import operator
76 +import shelve
77 +import typing
78 +
79 +from portage.package.ebuild.fetch import DistfileName
80 +
81 +
82 +class ContentDB:
83 + """
84 + The content db serves to translate content digests to distfiles
85 + names, and distfiles names to content digests. All keys have a
86 + prefix separated by a colon. For digest keys, the prefix is the
87 + hash algorithm name. For filename keys, the prefix is "filename".
88 +
89 + The value associated with a digest key is a set of file names. The
90 + value associated with a distfile key is a set of content revisions.
91 + Each content revision is expressed as a dictionary of digests which
92 + is suitable for construction of a DistfileName instance.
93 + """
94 +
95 + def __init__(self, shelve_instance: shelve.Shelf):
96 + self._shelve = shelve_instance
97 +
98 + def add(self, filename: DistfileName):
99 + """
100 + Add file name and digests.
101 +
102 + @param filename: file name with digests attribute
103 + """
104 + distfile_str = str(filename)
105 + distfile_key = "filename:{}".format(distfile_str)
106 + for k, v in filename.digests.items():
107 + if k != "size":
108 + digest_key = "{}:{}".format(k, v).lower()
109 + try:
110 + digest_files = self._shelve[digest_key]
111 + except KeyError:
112 + digest_files = set()
113 + digest_files.add(distfile_str)
114 + self._shelve[digest_key] = digest_files
115 + try:
116 + content_revisions = self._shelve[distfile_key]
117 + except KeyError:
118 + content_revisions = set()
119 +
120 + revision_key = tuple(
121 + sorted(
122 + (
123 + (algo.lower(), filename.digests[algo].lower())
124 + for algo in filename.digests
125 + if algo != "size"
126 + ),
127 + key=operator.itemgetter(0),
128 + )
129 + )
130 + content_revisions.add(revision_key)
131 + self._shelve[distfile_key] = content_revisions
132 +
133 + def remove(self, filename: DistfileName):
134 + """
135 + Remove a file name from the database. If identical content is still
136 + referenced by one or more other file names, then those references
137 + are preserved (like removing one of many hardlinks).
138 +
139 + @param filename: file name with digests attribute
140 + """
141 + distfile_key = "filename:{}".format(filename)
142 + try:
143 + content_revisions = self._shelve[distfile_key]
144 + except KeyError:
145 + pass
146 + else:
147 + for revision_key in content_revisions:
148 + for k, v in revision_key:
149 + digest_key = "{}:{}".format(k, v)
150 + try:
151 + digest_files = self._shelve[digest_key]
152 + except KeyError:
153 + digest_files = set()
154 +
155 + try:
156 + digest_files.remove(filename)
157 + except KeyError:
158 + pass
159 + else:
160 + if digest_files:
161 + self._shelve[digest_key] = digest_files
162 + else:
163 + try:
164 + del self._shelve[digest_key]
165 + except KeyError:
166 + pass
167 +
168 + logging.debug(("drop '%s' from content db") % filename)
169 + try:
170 + del self._shelve[distfile_key]
171 + except KeyError:
172 + pass
173 +
174 + def get_filenames_translate(
175 + self, filename: typing.Union[str, DistfileName]
176 + ) -> typing.Generator[DistfileName, None, None]:
177 + """
178 + Translate distfiles content digests to distfile names.
179 + If filename is already a distfile name, then it will pass
180 + through unchanged.
181 +
182 + @param filename: A filename listed by layout get_filenames
183 + @return: The distfile name, translated from the corresponding
184 + content digest when necessary
185 + """
186 + if not isinstance(filename, DistfileName):
187 + filename = DistfileName(filename)
188 +
189 + # Match content digests with zero or more content revisions.
190 + matched_revisions = {}
191 +
192 + for k, v in filename.digests.items():
193 + digest_item = (k.lower(), v.lower())
194 + digest_key = "{}:{}".format(*digest_item)
195 + try:
196 + digest_files = self._shelve[digest_key]
197 + except KeyError:
198 + continue
199 +
200 + for distfile_str in digest_files:
201 + matched_revisions.setdefault(distfile_str, set())
202 + try:
203 + content_revisions = self._shelve["filename:{}".format(distfile_str)]
204 + except KeyError:
205 + pass
206 + else:
207 + for revision_key in content_revisions:
208 + if (
209 + digest_item in revision_key
210 + and revision_key not in matched_revisions[distfile_str]
211 + ):
212 + matched_revisions[distfile_str].add(revision_key)
213 + yield DistfileName(distfile_str, digests=dict(revision_key))
214 +
215 + if not any(matched_revisions.values()):
216 + # Since filename matched zero content revisions, allow
217 + # it to pass through unchanged (on the path toward deletion).
218 + yield filename
219 +
220 + def __len__(self):
221 + return len(self._shelve)
222 +
223 + def __contains__(self, k):
224 + return k in self._shelve
225 +
226 + def __iter__(self):
227 + return self._shelve.__iter__()
228 +
229 + def items(self):
230 + return self._shelve.items()
231 +
232 + def __setitem__(self, k, v):
233 + self._shelve[k] = v
234 +
235 + def __getitem__(self, k):
236 + return self._shelve[k]
237 +
238 + def __delitem__(self, k):
239 + del self._shelve[k]
240 +
241 + def get(self, k, *args):
242 + return self._shelve.get(k, *args)
243 +
244 + def close(self):
245 + self._shelve.close()
246 +
247 + def clear(self):
248 + self._shelve.clear()
249 diff --git a/lib/portage/_emirrordist/DeletionIterator.py b/lib/portage/_emirrordist/DeletionIterator.py
250 index 08985ed6c..ab4309f9a 100644
251 --- a/lib/portage/_emirrordist/DeletionIterator.py
252 +++ b/lib/portage/_emirrordist/DeletionIterator.py
253 @@ -1,10 +1,12 @@
254 -# Copyright 2013-2019 Gentoo Authors
255 +# Copyright 2013-2021 Gentoo Authors
256 # Distributed under the terms of the GNU General Public License v2
257
258 +import itertools
259 import logging
260 import stat
261
262 from portage import os
263 +from portage.package.ebuild.fetch import DistfileName
264 from .DeletionTask import DeletionTask
265
266 class DeletionIterator:
267 @@ -21,8 +23,25 @@ class DeletionIterator:
268 deletion_delay = self._config.options.deletion_delay
269 start_time = self._config.start_time
270 distfiles_set = set()
271 - for layout in self._config.layouts:
272 - distfiles_set.update(layout.get_filenames(distdir))
273 + distfiles_set.update(
274 + (
275 + filename
276 + if isinstance(filename, DistfileName)
277 + else DistfileName(filename)
278 + for filename in itertools.chain.from_iterable(
279 + layout.get_filenames(distdir) for layout in self._config.layouts
280 + )
281 + )
282 + if self._config.content_db is None
283 + else itertools.chain.from_iterable(
284 + (
285 + self._config.content_db.get_filenames_translate(filename)
286 + for filename in itertools.chain.from_iterable(
287 + layout.get_filenames(distdir) for layout in self._config.layouts
288 + )
289 + )
290 + )
291 + )
292 for filename in distfiles_set:
293 # require at least one successful stat()
294 exceptions = []
295 diff --git a/lib/portage/_emirrordist/DeletionTask.py b/lib/portage/_emirrordist/DeletionTask.py
296 index 5eb01d840..73493c5a1 100644
297 --- a/lib/portage/_emirrordist/DeletionTask.py
298 +++ b/lib/portage/_emirrordist/DeletionTask.py
299 @@ -5,6 +5,7 @@ import errno
300 import logging
301
302 from portage import os
303 +from portage.package.ebuild.fetch import ContentHashLayout
304 from portage.util._async.FileCopier import FileCopier
305 from _emerge.CompositeTask import CompositeTask
306
307 @@ -99,6 +100,10 @@ class DeletionTask(CompositeTask):
308 def _delete_links(self):
309 success = True
310 for layout in self.config.layouts:
311 + if isinstance(layout, ContentHashLayout) and not self.distfile.digests:
312 + logging.debug(("_delete_links: '%s' has "
313 + "no digests") % self.distfile)
314 + continue
315 distfile_path = os.path.join(
316 self.config.options.distfiles,
317 layout.get_path(self.distfile))
318 @@ -134,6 +139,9 @@ class DeletionTask(CompositeTask):
319 logging.debug(("drop '%s' from "
320 "distfiles db") % self.distfile)
321
322 + if self.config.content_db is not None:
323 + self.config.content_db.remove(self.distfile)
324 +
325 if self.config.deletion_db is not None:
326 try:
327 del self.config.deletion_db[self.distfile]
328 diff --git a/lib/portage/_emirrordist/FetchTask.py b/lib/portage/_emirrordist/FetchTask.py
329 index 997762082..5a48f91cd 100644
330 --- a/lib/portage/_emirrordist/FetchTask.py
331 +++ b/lib/portage/_emirrordist/FetchTask.py
332 @@ -1,4 +1,4 @@
333 -# Copyright 2013-2020 Gentoo Authors
334 +# Copyright 2013-2021 Gentoo Authors
335 # Distributed under the terms of the GNU General Public License v2
336
337 import collections
338 @@ -47,6 +47,9 @@ class FetchTask(CompositeTask):
339 # Convert _pkg_str to str in order to prevent pickle problems.
340 self.config.distfiles_db[self.distfile] = str(self.cpv)
341
342 + if self.config.content_db is not None:
343 + self.config.content_db.add(self.distfile)
344 +
345 if not self._have_needed_digests():
346 msg = "incomplete digests: %s" % " ".join(self.digests)
347 self.scheduler.output(msg, background=self.background,
348 diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py
349 index 8d00a05f5..2200ec715 100644
350 --- a/lib/portage/_emirrordist/main.py
351 +++ b/lib/portage/_emirrordist/main.py
352 @@ -1,4 +1,4 @@
353 -# Copyright 2013-2020 Gentoo Authors
354 +# Copyright 2013-2021 Gentoo Authors
355 # Distributed under the terms of the GNU General Public License v2
356
357 import argparse
358 @@ -7,6 +7,7 @@ import sys
359
360 import portage
361 from portage import os
362 +from portage.package.ebuild.fetch import ContentHashLayout
363 from portage.util import normalize_path, _recursive_file_list
364 from portage.util._async.run_main_scheduler import run_main_scheduler
365 from portage.util._async.SchedulerInterface import SchedulerInterface
366 @@ -151,6 +152,12 @@ common_options = (
367 "distfile belongs to",
368 "metavar" : "FILE"
369 },
370 + {
371 + "longopt" : "--content-db",
372 + "help" : "database file used to map content digests to"
373 + "distfiles names (required for content-hash layout)",
374 + "metavar" : "FILE"
375 + },
376 {
377 "longopt" : "--recycle-dir",
378 "help" : "directory for extended retention of files that "
379 @@ -441,6 +448,12 @@ def emirrordist_main(args):
380 if not options.mirror:
381 parser.error('No action specified')
382
383 + if options.delete and config.content_db is None:
384 + for layout in config.layouts:
385 + if isinstance(layout, ContentHashLayout):
386 + parser.error("content-hash layout requires "
387 + "--content-db to be specified")
388 +
389 returncode = os.EX_OK
390
391 if options.mirror:
392 diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
393 index a683793f0..73abec595 100644
394 --- a/lib/portage/package/ebuild/fetch.py
395 +++ b/lib/portage/package/ebuild/fetch.py
396 @@ -365,10 +365,10 @@ class DistfileName(str):
397 In order to prepare for a migration from filename-hash to
398 content-hash layout, all consumers of the layout get_filenames
399 method need to be updated to work with content digests as a
400 - substitute for distfile names. For example, in order to prepare
401 - emirrordist for content-hash, a key-value store needs to be
402 - added as a means to associate distfile names with content
403 - digest values yielded by the content-hash get_filenames
404 + substitute for distfile names. For example, emirrordist requires
405 + the --content-db option when working with a content-hash layout,
406 + which serves as a means to associate distfile names
407 + with content digest values yielded by the content-hash get_filenames
408 implementation.
409 """
410 def __new__(cls, s, digests=None):
411 diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
412 index d50a4cbfc..881288cdc 100644
413 --- a/lib/portage/tests/ebuild/test_fetch.py
414 +++ b/lib/portage/tests/ebuild/test_fetch.py
415 @@ -172,6 +172,16 @@ class EbuildFetchTestCase(TestCase):
416 with open(os.path.join(settings['DISTDIR'], 'layout.conf'), 'wt') as f:
417 f.write(layout_data)
418
419 + if any(isinstance(layout, ContentHashLayout) for layout in layouts):
420 + content_db = os.path.join(playground.eprefix, 'var/db/emirrordist/content.db')
421 + os.makedirs(os.path.dirname(content_db), exist_ok=True)
422 + try:
423 + os.unlink(content_db)
424 + except OSError:
425 + pass
426 + else:
427 + content_db = None
428 +
429 # Demonstrate that fetch preserves a stale file in DISTDIR when no digests are given.
430 foo_uri = {'foo': ('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, port=server.server_port),)}
431 foo_path = os.path.join(settings['DISTDIR'], 'foo')
432 @@ -233,9 +243,13 @@ class EbuildFetchTestCase(TestCase):
433 os.path.join(self.bindir, 'emirrordist'),
434 '--distfiles', settings['DISTDIR'],
435 '--config-root', settings['EPREFIX'],
436 + '--delete',
437 '--repositories-configuration', settings.repositories.config_string(),
438 '--repo', 'test_repo', '--mirror')
439
440 + if content_db is not None:
441 + emirrordist_cmd = emirrordist_cmd + ('--content-db', content_db,)
442 +
443 env = settings.environ()
444 env['PYTHONPATH'] = ':'.join(
445 filter(None, [PORTAGE_PYM_PATH] + os.environ.get('PYTHONPATH', '').split(':')))
446 diff --git a/man/emirrordist.1 b/man/emirrordist.1
447 index 45108ef8c..7ad10dfd0 100644
448 --- a/man/emirrordist.1
449 +++ b/man/emirrordist.1
450 @@ -1,4 +1,4 @@
451 -.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage"
452 +.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage"
453 .SH "NAME"
454 emirrordist \- a fetch tool for mirroring of package distfiles
455 .SH SYNOPSIS
456 @@ -66,6 +66,10 @@ reporting purposes. Opened in append mode.
457 Log file for scheduled deletions, with tab\-delimited output, for
458 reporting purposes. Overwritten with each run.
459 .TP
460 +\fB\-\-content\-db\fR=\fIFILE\fR
461 +Database file used to pair content digests with distfiles names
462 +(required for content\-hash layout).
463 +.TP
464 \fB\-\-delete\fR
465 Enable deletion of unused distfiles.
466 .TP
467 --
468 2.26.2