1 |
Add a --content-db option which is required for the content-hash |
2 |
layout because its file listings return content digests instead of |
3 |
distfile names. |
4 |
|
5 |
The content db serves to translate content digests to distfiles |
6 |
names, and distfiles names to content digests. All keys have a |
7 |
prefix separated by a colon. For digest keys, the prefix is the |
8 |
hash algorithm name. For filename keys, the prefix is "filename". |
9 |
|
10 |
The value associated with a digest key is a set of file names. The |
11 |
value associated with a distfile key is a set of content revisions. |
12 |
Each content revision is expressed as a dictionary of digests which |
13 |
is suitable for construction of a DistfileName instance. |
14 |
|
15 |
Bug: https://bugs.gentoo.org/756778 |
16 |
Signed-off-by: Zac Medico <zmedico@g.o> |
17 |
--- |
18 |
[PATCH v3] changed the value associated with a digest key to a set |
19 |
of file names, and fixed ContentDB.remove to preserve independent |
20 |
references to identical content (like removing one of multiple |
21 |
hardlinks). |
22 |
|
23 |
lib/portage/_emirrordist/Config.py | 8 +- |
24 |
lib/portage/_emirrordist/ContentDB.py | 178 +++++++++++++++++++ |
25 |
lib/portage/_emirrordist/DeletionIterator.py | 25 ++- |
26 |
lib/portage/_emirrordist/DeletionTask.py | 8 + |
27 |
lib/portage/_emirrordist/FetchTask.py | 5 +- |
28 |
lib/portage/_emirrordist/main.py | 15 +- |
29 |
lib/portage/package/ebuild/fetch.py | 8 +- |
30 |
lib/portage/tests/ebuild/test_fetch.py | 14 ++ |
31 |
man/emirrordist.1 | 6 +- |
32 |
9 files changed, 256 insertions(+), 11 deletions(-) |
33 |
create mode 100644 lib/portage/_emirrordist/ContentDB.py |
34 |
|
35 |
diff --git a/lib/portage/_emirrordist/Config.py b/lib/portage/_emirrordist/Config.py |
36 |
index 4bee4f45e..cfe944040 100644 |
37 |
--- a/lib/portage/_emirrordist/Config.py |
38 |
+++ b/lib/portage/_emirrordist/Config.py |
39 |
@@ -1,4 +1,4 @@ |
40 |
-# Copyright 2013-2020 Gentoo Authors |
41 |
+# Copyright 2013-2021 Gentoo Authors |
42 |
# Distributed under the terms of the GNU General Public License v2 |
43 |
|
44 |
import copy |
45 |
@@ -10,6 +10,7 @@ import time |
46 |
from portage import os |
47 |
from portage.package.ebuild.fetch import MirrorLayoutConfig |
48 |
from portage.util import grabdict, grablines |
49 |
+from .ContentDB import ContentDB |
50 |
|
51 |
class Config: |
52 |
def __init__(self, options, portdb, event_loop): |
53 |
@@ -65,6 +66,11 @@ class Config: |
54 |
self.distfiles_db = self._open_shelve( |
55 |
options.distfiles_db, 'distfiles') |
56 |
|
57 |
+ self.content_db = None |
58 |
+ if options.content_db is not None: |
59 |
+ self.content_db = ContentDB(self._open_shelve( |
60 |
+ options.content_db, 'content')) |
61 |
+ |
62 |
self.deletion_db = None |
63 |
if options.deletion_db is not None: |
64 |
self.deletion_db = self._open_shelve( |
65 |
diff --git a/lib/portage/_emirrordist/ContentDB.py b/lib/portage/_emirrordist/ContentDB.py |
66 |
new file mode 100644 |
67 |
index 000000000..7084cecff |
68 |
--- /dev/null |
69 |
+++ b/lib/portage/_emirrordist/ContentDB.py |
70 |
@@ -0,0 +1,178 @@ |
71 |
+# Copyright 2021 Gentoo Authors |
72 |
+# Distributed under the terms of the GNU General Public License v2 |
73 |
+ |
74 |
+import logging |
75 |
+import operator |
76 |
+import shelve |
77 |
+import typing |
78 |
+ |
79 |
+from portage.package.ebuild.fetch import DistfileName |
80 |
+ |
81 |
+ |
82 |
+class ContentDB: |
83 |
+ """ |
84 |
+ The content db serves to translate content digests to distfiles |
85 |
+ names, and distfiles names to content digests. All keys have a |
86 |
+ prefix separated by a colon. For digest keys, the prefix is the |
87 |
+ hash algorithm name. For filename keys, the prefix is "filename". |
88 |
+ |
89 |
+ The value associated with a digest key is a set of file names. The |
90 |
+ value associated with a distfile key is a set of content revisions. |
91 |
+ Each content revision is expressed as a dictionary of digests which |
92 |
+ is suitable for construction of a DistfileName instance. |
93 |
+ """ |
94 |
+ |
95 |
+ def __init__(self, shelve_instance: shelve.Shelf): |
96 |
+ self._shelve = shelve_instance |
97 |
+ |
98 |
+ def add(self, filename: DistfileName): |
99 |
+ """ |
100 |
+ Add file name and digests. |
101 |
+ |
102 |
+ @param filename: file name with digests attribute |
103 |
+ """ |
104 |
+ distfile_str = str(filename) |
105 |
+ distfile_key = "filename:{}".format(distfile_str) |
106 |
+ for k, v in filename.digests.items(): |
107 |
+ if k != "size": |
108 |
+ digest_key = "{}:{}".format(k, v).lower() |
109 |
+ try: |
110 |
+ digest_files = self._shelve[digest_key] |
111 |
+ except KeyError: |
112 |
+ digest_files = set() |
113 |
+ digest_files.add(distfile_str) |
114 |
+ self._shelve[digest_key] = digest_files |
115 |
+ try: |
116 |
+ content_revisions = self._shelve[distfile_key] |
117 |
+ except KeyError: |
118 |
+ content_revisions = set() |
119 |
+ |
120 |
+ revision_key = tuple( |
121 |
+ sorted( |
122 |
+ ( |
123 |
+ (algo.lower(), filename.digests[algo].lower()) |
124 |
+ for algo in filename.digests |
125 |
+ if algo != "size" |
126 |
+ ), |
127 |
+ key=operator.itemgetter(0), |
128 |
+ ) |
129 |
+ ) |
130 |
+ content_revisions.add(revision_key) |
131 |
+ self._shelve[distfile_key] = content_revisions |
132 |
+ |
133 |
+ def remove(self, filename: DistfileName): |
134 |
+ """ |
135 |
+ Remove a file name from the database. If identical content is still |
136 |
+ referenced by one or more other file names, then those references |
137 |
+ are preserved (like removing one of many hardlinks). |
138 |
+ |
139 |
+ @param filename: file name with digests attribute |
140 |
+ """ |
141 |
+ distfile_key = "filename:{}".format(filename) |
142 |
+ try: |
143 |
+ content_revisions = self._shelve[distfile_key] |
144 |
+ except KeyError: |
145 |
+ pass |
146 |
+ else: |
147 |
+ for revision_key in content_revisions: |
148 |
+ for k, v in revision_key: |
149 |
+ digest_key = "{}:{}".format(k, v) |
150 |
+ try: |
151 |
+ digest_files = self._shelve[digest_key] |
152 |
+ except KeyError: |
153 |
+ digest_files = set() |
154 |
+ |
155 |
+ try: |
156 |
+ digest_files.remove(filename) |
157 |
+ except KeyError: |
158 |
+ pass |
159 |
+ else: |
160 |
+ if digest_files: |
161 |
+ self._shelve[digest_key] = digest_files |
162 |
+ else: |
163 |
+ try: |
164 |
+ del self._shelve[digest_key] |
165 |
+ except KeyError: |
166 |
+ pass |
167 |
+ |
168 |
+ logging.debug(("drop '%s' from content db") % filename) |
169 |
+ try: |
170 |
+ del self._shelve[distfile_key] |
171 |
+ except KeyError: |
172 |
+ pass |
173 |
+ |
174 |
+ def get_filenames_translate( |
175 |
+ self, filename: typing.Union[str, DistfileName] |
176 |
+ ) -> typing.Generator[DistfileName, None, None]: |
177 |
+ """ |
178 |
+ Translate distfiles content digests to distfile names. |
179 |
+ If filename is already a distfile name, then it will pass |
180 |
+ through unchanged. |
181 |
+ |
182 |
+ @param filename: A filename listed by layout get_filenames |
183 |
+ @return: The distfile name, translated from the corresponding |
184 |
+ content digest when necessary |
185 |
+ """ |
186 |
+ if not isinstance(filename, DistfileName): |
187 |
+ filename = DistfileName(filename) |
188 |
+ |
189 |
+ # Match content digests with zero or more content revisions. |
190 |
+ matched_revisions = {} |
191 |
+ |
192 |
+ for k, v in filename.digests.items(): |
193 |
+ digest_item = (k.lower(), v.lower()) |
194 |
+ digest_key = "{}:{}".format(*digest_item) |
195 |
+ try: |
196 |
+ digest_files = self._shelve[digest_key] |
197 |
+ except KeyError: |
198 |
+ continue |
199 |
+ |
200 |
+ for distfile_str in digest_files: |
201 |
+ matched_revisions.setdefault(distfile_str, set()) |
202 |
+ try: |
203 |
+ content_revisions = self._shelve["filename:{}".format(distfile_str)] |
204 |
+ except KeyError: |
205 |
+ pass |
206 |
+ else: |
207 |
+ for revision_key in content_revisions: |
208 |
+ if ( |
209 |
+ digest_item in revision_key |
210 |
+ and revision_key not in matched_revisions[distfile_str] |
211 |
+ ): |
212 |
+ matched_revisions[distfile_str].add(revision_key) |
213 |
+ yield DistfileName(distfile_str, digests=dict(revision_key)) |
214 |
+ |
215 |
+ if not any(matched_revisions.values()): |
216 |
+ # Since filename matched zero content revisions, allow |
217 |
+ # it to pass through unchanged (on the path toward deletion). |
218 |
+ yield filename |
219 |
+ |
220 |
+ def __len__(self): |
221 |
+ return len(self._shelve) |
222 |
+ |
223 |
+ def __contains__(self, k): |
224 |
+ return k in self._shelve |
225 |
+ |
226 |
+ def __iter__(self): |
227 |
+ return self._shelve.__iter__() |
228 |
+ |
229 |
+ def items(self): |
230 |
+ return self._shelve.items() |
231 |
+ |
232 |
+ def __setitem__(self, k, v): |
233 |
+ self._shelve[k] = v |
234 |
+ |
235 |
+ def __getitem__(self, k): |
236 |
+ return self._shelve[k] |
237 |
+ |
238 |
+ def __delitem__(self, k): |
239 |
+ del self._shelve[k] |
240 |
+ |
241 |
+ def get(self, k, *args): |
242 |
+ return self._shelve.get(k, *args) |
243 |
+ |
244 |
+ def close(self): |
245 |
+ self._shelve.close() |
246 |
+ |
247 |
+ def clear(self): |
248 |
+ self._shelve.clear() |
249 |
diff --git a/lib/portage/_emirrordist/DeletionIterator.py b/lib/portage/_emirrordist/DeletionIterator.py |
250 |
index 08985ed6c..ab4309f9a 100644 |
251 |
--- a/lib/portage/_emirrordist/DeletionIterator.py |
252 |
+++ b/lib/portage/_emirrordist/DeletionIterator.py |
253 |
@@ -1,10 +1,12 @@ |
254 |
-# Copyright 2013-2019 Gentoo Authors |
255 |
+# Copyright 2013-2021 Gentoo Authors |
256 |
# Distributed under the terms of the GNU General Public License v2 |
257 |
|
258 |
+import itertools |
259 |
import logging |
260 |
import stat |
261 |
|
262 |
from portage import os |
263 |
+from portage.package.ebuild.fetch import DistfileName |
264 |
from .DeletionTask import DeletionTask |
265 |
|
266 |
class DeletionIterator: |
267 |
@@ -21,8 +23,25 @@ class DeletionIterator: |
268 |
deletion_delay = self._config.options.deletion_delay |
269 |
start_time = self._config.start_time |
270 |
distfiles_set = set() |
271 |
- for layout in self._config.layouts: |
272 |
- distfiles_set.update(layout.get_filenames(distdir)) |
273 |
+ distfiles_set.update( |
274 |
+ ( |
275 |
+ filename |
276 |
+ if isinstance(filename, DistfileName) |
277 |
+ else DistfileName(filename) |
278 |
+ for filename in itertools.chain.from_iterable( |
279 |
+ layout.get_filenames(distdir) for layout in self._config.layouts |
280 |
+ ) |
281 |
+ ) |
282 |
+ if self._config.content_db is None |
283 |
+ else itertools.chain.from_iterable( |
284 |
+ ( |
285 |
+ self._config.content_db.get_filenames_translate(filename) |
286 |
+ for filename in itertools.chain.from_iterable( |
287 |
+ layout.get_filenames(distdir) for layout in self._config.layouts |
288 |
+ ) |
289 |
+ ) |
290 |
+ ) |
291 |
+ ) |
292 |
for filename in distfiles_set: |
293 |
# require at least one successful stat() |
294 |
exceptions = [] |
295 |
diff --git a/lib/portage/_emirrordist/DeletionTask.py b/lib/portage/_emirrordist/DeletionTask.py |
296 |
index 5eb01d840..73493c5a1 100644 |
297 |
--- a/lib/portage/_emirrordist/DeletionTask.py |
298 |
+++ b/lib/portage/_emirrordist/DeletionTask.py |
299 |
@@ -5,6 +5,7 @@ import errno |
300 |
import logging |
301 |
|
302 |
from portage import os |
303 |
+from portage.package.ebuild.fetch import ContentHashLayout |
304 |
from portage.util._async.FileCopier import FileCopier |
305 |
from _emerge.CompositeTask import CompositeTask |
306 |
|
307 |
@@ -99,6 +100,10 @@ class DeletionTask(CompositeTask): |
308 |
def _delete_links(self): |
309 |
success = True |
310 |
for layout in self.config.layouts: |
311 |
+ if isinstance(layout, ContentHashLayout) and not self.distfile.digests: |
312 |
+ logging.debug(("_delete_links: '%s' has " |
313 |
+ "no digests") % self.distfile) |
314 |
+ continue |
315 |
distfile_path = os.path.join( |
316 |
self.config.options.distfiles, |
317 |
layout.get_path(self.distfile)) |
318 |
@@ -134,6 +139,9 @@ class DeletionTask(CompositeTask): |
319 |
logging.debug(("drop '%s' from " |
320 |
"distfiles db") % self.distfile) |
321 |
|
322 |
+ if self.config.content_db is not None: |
323 |
+ self.config.content_db.remove(self.distfile) |
324 |
+ |
325 |
if self.config.deletion_db is not None: |
326 |
try: |
327 |
del self.config.deletion_db[self.distfile] |
328 |
diff --git a/lib/portage/_emirrordist/FetchTask.py b/lib/portage/_emirrordist/FetchTask.py |
329 |
index 997762082..5a48f91cd 100644 |
330 |
--- a/lib/portage/_emirrordist/FetchTask.py |
331 |
+++ b/lib/portage/_emirrordist/FetchTask.py |
332 |
@@ -1,4 +1,4 @@ |
333 |
-# Copyright 2013-2020 Gentoo Authors |
334 |
+# Copyright 2013-2021 Gentoo Authors |
335 |
# Distributed under the terms of the GNU General Public License v2 |
336 |
|
337 |
import collections |
338 |
@@ -47,6 +47,9 @@ class FetchTask(CompositeTask): |
339 |
# Convert _pkg_str to str in order to prevent pickle problems. |
340 |
self.config.distfiles_db[self.distfile] = str(self.cpv) |
341 |
|
342 |
+ if self.config.content_db is not None: |
343 |
+ self.config.content_db.add(self.distfile) |
344 |
+ |
345 |
if not self._have_needed_digests(): |
346 |
msg = "incomplete digests: %s" % " ".join(self.digests) |
347 |
self.scheduler.output(msg, background=self.background, |
348 |
diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py |
349 |
index 8d00a05f5..2200ec715 100644 |
350 |
--- a/lib/portage/_emirrordist/main.py |
351 |
+++ b/lib/portage/_emirrordist/main.py |
352 |
@@ -1,4 +1,4 @@ |
353 |
-# Copyright 2013-2020 Gentoo Authors |
354 |
+# Copyright 2013-2021 Gentoo Authors |
355 |
# Distributed under the terms of the GNU General Public License v2 |
356 |
|
357 |
import argparse |
358 |
@@ -7,6 +7,7 @@ import sys |
359 |
|
360 |
import portage |
361 |
from portage import os |
362 |
+from portage.package.ebuild.fetch import ContentHashLayout |
363 |
from portage.util import normalize_path, _recursive_file_list |
364 |
from portage.util._async.run_main_scheduler import run_main_scheduler |
365 |
from portage.util._async.SchedulerInterface import SchedulerInterface |
366 |
@@ -151,6 +152,12 @@ common_options = ( |
367 |
"distfile belongs to", |
368 |
"metavar" : "FILE" |
369 |
}, |
370 |
+ { |
371 |
+ "longopt" : "--content-db", |
372 |
+ "help" : "database file used to map content digests to " |
373 |
+ "distfiles names (required for content-hash layout)", |
374 |
+ "metavar" : "FILE" |
375 |
+ }, |
376 |
{ |
377 |
"longopt" : "--recycle-dir", |
378 |
"help" : "directory for extended retention of files that " |
379 |
@@ -441,6 +448,12 @@ def emirrordist_main(args): |
380 |
if not options.mirror: |
381 |
parser.error('No action specified') |
382 |
|
383 |
+ if options.delete and config.content_db is None: |
384 |
+ for layout in config.layouts: |
385 |
+ if isinstance(layout, ContentHashLayout): |
386 |
+ parser.error("content-hash layout requires " |
387 |
+ "--content-db to be specified") |
388 |
+ |
389 |
returncode = os.EX_OK |
390 |
|
391 |
if options.mirror: |
392 |
diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py |
393 |
index a683793f0..73abec595 100644 |
394 |
--- a/lib/portage/package/ebuild/fetch.py |
395 |
+++ b/lib/portage/package/ebuild/fetch.py |
396 |
@@ -365,10 +365,10 @@ class DistfileName(str): |
397 |
In order to prepare for a migration from filename-hash to |
398 |
content-hash layout, all consumers of the layout get_filenames |
399 |
method need to be updated to work with content digests as a |
400 |
- substitute for distfile names. For example, in order to prepare |
401 |
- emirrordist for content-hash, a key-value store needs to be |
402 |
- added as a means to associate distfile names with content |
403 |
- digest values yielded by the content-hash get_filenames |
404 |
+ substitute for distfile names. For example, emirrordist requires |
405 |
+ the --content-db option when working with a content-hash layout, |
406 |
+ which serves as a means to associate distfile names |
407 |
+ with content digest values yielded by the content-hash get_filenames |
408 |
implementation. |
409 |
""" |
410 |
def __new__(cls, s, digests=None): |
411 |
diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py |
412 |
index d50a4cbfc..881288cdc 100644 |
413 |
--- a/lib/portage/tests/ebuild/test_fetch.py |
414 |
+++ b/lib/portage/tests/ebuild/test_fetch.py |
415 |
@@ -172,6 +172,16 @@ class EbuildFetchTestCase(TestCase): |
416 |
with open(os.path.join(settings['DISTDIR'], 'layout.conf'), 'wt') as f: |
417 |
f.write(layout_data) |
418 |
|
419 |
+ if any(isinstance(layout, ContentHashLayout) for layout in layouts): |
420 |
+ content_db = os.path.join(playground.eprefix, 'var/db/emirrordist/content.db') |
421 |
+ os.makedirs(os.path.dirname(content_db), exist_ok=True) |
422 |
+ try: |
423 |
+ os.unlink(content_db) |
424 |
+ except OSError: |
425 |
+ pass |
426 |
+ else: |
427 |
+ content_db = None |
428 |
+ |
429 |
# Demonstrate that fetch preserves a stale file in DISTDIR when no digests are given. |
430 |
foo_uri = {'foo': ('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, port=server.server_port),)} |
431 |
foo_path = os.path.join(settings['DISTDIR'], 'foo') |
432 |
@@ -233,9 +243,13 @@ class EbuildFetchTestCase(TestCase): |
433 |
os.path.join(self.bindir, 'emirrordist'), |
434 |
'--distfiles', settings['DISTDIR'], |
435 |
'--config-root', settings['EPREFIX'], |
436 |
+ '--delete', |
437 |
'--repositories-configuration', settings.repositories.config_string(), |
438 |
'--repo', 'test_repo', '--mirror') |
439 |
|
440 |
+ if content_db is not None: |
441 |
+ emirrordist_cmd = emirrordist_cmd + ('--content-db', content_db,) |
442 |
+ |
443 |
env = settings.environ() |
444 |
env['PYTHONPATH'] = ':'.join( |
445 |
filter(None, [PORTAGE_PYM_PATH] + os.environ.get('PYTHONPATH', '').split(':'))) |
446 |
diff --git a/man/emirrordist.1 b/man/emirrordist.1 |
447 |
index 45108ef8c..7ad10dfd0 100644 |
448 |
--- a/man/emirrordist.1 |
449 |
+++ b/man/emirrordist.1 |
450 |
@@ -1,4 +1,4 @@ |
451 |
-.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage" |
452 |
+.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage" |
453 |
.SH "NAME" |
454 |
emirrordist \- a fetch tool for mirroring of package distfiles |
455 |
.SH SYNOPSIS |
456 |
@@ -66,6 +66,10 @@ reporting purposes. Opened in append mode. |
457 |
Log file for scheduled deletions, with tab\-delimited output, for |
458 |
reporting purposes. Overwritten with each run. |
459 |
.TP |
460 |
+\fB\-\-content\-db\fR=\fIFILE\fR |
461 |
+Database file used to pair content digests with distfiles names |
462 |
+(required for content\-hash layout). |
463 |
+.TP |
464 |
\fB\-\-delete\fR |
465 |
Enable deletion of unused distfiles. |
466 |
.TP |
467 |
-- |
468 |
2.26.2 |