Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH v2] emirrordist: add --content-db option required for content-hash layout (bug 756778)
Date: Fri, 26 Feb 2021 12:22:45
Message-Id: 20210226122150.1112987-1-zmedico@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH] emirrordist: add --content-db option required for content-hash layout (bug 756778) by Zac Medico
1 Add a --content-db option which is required for the content-hash
2 layout because its file listings return content digests instead of
3 distfile names.
4
5 The content db serves to translate content digests to distfile
6 names, and distfile names to content digests. All keys have a
7 prefix separated by a colon. For digest keys, the prefix is the
8 hash algorithm name. For filename keys, the prefix is "filename".
9
10 The value associated with a digest key is a plain filename. The
11 value associated with a filename key is a set of content revisions.
12 Each content revision is expressed as a dictionary of digests which
13 is suitable for construction of a DistfileName instance.
14
15 Bug: https://bugs.gentoo.org/756778
16 Signed-off-by: Zac Medico <zmedico@g.o>
17 ---
18 [PATCH v2] Split out ContentDB class and associate distfile key
19 with a set of content revisions, where each content revision is
20 expressed as a dictionary of digests.
21
22 lib/portage/_emirrordist/Config.py | 8 +-
23 lib/portage/_emirrordist/ContentDB.py | 158 +++++++++++++++++++
24 lib/portage/_emirrordist/DeletionIterator.py | 25 ++-
25 lib/portage/_emirrordist/DeletionTask.py | 8 +
26 lib/portage/_emirrordist/FetchTask.py | 5 +-
27 lib/portage/_emirrordist/main.py | 15 +-
28 lib/portage/tests/ebuild/test_fetch.py | 14 ++
29 man/emirrordist.1 | 6 +-
30 8 files changed, 232 insertions(+), 7 deletions(-)
31 create mode 100644 lib/portage/_emirrordist/ContentDB.py
32
33 diff --git a/lib/portage/_emirrordist/Config.py b/lib/portage/_emirrordist/Config.py
34 index 4bee4f45e..cfe944040 100644
35 --- a/lib/portage/_emirrordist/Config.py
36 +++ b/lib/portage/_emirrordist/Config.py
37 @@ -1,4 +1,4 @@
38 -# Copyright 2013-2020 Gentoo Authors
39 +# Copyright 2013-2021 Gentoo Authors
40 # Distributed under the terms of the GNU General Public License v2
41
42 import copy
43 @@ -10,6 +10,7 @@ import time
44 from portage import os
45 from portage.package.ebuild.fetch import MirrorLayoutConfig
46 from portage.util import grabdict, grablines
47 +from .ContentDB import ContentDB
48
49 class Config:
50 def __init__(self, options, portdb, event_loop):
51 @@ -65,6 +66,11 @@ class Config:
52 self.distfiles_db = self._open_shelve(
53 options.distfiles_db, 'distfiles')
54
55 + self.content_db = None
56 + if options.content_db is not None:
57 + self.content_db = ContentDB(self._open_shelve(
58 + options.content_db, 'content'))
59 +
60 self.deletion_db = None
61 if options.deletion_db is not None:
62 self.deletion_db = self._open_shelve(
63 diff --git a/lib/portage/_emirrordist/ContentDB.py b/lib/portage/_emirrordist/ContentDB.py
64 new file mode 100644
65 index 000000000..60e6ef39d
66 --- /dev/null
67 +++ b/lib/portage/_emirrordist/ContentDB.py
68 @@ -0,0 +1,158 @@
69 +# Copyright 2021 Gentoo Authors
70 +# Distributed under the terms of the GNU General Public License v2
71 +
72 +import logging
73 +import operator
74 +import shelve
75 +import typing
76 +
77 +from portage.package.ebuild.fetch import DistfileName
78 +
79 +
80 +class ContentDB:
81 + """
82 + The content db serves to translate content digests to distfiles
83 + names, and distfiles names to content digests. All keys have a
84 + prefix separated by a colon. For digest keys, the prefix is the
85 + hash algorithm name. For filename keys, the prefix is "filename".
86 +
87 + The value associated with a digest key is a plain filename. The
88 + value associated with a distfile key is a set of content revisions.
89 + Each content revision is expressed as a dictionary of digests which
90 + is suitable for construction of a DistfileName instance.
91 + """
92 +
93 + def __init__(self, shelve_instance: shelve.Shelf):
94 + self._shelve = shelve_instance
95 +
96 + def add(self, filename: DistfileName):
97 + """
98 + Add file name and digests.
99 +
100 + @param filename: file name with digests attribute
101 + """
102 + distfile_str = str(filename)
103 + distfile_key = "filename:{}".format(distfile_str)
104 + for k, v in filename.digests.items():
105 + if k != "size":
106 + self._shelve["{}:{}".format(k, v).lower()] = distfile_str
107 + try:
108 + content_revisions = self._shelve[distfile_key]
109 + except KeyError:
110 + content_revisions = set()
111 +
112 + revision_key = tuple(
113 + sorted(
114 + (
115 + (algo.lower(), filename.digests[algo].lower())
116 + for algo in filename.digests
117 + if algo != "size"
118 + ),
119 + key=operator.itemgetter(0),
120 + )
121 + )
122 + content_revisions.add(revision_key)
123 + self._shelve[distfile_key] = content_revisions
124 +
125 + def remove(self, filename: DistfileName):
126 + """
127 + Remove a file name from the database.
128 +
129 + @param filename: file name with digests attribute
130 + """
131 + distfile_key = "filename:{}".format(filename)
132 + try:
133 + content_revisions = self._shelve[distfile_key]
134 + except KeyError:
135 + pass
136 + else:
137 + for revision_key in content_revisions:
138 + for k, v in revision_key:
139 + try:
140 + del self._shelve["{}:{}".format(k, v)]
141 + except KeyError:
142 + pass
143 +
144 + logging.debug(("drop '%s' from content db") % filename)
145 + try:
146 + del self._shelve[distfile_key]
147 + except KeyError:
148 + pass
149 +
150 + def get_filenames_translate(
151 + self, filename: typing.Union[str, DistfileName]
152 + ) -> typing.Generator[DistfileName, None, None]:
153 + """
154 + Translate distfiles content digests to distfile names.
155 + If filename is already a distfile name, then it will pass
156 + through unchanged.
157 +
158 + @param filename: A filename listed by layout get_filenames
159 + @return: The distfile name, translated from the corresponding
160 + content digest when necessary
161 + """
162 + if not isinstance(filename, DistfileName):
163 + filename = DistfileName(filename)
164 + if self._shelve is None:
165 + yield filename
166 + return
167 +
168 + # Match content digests with zero or more content revisions.
169 + matched_revisions = {}
170 +
171 + for k, v in filename.digests.items():
172 + digest_item = (k.lower(), v.lower())
173 + digest_key = "{}:{}".format(*digest_item)
174 + try:
175 + distfile_str = self._shelve[digest_key]
176 + except KeyError:
177 + continue
178 +
179 + matched_revisions.setdefault(distfile_str, set())
180 + try:
181 + content_revisions = self._shelve["filename:{}".format(distfile_str)]
182 + except KeyError:
183 + pass
184 + else:
185 + for revision_key in content_revisions:
186 + if (
187 + digest_item in revision_key
188 + and revision_key not in matched_revisions.get(distfile_str, ())
189 + ):
190 + matched_revisions[distfile_str].add(revision_key)
191 + yield DistfileName(distfile_str, digests=dict(revision_key))
192 +
193 + if not any(matched_revisions.values()):
194 + # Since filename matched zero content revisions, allow
195 + # it to pass through unchanged (on the path toward deletion).
196 + yield filename
197 +
198 + def __len__(self):
199 + return len(self._shelve)
200 +
201 + def __contains__(self, k):
202 + return k in self._shelve
203 +
204 + def __iter__(self):
205 + return self._shelve.__iter__()
206 +
207 + def items(self):
208 + return self._shelve.iteritems()
209 +
210 + def __setitem__(self, k, v):
211 + self._shelve[k] = v
212 +
213 + def __getitem__(self, k):
214 + return self._shelve[k]
215 +
216 + def __delitem__(self, k):
217 + del self._shelve[k]
218 +
219 + def get(self, k, *args):
220 + return self._shelve.get(k, *args)
221 +
222 + def close(self):
223 + self._shelve.close()
224 +
225 + def clear(self):
226 + self._shelve.clear()
227 diff --git a/lib/portage/_emirrordist/DeletionIterator.py b/lib/portage/_emirrordist/DeletionIterator.py
228 index 08985ed6c..ab4309f9a 100644
229 --- a/lib/portage/_emirrordist/DeletionIterator.py
230 +++ b/lib/portage/_emirrordist/DeletionIterator.py
231 @@ -1,10 +1,12 @@
232 -# Copyright 2013-2019 Gentoo Authors
233 +# Copyright 2013-2021 Gentoo Authors
234 # Distributed under the terms of the GNU General Public License v2
235
236 +import itertools
237 import logging
238 import stat
239
240 from portage import os
241 +from portage.package.ebuild.fetch import DistfileName
242 from .DeletionTask import DeletionTask
243
244 class DeletionIterator:
245 @@ -21,8 +23,25 @@ class DeletionIterator:
246 deletion_delay = self._config.options.deletion_delay
247 start_time = self._config.start_time
248 distfiles_set = set()
249 - for layout in self._config.layouts:
250 - distfiles_set.update(layout.get_filenames(distdir))
251 + distfiles_set.update(
252 + (
253 + filename
254 + if isinstance(filename, DistfileName)
255 + else DistfileName(filename)
256 + for filename in itertools.chain.from_iterable(
257 + layout.get_filenames(distdir) for layout in self._config.layouts
258 + )
259 + )
260 + if self._config.content_db is None
261 + else itertools.chain.from_iterable(
262 + (
263 + self._config.content_db.get_filenames_translate(filename)
264 + for filename in itertools.chain.from_iterable(
265 + layout.get_filenames(distdir) for layout in self._config.layouts
266 + )
267 + )
268 + )
269 + )
270 for filename in distfiles_set:
271 # require at least one successful stat()
272 exceptions = []
273 diff --git a/lib/portage/_emirrordist/DeletionTask.py b/lib/portage/_emirrordist/DeletionTask.py
274 index 5eb01d840..73493c5a1 100644
275 --- a/lib/portage/_emirrordist/DeletionTask.py
276 +++ b/lib/portage/_emirrordist/DeletionTask.py
277 @@ -5,6 +5,7 @@ import errno
278 import logging
279
280 from portage import os
281 +from portage.package.ebuild.fetch import ContentHashLayout
282 from portage.util._async.FileCopier import FileCopier
283 from _emerge.CompositeTask import CompositeTask
284
285 @@ -99,6 +100,10 @@ class DeletionTask(CompositeTask):
286 def _delete_links(self):
287 success = True
288 for layout in self.config.layouts:
289 + if isinstance(layout, ContentHashLayout) and not self.distfile.digests:
290 + logging.debug(("_delete_links: '%s' has "
291 + "no digests") % self.distfile)
292 + continue
293 distfile_path = os.path.join(
294 self.config.options.distfiles,
295 layout.get_path(self.distfile))
296 @@ -134,6 +139,9 @@ class DeletionTask(CompositeTask):
297 logging.debug(("drop '%s' from "
298 "distfiles db") % self.distfile)
299
300 + if self.config.content_db is not None:
301 + self.config.content_db.remove(self.distfile)
302 +
303 if self.config.deletion_db is not None:
304 try:
305 del self.config.deletion_db[self.distfile]
306 diff --git a/lib/portage/_emirrordist/FetchTask.py b/lib/portage/_emirrordist/FetchTask.py
307 index 997762082..5a48f91cd 100644
308 --- a/lib/portage/_emirrordist/FetchTask.py
309 +++ b/lib/portage/_emirrordist/FetchTask.py
310 @@ -1,4 +1,4 @@
311 -# Copyright 2013-2020 Gentoo Authors
312 +# Copyright 2013-2021 Gentoo Authors
313 # Distributed under the terms of the GNU General Public License v2
314
315 import collections
316 @@ -47,6 +47,9 @@ class FetchTask(CompositeTask):
317 # Convert _pkg_str to str in order to prevent pickle problems.
318 self.config.distfiles_db[self.distfile] = str(self.cpv)
319
320 + if self.config.content_db is not None:
321 + self.config.content_db.add(self.distfile)
322 +
323 if not self._have_needed_digests():
324 msg = "incomplete digests: %s" % " ".join(self.digests)
325 self.scheduler.output(msg, background=self.background,
326 diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py
327 index 8d00a05f5..2200ec715 100644
328 --- a/lib/portage/_emirrordist/main.py
329 +++ b/lib/portage/_emirrordist/main.py
330 @@ -1,4 +1,4 @@
331 -# Copyright 2013-2020 Gentoo Authors
332 +# Copyright 2013-2021 Gentoo Authors
333 # Distributed under the terms of the GNU General Public License v2
334
335 import argparse
336 @@ -7,6 +7,7 @@ import sys
337
338 import portage
339 from portage import os
340 +from portage.package.ebuild.fetch import ContentHashLayout
341 from portage.util import normalize_path, _recursive_file_list
342 from portage.util._async.run_main_scheduler import run_main_scheduler
343 from portage.util._async.SchedulerInterface import SchedulerInterface
344 @@ -151,6 +152,12 @@ common_options = (
345 "distfile belongs to",
346 "metavar" : "FILE"
347 },
348 + {
349 + "longopt" : "--content-db",
350 + "help" : "database file used to map content digests to"
351 + "distfiles names (required for content-hash layout)",
352 + "metavar" : "FILE"
353 + },
354 {
355 "longopt" : "--recycle-dir",
356 "help" : "directory for extended retention of files that "
357 @@ -441,6 +448,12 @@ def emirrordist_main(args):
358 if not options.mirror:
359 parser.error('No action specified')
360
361 + if options.delete and config.content_db is None:
362 + for layout in config.layouts:
363 + if isinstance(layout, ContentHashLayout):
364 + parser.error("content-hash layout requires "
365 + "--content-db to be specified")
366 +
367 returncode = os.EX_OK
368
369 if options.mirror:
370 diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
371 index d50a4cbfc..881288cdc 100644
372 --- a/lib/portage/tests/ebuild/test_fetch.py
373 +++ b/lib/portage/tests/ebuild/test_fetch.py
374 @@ -172,6 +172,16 @@ class EbuildFetchTestCase(TestCase):
375 with open(os.path.join(settings['DISTDIR'], 'layout.conf'), 'wt') as f:
376 f.write(layout_data)
377
378 + if any(isinstance(layout, ContentHashLayout) for layout in layouts):
379 + content_db = os.path.join(playground.eprefix, 'var/db/emirrordist/content.db')
380 + os.makedirs(os.path.dirname(content_db), exist_ok=True)
381 + try:
382 + os.unlink(content_db)
383 + except OSError:
384 + pass
385 + else:
386 + content_db = None
387 +
388 # Demonstrate that fetch preserves a stale file in DISTDIR when no digests are given.
389 foo_uri = {'foo': ('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, port=server.server_port),)}
390 foo_path = os.path.join(settings['DISTDIR'], 'foo')
391 @@ -233,9 +243,13 @@ class EbuildFetchTestCase(TestCase):
392 os.path.join(self.bindir, 'emirrordist'),
393 '--distfiles', settings['DISTDIR'],
394 '--config-root', settings['EPREFIX'],
395 + '--delete',
396 '--repositories-configuration', settings.repositories.config_string(),
397 '--repo', 'test_repo', '--mirror')
398
399 + if content_db is not None:
400 + emirrordist_cmd = emirrordist_cmd + ('--content-db', content_db,)
401 +
402 env = settings.environ()
403 env['PYTHONPATH'] = ':'.join(
404 filter(None, [PORTAGE_PYM_PATH] + os.environ.get('PYTHONPATH', '').split(':')))
405 diff --git a/man/emirrordist.1 b/man/emirrordist.1
406 index 45108ef8c..7ad10dfd0 100644
407 --- a/man/emirrordist.1
408 +++ b/man/emirrordist.1
409 @@ -1,4 +1,4 @@
410 -.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage"
411 +.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage"
412 .SH "NAME"
413 emirrordist \- a fetch tool for mirroring of package distfiles
414 .SH SYNOPSIS
415 @@ -66,6 +66,10 @@ reporting purposes. Opened in append mode.
416 Log file for scheduled deletions, with tab\-delimited output, for
417 reporting purposes. Overwritten with each run.
418 .TP
419 +\fB\-\-content\-db\fR=\fIFILE\fR
420 +Database file used to pair content digests with distfiles names
421 +(required for content\-hash layout).
422 +.TP
423 \fB\-\-delete\fR
424 Enable deletion of unused distfiles.
425 .TP
426 --
427 2.26.2

Replies