Gentoo Archives: gentoo-commits

From: "Zac Medico (zmedico)" <zmedico@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] portage r10609 - in main/trunk: bin pym/_emerge pym/portage/dbapi
Date: Mon, 09 Jun 2008 13:55:14
Message-Id: E1K5hpz-0004Qs-MT@stork.gentoo.org
1 Author: zmedico
2 Date: 2008-06-09 13:55:06 +0000 (Mon, 09 Jun 2008)
3 New Revision: 10609
4
5 Modified:
6 main/trunk/bin/portageq
7 main/trunk/pym/_emerge/__init__.py
8 main/trunk/pym/portage/dbapi/vartree.py
9 Log:
10 Add CONTENTS indexing support for optimization of owner lookups. The
11 vardbapi cache maintains a hash table (inside vdb_metadata.pickle)
12 that serves to index package contents by mapping the basename of a file
13 to a list of possible packages that own it. This is used to optimize
14 owner lookups by narrowing the search down to a smaller number of
15 packages. It increases the size of vdb_metadata.pickle by approximately
16 30% and it's used in the following cases:
17
18 * When an unexpected file collision occurs (whether
19 or not collision-protect is enabled)
20
21 * `emerge <filename>`
22
23 * `portageq owners`
24
25 The
26
27
28 Modified: main/trunk/bin/portageq
29 ===================================================================
30 --- main/trunk/bin/portageq 2008-06-08 20:23:14 UTC (rev 10608)
31 +++ main/trunk/bin/portageq 2008-06-09 13:55:06 UTC (rev 10609)
32 @@ -188,27 +188,22 @@
33 return 2
34 files.append(f[len(root):])
35
36 - found_owner = False
37 - for cpv in vardb.cpv_all():
38 - cat, pkg = catsplit(cpv)
39 - mylink = dblink(cat, pkg, root, settings, vartree=vardb.vartree)
40 - myfiles = []
41 - for f in files:
42 - if mylink.isowner(f, root):
43 - myfiles.append(f)
44 - if myfiles:
45 - found_owner = True
46 - sys.stdout.write("%s\n" % cpv)
47 - for f in myfiles:
48 - sys.stdout.write("\t%s\n" % \
49 - os.path.join(root, f.lstrip(os.path.sep)))
50 - sys.stdout.flush()
51 - if not found_owner:
52 - sys.stderr.write("None of the installed packages claim the file(s).\n")
53 - sys.stderr.flush()
54 - return 1
55 - return 0
56 + owners = vardb._owners.get_owners(files)
57
58 + for pkg, owned_files in owners.iteritems():
59 + cpv = pkg.mycpv
60 + sys.stdout.write("%s\n" % cpv)
61 + for f in sorted(owned_files):
62 + sys.stdout.write("\t%s\n" % \
63 + os.path.join(root, f.lstrip(os.path.sep)))
64 + if owners:
65 + sys.stdout.flush()
66 + return 0
67 +
68 + sys.stderr.write("None of the installed packages claim the file(s).\n")
69 + sys.stderr.flush()
70 + return 1
71 +
72 owners.uses_root = True
73
74 def best_visible(argv):
75
76 Modified: main/trunk/pym/_emerge/__init__.py
77 ===================================================================
78 --- main/trunk/pym/_emerge/__init__.py 2008-06-08 20:23:14 UTC (rev 10608)
79 +++ main/trunk/pym/_emerge/__init__.py 2008-06-09 13:55:06 UTC (rev 10609)
80 @@ -2560,6 +2560,7 @@
81 myroot = self.target_root
82 dbs = self._filtered_trees[myroot]["dbs"]
83 vardb = self.trees[myroot]["vartree"].dbapi
84 + real_vardb = self._trees_orig[myroot]["vartree"].dbapi
85 portdb = self.trees[myroot]["porttree"].dbapi
86 bindb = self.trees[myroot]["bintree"].dbapi
87 pkgsettings = self.pkgsettings[myroot]
88 @@ -2638,16 +2639,12 @@
89 " $ROOT.\n") % x, noiselevel=-1)
90 return 0, []
91 relative_path = x[len(myroot):]
92 - vartree = self._trees_orig[myroot]["vartree"]
93 owner_cpv = None
94 - for cpv in vardb.cpv_all():
95 - self.spinner.update()
96 - cat, pf = portage.catsplit(cpv)
97 - if portage.dblink(cat, pf, myroot,
98 - pkgsettings, vartree=vartree).isowner(
99 - relative_path, myroot):
100 - owner_cpv = cpv
101 - break
102 + for pkg, relative_path in \
103 + real_vardb._owners.iter_owners([relative_path]):
104 + owner_cpv = pkg.mycpv
105 + break
106 +
107 if owner_cpv is None:
108 portage.writemsg(("\n\n!!! '%s' is not claimed " + \
109 "by any package.\n") % x, noiselevel=-1)
110
111 Modified: main/trunk/pym/portage/dbapi/vartree.py
112 ===================================================================
113 --- main/trunk/pym/portage/dbapi/vartree.py 2008-06-08 20:23:14 UTC (rev 10608)
114 +++ main/trunk/pym/portage/dbapi/vartree.py 2008-06-09 13:55:06 UTC (rev 10609)
115 @@ -238,6 +238,9 @@
116 _excluded_dirs = re.compile(r'^(\..*|-MERGING-.*|' + \
117 "|".join(_excluded_dirs) + r')$')
118
119 + _aux_cache_version = "1"
120 + _owners_cache_version = "1"
121 +
122 # Number of uncached packages to trigger cache update, since
123 # it's wasteful to update it for every vdb change.
124 _aux_cache_threshold = 5
125 @@ -275,8 +278,7 @@
126 "EAPI", "HOMEPAGE", "IUSE", "KEYWORDS",
127 "LICENSE", "PDEPEND", "PROVIDE", "RDEPEND",
128 "repository", "RESTRICT" , "SLOT", "USE"])
129 - self._aux_cache = None
130 - self._aux_cache_version = "1"
131 + self._aux_cache_obj = None
132 self._aux_cache_filename = os.path.join(self.root,
133 CACHE_PATH.lstrip(os.path.sep), "vdb_metadata.pickle")
134 self._counter_path = os.path.join(root,
135 @@ -290,6 +292,7 @@
136 self.plib_registry = None
137
138 self.linkmap = LinkageMap(self)
139 + self._owners = self._owners_db(self)
140
141 def getpath(self, mykey, filename=None):
142 rValue = os.path.join(self.root, VDB_PATH, mykey)
143 @@ -562,6 +565,7 @@
144 if self._aux_cache is not None and \
145 len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
146 secpass >= 2:
147 + self._owners.populate() # index any unindexed contents
148 valid_nodes = set(self.cpv_all())
149 for cpv in self._aux_cache["packages"].keys():
150 if cpv not in valid_nodes:
151 @@ -577,6 +581,54 @@
152 pass
153 self._aux_cache["modified"] = set()
154
155 + @property
156 + def _aux_cache(self):
157 + if self._aux_cache_obj is None:
158 + self._aux_cache_init()
159 + return self._aux_cache_obj
160 +
161 + def _aux_cache_init(self):
162 + try:
163 + f = open(self._aux_cache_filename)
164 + mypickle = cPickle.Unpickler(f)
165 + mypickle.find_global = None
166 + aux_cache = mypickle.load()
167 + f.close()
168 + del f
169 + except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
170 + if isinstance(e, cPickle.UnpicklingError):
171 + writemsg("!!! Error loading '%s': %s\n" % \
172 + (self._aux_cache_filename, str(e)), noiselevel=-1)
173 + del e
174 +
175 + if not aux_cache or \
176 + not isinstance(aux_cache, dict) or \
177 + aux_cache.get("version") != self._aux_cache_version or \
178 + not aux_cache.get("packages"):
179 + aux_cache = {"version": self._aux_cache_version}
180 + aux_cache["packages"] = {}
181 +
182 + owners = aux_cache.get("owners")
183 + if owners is not None:
184 + if not isinstance(owners, dict):
185 + owners = None
186 + elif "version" not in owners:
187 + owners = None
188 + elif owners["version"] != self._owners_cache_version:
189 + owners = None
190 + elif "base_names" not in owners:
191 + owners = None
192 +
193 + if owners is None:
194 + owners = {
195 + "base_names" : {},
196 + "version" : self._owners_cache_version
197 + }
198 + aux_cache["owners"] = owners
199 +
200 + aux_cache["modified"] = set()
201 + self._aux_cache_obj = aux_cache
202 +
203 def aux_get(self, mycpv, wants):
204 """This automatically caches selected keys that are frequently needed
205 by emerge for dependency calculations. The cached metadata is
206 @@ -601,26 +653,6 @@
207 cache_these = set(self._aux_cache_keys)
208 cache_these.update(cache_these_wants)
209
210 - if self._aux_cache is None:
211 - try:
212 - f = open(self._aux_cache_filename)
213 - mypickle = cPickle.Unpickler(f)
214 - mypickle.find_global = None
215 - self._aux_cache = mypickle.load()
216 - f.close()
217 - del f
218 - except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
219 - if isinstance(e, cPickle.UnpicklingError):
220 - writemsg("!!! Error loading '%s': %s\n" % \
221 - (self._aux_cache_filename, str(e)), noiselevel=-1)
222 - del e
223 - if not self._aux_cache or \
224 - not isinstance(self._aux_cache, dict) or \
225 - self._aux_cache.get("version") != self._aux_cache_version or \
226 - not self._aux_cache.get("packages"):
227 - self._aux_cache = {"version": self._aux_cache_version}
228 - self._aux_cache["packages"] = {}
229 - self._aux_cache["modified"] = set()
230 mydir = self.getpath(mycpv)
231 mydir_stat = None
232 try:
233 @@ -801,6 +833,173 @@
234 write_atomic(self._counter_path, str(counter))
235 return counter
236
237 + def _dblink(self, cpv):
238 + category, pf = catsplit(cpv)
239 + return dblink(category, pf, self.root,
240 + self.settings, vartree=self.vartree)
241 +
242 + class _owners_cache(object):
243 + """
244 + This class maintains an hash table that serves to index package
245 + contents by mapping the basename of file to a list of possible
246 + packages that own it. This is used to optimize owner lookups
247 + by narrowing the search down to a smaller number of packages.
248 + """
249 + try:
250 + from hashlib import md5 as _new_hash
251 + except ImportError:
252 + from md5 import new as _new_hash
253 +
254 + _hash_bits = 16
255 + _hex_chars = _hash_bits / 4
256 +
257 + def __init__(self, vardb):
258 + self._vardb = vardb
259 +
260 + def add(self, cpv):
261 + root_len = len(self._vardb.root)
262 + contents = self._vardb._dblink(cpv).getcontents()
263 + pkg_hash = self._hash_pkg(cpv)
264 + if not contents:
265 + # Empty path is a code used to represent empty contents.
266 + self._add_path("", pkg_hash)
267 + for x in contents:
268 + relative_path = x[root_len:]
269 + self._add_path(x, pkg_hash)
270 + self._vardb._aux_cache["modified"].add(cpv)
271 +
272 + def _add_path(self, path, pkg_hash):
273 + """
274 + Empty path is a code that represents empty contents.
275 + """
276 + if path:
277 + name = os.path.basename(path.rstrip(os.path.sep))
278 + if not name:
279 + return
280 + else:
281 + name = path
282 + name_hash = self._hash_str(name)
283 + base_names = self._vardb._aux_cache["owners"]["base_names"]
284 + pkgs = base_names.get(name_hash)
285 + if pkgs is None:
286 + pkgs = {}
287 + base_names[name_hash] = pkgs
288 + pkgs[pkg_hash] = None
289 +
290 + def _hash_str(self, s):
291 + h = self._new_hash()
292 + h.update(s)
293 + h = h.hexdigest()
294 + h = h[-self._hex_chars:]
295 + h = int(h, 16)
296 + return h
297 +
298 + def _hash_pkg(self, cpv):
299 + counter, mtime = self._vardb.aux_get(
300 + cpv, ["COUNTER", "_mtime_"])
301 + try:
302 + counter = int(counter)
303 + except ValueError:
304 + counter = 0
305 + return (cpv, counter, mtime)
306 +
307 + class _owners_db(object):
308 +
309 + def __init__(self, vardb):
310 + self._vardb = vardb
311 +
312 + def populate(self):
313 + self._populate()
314 +
315 + def _populate(self):
316 + owners_cache = vardbapi._owners_cache(self._vardb)
317 + cached_hashes = set()
318 + base_names = self._vardb._aux_cache["owners"]["base_names"]
319 +
320 + # Take inventory of all cached package hashes.
321 + for hash_values in base_names.itervalues():
322 + cached_hashes.update(hash_values)
323 +
324 + # Create sets of valid package hashes and uncached packages.
325 + uncached_pkgs = set()
326 + hash_pkg = owners_cache._hash_pkg
327 + valid_pkg_hashes = set()
328 + for cpv in self._vardb.cpv_all():
329 + hash_value = hash_pkg(cpv)
330 + valid_pkg_hashes.add(hash_value)
331 + if hash_value not in cached_hashes:
332 + uncached_pkgs.add(cpv)
333 +
334 + # Cache any missing packages.
335 + for cpv in uncached_pkgs:
336 + owners_cache.add(cpv)
337 +
338 + # Delete any stale cache.
339 + stale_hashes = cached_hashes.difference(valid_pkg_hashes)
340 + if stale_hashes:
341 + for base_name_hash, bucket in base_names.items():
342 + for hash_value in stale_hashes.intersection(bucket):
343 + del bucket[hash_value]
344 + if not bucket:
345 + del base_names[base_name_hash]
346 +
347 + return owners_cache
348 +
349 + def get_owners(self, path_iter):
350 + """
351 + @return the owners as a dblink -> set(files) mapping.
352 + """
353 + owners = {}
354 + for owner, f in self.iter_owners(path_iter):
355 + owned_files = owners.get(owner)
356 + if owned_files is None:
357 + owned_files = set()
358 + owners[owner] = owned_files
359 + owned_files.add(f)
360 + return owners
361 +
362 + def iter_owners(self, path_iter):
363 + """
364 + Iterate over tuples of (dblink, path). In order to avoid
365 + consuming too many resources for too much time, resources
366 + are only allocated for the duration of a given iter_owners()
367 + call. Therefore, to maximize reuse of resources when searching
368 + for multiple files, it's best to search for them all in a single
369 + call.
370 + """
371 +
372 + owners_cache = self._populate()
373 +
374 + vardb = self._vardb
375 + root = vardb.root
376 + hash_pkg = owners_cache._hash_pkg
377 + hash_str = owners_cache._hash_str
378 + base_names = self._vardb._aux_cache["owners"]["base_names"]
379 +
380 + dblink_cache = {}
381 +
382 + def dblink(cpv):
383 + x = dblink_cache.get(cpv)
384 + if x is None:
385 + x = self._vardb._dblink(cpv)
386 + dblink_cache[cpv] = x
387 + return x
388 +
389 + for path in path_iter:
390 + name = os.path.basename(path.rstrip(os.path.sep))
391 + if not name:
392 + continue
393 +
394 + name_hash = hash_str(name)
395 + pkgs = base_names.get(name_hash)
396 + if pkgs is not None:
397 + for hash_value in pkgs:
398 + cpv, counter, mtime = hash_value
399 + if hash_pkg(cpv) != hash_value:
400 + continue
401 + if dblink(cpv).isowner(path, root):
402 + yield dblink(cpv), path
403 +
404 class vartree(object):
405 "this tree will scan a var/db/pkg database located at root (passed to init)"
406 def __init__(self, root="/", virtual=None, clone=None, categories=None,
407 @@ -2202,36 +2401,30 @@
408
409 eerror(msg)
410
411 - if collision_protect:
412 + msg = []
413 + msg.append("")
414 + msg.append("Searching all installed" + \
415 + " packages for file collisions...")
416 + msg.append("")
417 + msg.append("Press Ctrl-C to Stop")
418 + msg.append("")
419 + eerror(msg)
420 +
421 + owners = self.vartree.dbapi._owners.get_owners(files)
422 + self.vartree.dbapi.flush_cache()
423 +
424 + for pkg, owned_files in owners.iteritems():
425 + cpv = pkg.mycpv
426 msg = []
427 - msg.append("")
428 - msg.append("Searching all installed" + \
429 - " packages for file collisions...")
430 - msg.append("")
431 - msg.append("Press Ctrl-C to Stop")
432 - msg.append("")
433 + msg.append("%s" % cpv)
434 + for f in sorted(owned_files):
435 + msg.append("\t%s" % os.path.join(destroot,
436 + f.lstrip(os.path.sep)))
437 eerror(msg)
438 -
439 - found_owner = False
440 - for cpv in self.vartree.dbapi.cpv_all():
441 - cat, pkg = catsplit(cpv)
442 - mylink = dblink(cat, pkg, destroot, self.settings,
443 - vartree=self.vartree)
444 - mycollisions = []
445 - for f in collisions:
446 - if mylink.isowner(f, destroot):
447 - mycollisions.append(f)
448 - if mycollisions:
449 - found_owner = True
450 - msg = []
451 - msg.append("%s" % cpv)
452 - for f in mycollisions:
453 - msg.append("\t%s" % os.path.join(destroot,
454 - f.lstrip(os.path.sep)))
455 - eerror(msg)
456 - if not found_owner:
457 - eerror(["None of the installed" + \
458 - " packages claim the file(s)."])
459 + if not owners:
460 + eerror(["None of the installed" + \
461 + " packages claim the file(s)."])
462 + if collision_protect:
463 return 1
464
465 writemsg_stdout(">>> Merging %s to %s\n" % (self.mycpv, destroot))
466
467 --
468 gentoo-commits@l.g.o mailing list