Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Subject: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
Date: Sun, 19 Oct 2014 21:51:13
Message-Id: 54443249.1090901@gentoo.org
In Reply to: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index by Zac Medico
1 This updated patch changes the index format to use spaces instead of
2 commas, for readability. This is the example given in man/portage.5:
3
4 sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
5 sys-apps/usleep 0.1: A wrapper for usleep
6
7 Hopefully that's easier on the eyes (thanks to Michał Górny for the
8 suggestion).
9
10 Also, Michał has brought it to my attention that git will send the
11 whole file instead of the delta, unless an expensive `git repack`
12 operation is performed. Maybe it's possible to repack the user.git
13 each time the index is generated? Currently, the master rsync mirror
14 runs egencache every 30 minutes. If user.git syncs at the same
15 interval, it would need to be repacked at the same interval.
16
17 Anyway, it would be nice to merge this patch, even if we don't have
18 the resources now to generate the index for gentoo on the server side.
19 We could follow up this patch later with a post emerge --sync hook
20 for client-side index generation.
21
22
23 From 5192579f79da36f5a1ce5f3651c26ccb235cca28 Mon Sep 17 00:00:00 2001
24 From: Zac Medico <zmedico@g.o>
25 Date: Fri, 17 Oct 2014 17:38:59 -0700
26 Subject: [PATCH] emerge --search: use description index
27
28 This adds an egencache --update-pkg-desc-index action which generates
29 a plain-text index of package names, versions, and descriptions. The
30 index can then be used to optimize emerge --search / --searchdesc
31 actions. If the package description index is missing from a particular
32 repository, then all metadata for that repository is obtained using the
33 normal portdbapi.aux_get method.
34
35 Searching of installed packages is optimized to take advantage of
36 vardbapi._aux_cache, which is backed by vdb_metadata.pickle.
37 See the IndexedVardb docstring for some more details.
38
39 For users that would like to modify ebuilds in a repository without
40 running egencache afterwards, the new emerge --search-index < y | n >
41 option can be used to get non-indexed search. Alternatively, the user
42 could simply remove the stale index file, in order to disable the
43 search index for a particular repository.
44
45 X-Gentoo-Bug: 525718
46 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
47 ---
48 bin/egencache | 39 +++++++++-
49 man/egencache.1 | 4 +
50 man/emerge.1 | 8 ++
51 man/portage.5 | 12 +++
52 pym/_emerge/actions.py | 3 +-
53 pym/_emerge/main.py | 5 ++
54 pym/_emerge/search.py | 207 +++++++++++++++++++++++++++++++++++++++++++++----
55 7 files changed, 261 insertions(+), 17 deletions(-)
56
57 diff --git a/bin/egencache b/bin/egencache
58 index e366058..95cb1ad 100755
59 --- a/bin/egencache
60 +++ b/bin/egencache
61 @@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
62 from portage.util._eventloop.global_event_loop import global_event_loop
63 from portage import cpv_getkey
64 from portage.dep import Atom, isjustname
65 -from portage.versions import pkgsplit, vercmp
66 +from portage.versions import pkgsplit, vercmp, _pkg_str
67
68 try:
69 from xml.etree import ElementTree
70 @@ -91,6 +91,9 @@ def parse_args(args):
71 actions.add_argument("--update-changelogs",
72 action="store_true",
73 help="update the ChangeLog files from SCM logs")
74 + actions.add_argument("--update-pkg-desc-index",
75 + action="store_true",
76 + help="update package description index")
77 actions.add_argument("--update-manifests",
78 action="store_true",
79 help="update manifests")
80 @@ -451,6 +454,31 @@ class GenCache(object):
81 if hasattr(trg_cache, '_prune_empty_dirs'):
82 trg_cache._prune_empty_dirs()
83
84 +class GenPkgDescIndex(object):
85 + def __init__(self, portdb, output_file):
86 + self.returncode = os.EX_OK
87 + self._portdb = portdb
88 + self._output_file = output_file
89 +
90 + def run(self):
91 +
92 + portage.util.ensure_dirs(os.path.dirname(self._output_file))
93 + f = portage.util.atomic_ofstream(self._output_file,
94 + encoding=_encodings["repo.content"])
95 +
96 + portdb = self._portdb
97 + for cp in portdb.cp_all():
98 + pkgs = portdb.cp_list(cp)
99 + if not pkgs:
100 + continue
101 + desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
102 +
103 + f.write("%s %s: %s\n" % (cp,
104 + " ".join(_pkg_str(cpv).version
105 + for cpv in pkgs), desc))
106 +
107 + f.close()
108 +
109 class GenUseLocalDesc(object):
110 def __init__(self, portdb, output=None,
111 preserve_comments=False):
112 @@ -893,7 +921,8 @@ def egencache_main(args):
113 local_config=False, env=env)
114
115 if not (options.update or options.update_use_local_desc or
116 - options.update_changelogs or options.update_manifests):
117 + options.update_changelogs or options.update_manifests or
118 + options.update_pkg_desc_index):
119 parser.error('No action specified')
120 return 1
121
122 @@ -1057,6 +1086,12 @@ def egencache_main(args):
123 else:
124 ret.append(scheduler.returncode)
125
126 + if options.update_pkg_desc_index:
127 + gen_index = GenPkgDescIndex(portdb, os.path.join(
128 + repo_config.location, "metadata", "pkg_desc_index"))
129 + gen_index.run()
130 + ret.append(gen_index.returncode)
131 +
132 if options.update_use_local_desc:
133 gen_desc = GenUseLocalDesc(portdb,
134 output=options.uld_output,
135 diff --git a/man/egencache.1 b/man/egencache.1
136 index f71feb3..3a3197f 100644
137 --- a/man/egencache.1
138 +++ b/man/egencache.1
139 @@ -19,6 +19,10 @@ for the details on package atom syntax.
140 .BR "\-\-update\-changelogs"
141 Update the ChangeLog files from SCM logs (supported only in git repos).
142 .TP
143 +.BR "\-\-update\-pkg\-desc\-index"
144 +Update the package description index which is located at
145 +\fImetadata/pkg_desc_index\fR in the repository.
146 +.TP
147 .BR "\-\-update\-use\-local\-desc"
148 Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
149 .TP
150 diff --git a/man/emerge.1 b/man/emerge.1
151 index 2264b58..efd5d41 100644
152 --- a/man/emerge.1
153 +++ b/man/emerge.1
154 @@ -790,6 +790,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
155 the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
156 \fBHDEPEND\fR, this option affects only the former.
157 .TP
158 +.BR "\-\-search\-index < y | n >"
159 +Enable or disable indexed search for search actions. This option is
160 +enabled by default. The search index needs to be regenerated by
161 +\fBegencache\fR(1) after changes are made to a repository (see the
162 +\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
163 +to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
164 +overridden via the command line.
165 +.TP
166 .BR "\-\-select [ y | n ] (\-w short option)"
167 Add specified packages to the world set (inverse of
168 \fB\-\-oneshot\fR). This is useful if you want to
169 diff --git a/man/portage.5 b/man/portage.5
170 index e399f0f..bf9457c 100644
171 --- a/man/portage.5
172 +++ b/man/portage.5
173 @@ -75,6 +75,7 @@ user\-defined package sets
174 .BR /usr/portage/metadata/
175 .nf
176 layout.conf
177 +pkg_desc_index
178 .fi
179 .TP
180 .BR /usr/portage/profiles/
181 @@ -1110,6 +1111,17 @@ cache\-formats = md5-dict pms
182 profile\-formats = portage-2
183 .fi
184 .RE
185 +.TP
186 +.BR pkg_desc_index
187 +This is an index of package names, versions, and descriptions which
188 +may be generated by \fBegencache\fR(1) in order to optimize
189 +\fBemerge\fR(1) search actions.
190 +
191 +.I Example:
192 +.nf
193 +sys-apps/sed 4.2 4.2.1 4.2.1-r1 4.2.2: Super-useful stream editor
194 +sys-apps/usleep 0.1: A wrapper for usleep
195 +.fi
196 .RE
197 .TP
198 .BR /usr/portage/profiles/
199 diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
200 index 4e8b83b..a81212c 100644
201 --- a/pym/_emerge/actions.py
202 +++ b/pym/_emerge/actions.py
203 @@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
204 searchinstance = search(root_config,
205 spinner, "--searchdesc" in myopts,
206 "--quiet" not in myopts, "--usepkg" in myopts,
207 - "--usepkgonly" in myopts)
208 + "--usepkgonly" in myopts,
209 + search_index = myopts.get("--search-index", "y") != "n")
210 for mysearch in myfiles:
211 try:
212 searchinstance.execute(mysearch)
213 diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
214 index 3883f72..d403b36 100644
215 --- a/pym/_emerge/main.py
216 +++ b/pym/_emerge/main.py
217 @@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
218 "choices" :("True", "rdeps")
219 },
220
221 + "--search-index": {
222 + "help": "Enable or disable indexed search (enabled by default)",
223 + "choices": y_or_n
224 + },
225 +
226 "--select": {
227 "shortopt" : "-w",
228 "help" : "add specified packages to the world set " + \
229 diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
230 index 4b0fd9f..37fee20 100644
231 --- a/pym/_emerge/search.py
232 +++ b/pym/_emerge/search.py
233 @@ -3,13 +3,17 @@
234
235 from __future__ import print_function
236
237 +import io
238 import re
239 import portage
240 -from portage import os
241 +from portage import os, _encodings
242 from portage.dbapi.porttree import _parse_uri_map
243 +from portage.dep import Atom
244 +from portage.exception import InvalidAtom, InvalidData
245 from portage.localization import localized_size
246 from portage.output import bold, bold as white, darkgreen, green, red
247 from portage.util import writemsg_stdout
248 +from portage.versions import _pkg_str
249
250 from _emerge.Package import Package
251
252 @@ -25,12 +29,11 @@ class search(object):
253 # public interface
254 #
255 def __init__(self, root_config, spinner, searchdesc,
256 - verbose, usepkg, usepkgonly):
257 + verbose, usepkg, usepkgonly, search_index = True):
258 """Searches the available and installed packages for the supplied search key.
259 The list of available and installed packages is created at object instantiation.
260 This makes successive searches faster."""
261 self.settings = root_config.settings
262 - self.vartree = root_config.trees["vartree"]
263 self.spinner = spinner
264 self.verbose = verbose
265 self.searchdesc = searchdesc
266 @@ -45,6 +48,10 @@ class search(object):
267 bindb = root_config.trees["bintree"].dbapi
268 vardb = root_config.trees["vartree"].dbapi
269
270 + if search_index:
271 + portdb = IndexedPortdb(portdb)
272 + vardb = IndexedVardb(vardb)
273 +
274 if not usepkgonly and portdb._have_root_eclass_dir:
275 self._dbs.append(portdb)
276
277 @@ -53,6 +60,7 @@ class search(object):
278
279 self._dbs.append(vardb)
280 self._portdb = portdb
281 + self._vardb = vardb
282
283 def _spinner_update(self):
284 if self.spinner:
285 @@ -97,7 +105,7 @@ class search(object):
286 return {}
287
288 def _visible(self, db, cpv, metadata):
289 - installed = db is self.vartree.dbapi
290 + installed = db is self._vardb
291 built = installed or db is not self._portdb
292 pkg_type = "ebuild"
293 if installed:
294 @@ -208,6 +216,22 @@ class search(object):
295 masked=1
296 self.matches["pkg"].append([package,masked])
297 elif self.searchdesc: # DESCRIPTION searching
298 + # Check for DESCRIPTION match first, so that we can skip
299 + # the expensive visiblity check if it doesn't match.
300 + full_package = self._xmatch("match-all", package)
301 + if not full_package:
302 + continue
303 + full_package = full_package[-1]
304 + try:
305 + full_desc = self._aux_get(
306 + full_package, ["DESCRIPTION"])[0]
307 + except KeyError:
308 + portage.writemsg(
309 + "emerge: search: aux_get() failed, skipping\n",
310 + noiselevel=-1)
311 + continue
312 + if not self.searchre.search(full_desc):
313 + continue
314 full_package = self._xmatch("bestmatch-visible", package)
315 if not full_package:
316 #no match found; we don't want to query description
317 @@ -217,14 +241,8 @@ class search(object):
318 continue
319 else:
320 masked=1
321 - try:
322 - full_desc = self._aux_get(
323 - full_package, ["DESCRIPTION"])[0]
324 - except KeyError:
325 - print("emerge: search: aux_get() failed, skipping")
326 - continue
327 - if self.searchre.search(full_desc):
328 - self.matches["desc"].append([full_package,masked])
329 +
330 + self.matches["desc"].append((full_package, masked))
331
332 self.sdict = self.setconfig.getSets()
333 for setname in self.sdict:
334 @@ -262,7 +280,7 @@ class search(object):
335 bold(self.searchkey) + " ]\n")
336 msg.append("[ Applications found : " + \
337 bold(str(self.mlen)) + " ]\n\n")
338 - vardb = self.vartree.dbapi
339 + vardb = self._vardb
340 metadata_keys = set(Package.metadata_keys)
341 metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
342 metadata_keys = tuple(metadata_keys)
343 @@ -372,7 +390,11 @@ class search(object):
344 # private interface
345 #
346 def getInstallationStatus(self,package):
347 - installed_package = self.vartree.dep_bestmatch(package)
348 + installed_package = self._vardb.match(package)
349 + if installed_package:
350 + installed_package = installed_package[-1]
351 + else:
352 + installed_package = ""
353 result = ""
354 version = self.getVersion(installed_package,search.VERSION_RELEASE)
355 if len(version) > 0:
356 @@ -392,3 +414,160 @@ class search(object):
357 result = ""
358 return result
359
360 +
361 +class IndexedPortdb(object):
362 + """
363 + A portdbapi interface that uses a package description index to
364 + improve performance. If the description index is missing for a
365 + particular repository, then all metadata for that repository is
366 + obtained using the normal pordbapi.aux_get method.
367 + """
368 + def __init__(self, portdb):
369 + self._portdb = portdb
370 + self.cpv_exists = portdb.cpv_exists
371 + self.getFetchMap = portdb.getFetchMap
372 + self.findname = portdb.findname
373 + self._aux_cache_keys = portdb._aux_cache_keys
374 + self._have_root_eclass_dir = portdb._have_root_eclass_dir
375 + self._cpv_sort_ascending = portdb._cpv_sort_ascending
376 + self._desc_cache = None
377 + self._cp_map = None
378 +
379 + def _init_index(self):
380 + cp_map = {}
381 + desc_cache = {}
382 + for repo_path in self._portdb.porttrees:
383 + outside_repo = os.path.join(self._portdb.depcachedir,
384 + repo_path.lstrip(os.sep))
385 + for parent_dir in (repo_path, outside_repo):
386 + file_path = os.path.join(parent_dir,
387 + "metadata", "pkg_desc_index")
388 +
389 + try:
390 + with io.open(file_path,
391 + encoding=_encodings["repo.content"]) as f:
392 + for line in f:
393 + try:
394 + pkgs, desc = line.split(":", 1)
395 + except ValueError:
396 + continue
397 + desc = desc.strip()
398 + try:
399 + cp, pkgs = pkgs.split(" ", 1)
400 + except ValueError:
401 + continue
402 + if not cp:
403 + continue
404 + try:
405 + atom = Atom(cp)
406 + except InvalidAtom:
407 + continue
408 + if cp != atom.cp:
409 + continue
410 + cp_list = cp_map.get(cp)
411 + if cp_list is None:
412 + cp_list = []
413 + cp_map[cp] = cp_list
414 + for ver in pkgs.split():
415 + try:
416 + cpv = _pkg_str(cp + "-" + ver)
417 + except InvalidData:
418 + pass
419 + else:
420 + cp_list.append(cpv)
421 + desc_cache[cpv] = desc
422 + except IOError:
423 + pass
424 + else:
425 + break
426 + else:
427 + # No descriptions index was found, so populate
428 + # cp_map the slow way.
429 + for cp in self._portdb.cp_all(trees=[repo_path]):
430 + cp_list = cp_map.get(cp)
431 + if cp_list is None:
432 + cp_list = []
433 + cp_map[cp] = cp_list
434 + for cpv in self._portdb.cp_list(cp, mytree=repo_path):
435 + if cpv not in cp_list:
436 + cp_list.append(_pkg_str(cpv))
437 +
438 + self._desc_cache = desc_cache
439 + self._cp_map = cp_map
440 +
441 + def cp_all(self):
442 + if self._cp_map is None:
443 + self._init_index()
444 + return list(self._cp_map)
445 +
446 + def match(self, atom):
447 + if not isinstance(atom, Atom):
448 + atom = Atom(atom)
449 + cp_list = self._cp_map.get(atom.cp)
450 + if cp_list is None:
451 + return []
452 + self._portdb._cpv_sort_ascending(cp_list)
453 + return portage.match_from_list(atom, cp_list)
454 +
455 + def aux_get(self, cpv, attrs, myrepo = None):
456 + if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
457 + try:
458 + return [self._desc_cache[cpv]]
459 + except KeyError:
460 + pass
461 + return self._portdb.aux_get(cpv, attrs)
462 +
463 +
464 +class IndexedVardb(object):
465 + """
466 + A vardbapi interface that sacrifices validation in order to
467 + improve performance. It takes advantage of vardbdbapi._aux_cache,
468 + which is backed by vdb_metadata.pickle. Since _aux_cache is
469 + not updated for every single merge/unmerge (see
470 + _aux_cache_threshold), the list of packages is obtained directly
471 + from the real vardbapi instance. If a package is missing from
472 + _aux_cache, then its metadata is obtained using the normal
473 + (validated) vardbapi.aux_get method.
474 + """
475 + def __init__(self, vardb):
476 + self._vardb = vardb
477 + self._aux_cache_keys = vardb._aux_cache_keys
478 + self._cpv_sort_ascending = vardb._cpv_sort_ascending
479 + self._cp_map = {}
480 + self.cpv_exists = vardb.cpv_exists
481 +
482 + def cp_all(self):
483 + if self._cp_map:
484 + return list(self._cp_map)
485 + cp_map = self._cp_map
486 + for cpv in self._vardb.cpv_all():
487 + cp = portage.cpv_getkey(cpv)
488 + if cp is not None:
489 + cp_list = cp_map.get(cp)
490 + if cp_list is None:
491 + cp_list = []
492 + cp_map[cp] = cp_list
493 + cp_list.append(_pkg_str(cpv))
494 + return list(cp_map)
495 +
496 + def match(self, atom):
497 + if not isinstance(atom, Atom):
498 + atom = Atom(atom)
499 + cp_list = self._cp_map.get(atom.cp)
500 + if cp_list is None:
501 + return []
502 + self._vardb._cpv_sort_ascending(cp_list)
503 + return portage.match_from_list(atom, cp_list)
504 +
505 + def aux_get(self, cpv, attrs, myrepo = None):
506 + pkg_data = self._vardb._aux_cache["packages"].get(cpv)
507 + if not isinstance(pkg_data, tuple) or \
508 + len(pkg_data) != 2 or \
509 + not isinstance(pkg_data[1], dict):
510 + pkg_data = None
511 + if pkg_data is None:
512 + # It may be missing from _aux_cache due to
513 + # _aux_cache_threshold.
514 + return self._vardb.aux_get(cpv, attrs)
515 + metadata = pkg_data[1]
516 + return [metadata.get(k, "") for k in attrs]
517 --
518 2.0.4

Replies