Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Subject: [gentoo-portage-dev] Re: [PATCH] emerge --search: use description index
Date: Sat, 18 Oct 2014 05:59:41
Message-Id: 544201C4.5050106@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH] emerge --search: use description index by Zac Medico
1 This updated patch adds --search-index < y | n >. I'll be maintaining
2 this patch in the following branch:
3
4 https://github.com/zmedico/portage/tree/bug_525718
5
6 From 2aca92f664fd2ff669b77b38a49b06fafbc66b8d Mon Sep 17 00:00:00 2001
7 From: Zac Medico <zmedico@g.o>
8 Date: Fri, 17 Oct 2014 17:38:59 -0700
9 Subject: [PATCH] emerge --search: use description index
10
11 This adds an egencache --update-pkg-desc-index action which generates
12 a plain-text index of package names, versions, and descriptions. The
13 index can then be used to optimize emerge --search / --searchdesc
14 actions. If the package description index is missing from a particular
15 repository, then all metadata for that repository is obtained using the
16 normal portdbapi.aux_get method.
17
18 Searching of installed packages is optimized to take advantage of
19 vardbapi._aux_cache, which is backed by vardb_metadata.pickle.
20 See the IndexedVardb docstring for some more details.
21
22 For users that would like to modify ebuilds in a repository without
23 running egencache afterwards, the new emerge --search-index < y | n >
24 option can be used to get non-indexed search. Alternatively, the user
25 could simply remove the stale index file, in order to disable the
26 search index for a particular repository.
27
28 X-Gentoo-Bug: 525718
29 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
30 ---
31 bin/egencache | 43 ++++++++++-
32 man/egencache.1 | 4 +
33 man/emerge.1 | 8 ++
34 man/portage.5 | 6 ++
35 pym/_emerge/actions.py | 3 +-
36 pym/_emerge/main.py | 5 ++
37 pym/_emerge/search.py | 198 +++++++++++++++++++++++++++++++++++++++++++++----
38 7 files changed, 250 insertions(+), 17 deletions(-)
39
40 diff --git a/bin/egencache b/bin/egencache
41 index e366058..90d5e68 100755
42 --- a/bin/egencache
43 +++ b/bin/egencache
44 @@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
45 from portage.util._eventloop.global_event_loop import global_event_loop
46 from portage import cpv_getkey
47 from portage.dep import Atom, isjustname
48 -from portage.versions import pkgsplit, vercmp
49 +from portage.versions import pkgsplit, vercmp, _pkg_str
50
51 try:
52 from xml.etree import ElementTree
53 @@ -91,6 +91,9 @@ def parse_args(args):
54 actions.add_argument("--update-changelogs",
55 action="store_true",
56 help="update the ChangeLog files from SCM logs")
57 + actions.add_argument("--update-pkg-desc-index",
58 + action="store_true",
59 + help="update package description index")
60 actions.add_argument("--update-manifests",
61 action="store_true",
62 help="update manifests")
63 @@ -451,6 +454,35 @@ class GenCache(object):
64 if hasattr(trg_cache, '_prune_empty_dirs'):
65 trg_cache._prune_empty_dirs()
66
67 +class GenPkgDescIndex(object):
68 + def __init__(self, portdb, output_file):
69 + self.returncode = os.EX_OK
70 + self._portdb = portdb
71 + self._output_file = output_file
72 +
73 + def run(self):
74 +
75 + portage.util.ensure_dirs(os.path.dirname(self._output_file))
76 + f = portage.util.atomic_ofstream(self._output_file,
77 + encoding=_encodings["repo.content"])
78 +
79 + portdb = self._portdb
80 + for cp in portdb.cp_all():
81 + pkgs = portdb.cp_list(cp)
82 + if not pkgs:
83 + continue
84 + desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
85 +
86 + if len(pkgs) == 1:
87 + output = "%s: %s\n" % (pkgs[0], desc)
88 + else:
89 + output = "%s,%s: %s\n" % (pkgs[0],
90 + ",".join(_pkg_str(cpv).version
91 + for cpv in pkgs[1:]), desc)
92 + f.write(output)
93 +
94 + f.close()
95 +
96 class GenUseLocalDesc(object):
97 def __init__(self, portdb, output=None,
98 preserve_comments=False):
99 @@ -893,7 +925,8 @@ def egencache_main(args):
100 local_config=False, env=env)
101
102 if not (options.update or options.update_use_local_desc or
103 - options.update_changelogs or options.update_manifests):
104 + options.update_changelogs or options.update_manifests or
105 + options.update_pkg_desc_index):
106 parser.error('No action specified')
107 return 1
108
109 @@ -1057,6 +1090,12 @@ def egencache_main(args):
110 else:
111 ret.append(scheduler.returncode)
112
113 + if options.update_pkg_desc_index:
114 + gen_index = GenPkgDescIndex(portdb, os.path.join(
115 + repo_config.location, "metadata", "pkg_desc_index"))
116 + gen_index.run()
117 + ret.append(gen_index.returncode)
118 +
119 if options.update_use_local_desc:
120 gen_desc = GenUseLocalDesc(portdb,
121 output=options.uld_output,
122 diff --git a/man/egencache.1 b/man/egencache.1
123 index f71feb3..3a3197f 100644
124 --- a/man/egencache.1
125 +++ b/man/egencache.1
126 @@ -19,6 +19,10 @@ for the details on package atom syntax.
127 .BR "\-\-update\-changelogs"
128 Update the ChangeLog files from SCM logs (supported only in git repos).
129 .TP
130 +.BR "\-\-update\-pkg\-desc\-index"
131 +Update the package description index which is located at
132 +\fImetadata/pkg_desc_index\fR in the repository.
133 +.TP
134 .BR "\-\-update\-use\-local\-desc"
135 Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
136 .TP
137 diff --git a/man/emerge.1 b/man/emerge.1
138 index 2264b58..efd5d41 100644
139 --- a/man/emerge.1
140 +++ b/man/emerge.1
141 @@ -790,6 +790,14 @@ If ebuilds using EAPIs which \fIdo not\fR support \fBHDEPEND\fR are built in
142 the same \fBemerge\fR run as those using EAPIs which \fIdo\fR support
143 \fBHDEPEND\fR, this option affects only the former.
144 .TP
145 +.BR "\-\-search\-index < y | n >"
146 +Enable or disable indexed search for search actions. This option is
147 +enabled by default. The search index needs to be regenerated by
148 +\fBegencache\fR(1) after changes are made to a repository (see the
149 +\fB\-\-update\-pkg\-desc\-index\fR action). This setting can be added
150 +to \fBEMERGE_DEFAULT_OPTS\fR (see \fBmake.conf\fR(5)) and later
151 +overridden via the command line.
152 +.TP
153 .BR "\-\-select [ y | n ] (\-w short option)"
154 Add specified packages to the world set (inverse of
155 \fB\-\-oneshot\fR). This is useful if you want to
156 diff --git a/man/portage.5 b/man/portage.5
157 index e399f0f..26856d1 100644
158 --- a/man/portage.5
159 +++ b/man/portage.5
160 @@ -75,6 +75,7 @@ user\-defined package sets
161 .BR /usr/portage/metadata/
162 .nf
163 layout.conf
164 +pkg_desc_index
165 .fi
166 .TP
167 .BR /usr/portage/profiles/
168 @@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms
169 profile\-formats = portage-2
170 .fi
171 .RE
172 +.TP
173 +.BR pkg_desc_index
174 +This is an index of packages and descriptions which may be generated
175 +by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search
176 +actions.
177 .RE
178 .TP
179 .BR /usr/portage/profiles/
180 diff --git a/pym/_emerge/actions.py b/pym/_emerge/actions.py
181 index 4e8b83b..a81212c 100644
182 --- a/pym/_emerge/actions.py
183 +++ b/pym/_emerge/actions.py
184 @@ -2015,7 +2015,8 @@ def action_search(root_config, myopts, myfiles, spinner):
185 searchinstance = search(root_config,
186 spinner, "--searchdesc" in myopts,
187 "--quiet" not in myopts, "--usepkg" in myopts,
188 - "--usepkgonly" in myopts)
189 + "--usepkgonly" in myopts,
190 + search_index = myopts.get("--search-index", "y") != "n")
191 for mysearch in myfiles:
192 try:
193 searchinstance.execute(mysearch)
194 diff --git a/pym/_emerge/main.py b/pym/_emerge/main.py
195 index 3883f72..d403b36 100644
196 --- a/pym/_emerge/main.py
197 +++ b/pym/_emerge/main.py
198 @@ -616,6 +616,11 @@ def parse_opts(tmpcmdline, silent=False):
199 "choices" :("True", "rdeps")
200 },
201
202 + "--search-index": {
203 + "help": "Enable or disable indexed search (enabled by default)",
204 + "choices": y_or_n
205 + },
206 +
207 "--select": {
208 "shortopt" : "-w",
209 "help" : "add specified packages to the world set " + \
210 diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
211 index 4b0fd9f..007abf2 100644
212 --- a/pym/_emerge/search.py
213 +++ b/pym/_emerge/search.py
214 @@ -3,13 +3,17 @@
215
216 from __future__ import print_function
217
218 +import io
219 import re
220 import portage
221 -from portage import os
222 +from portage import os, _encodings
223 from portage.dbapi.porttree import _parse_uri_map
224 +from portage.dep import Atom
225 +from portage.exception import InvalidData
226 from portage.localization import localized_size
227 from portage.output import bold, bold as white, darkgreen, green, red
228 from portage.util import writemsg_stdout
229 +from portage.versions import _pkg_str
230
231 from _emerge.Package import Package
232
233 @@ -25,12 +29,11 @@ class search(object):
234 # public interface
235 #
236 def __init__(self, root_config, spinner, searchdesc,
237 - verbose, usepkg, usepkgonly):
238 + verbose, usepkg, usepkgonly, search_index=True):
239 """Searches the available and installed packages for the supplied search key.
240 The list of available and installed packages is created at object instantiation.
241 This makes successive searches faster."""
242 self.settings = root_config.settings
243 - self.vartree = root_config.trees["vartree"]
244 self.spinner = spinner
245 self.verbose = verbose
246 self.searchdesc = searchdesc
247 @@ -45,6 +48,10 @@ class search(object):
248 bindb = root_config.trees["bintree"].dbapi
249 vardb = root_config.trees["vartree"].dbapi
250
251 + if search_index:
252 + portdb = IndexedPortdb(portdb)
253 + vardb = IndexedVardb(vardb)
254 +
255 if not usepkgonly and portdb._have_root_eclass_dir:
256 self._dbs.append(portdb)
257
258 @@ -53,6 +60,7 @@ class search(object):
259
260 self._dbs.append(vardb)
261 self._portdb = portdb
262 + self._vardb = vardb
263
264 def _spinner_update(self):
265 if self.spinner:
266 @@ -97,7 +105,7 @@ class search(object):
267 return {}
268
269 def _visible(self, db, cpv, metadata):
270 - installed = db is self.vartree.dbapi
271 + installed = db is self._vardb
272 built = installed or db is not self._portdb
273 pkg_type = "ebuild"
274 if installed:
275 @@ -208,6 +216,20 @@ class search(object):
276 masked=1
277 self.matches["pkg"].append([package,masked])
278 elif self.searchdesc: # DESCRIPTION searching
279 + # Check for DESCRIPTION match first, so that we can skip
280 + # the expensive visiblity check if it doesn't match.
281 + full_package = portage.best(
282 + self._xmatch("match-all", package))
283 + try:
284 + full_desc = self._aux_get(
285 + full_package, ["DESCRIPTION"])[0]
286 + except KeyError:
287 + portage.writemsg(
288 + "emerge: search: aux_get() failed, skipping\n",
289 + noiselevel=-1)
290 + continue
291 + if not self.searchre.search(full_desc):
292 + continue
293 full_package = self._xmatch("bestmatch-visible", package)
294 if not full_package:
295 #no match found; we don't want to query description
296 @@ -217,14 +239,8 @@ class search(object):
297 continue
298 else:
299 masked=1
300 - try:
301 - full_desc = self._aux_get(
302 - full_package, ["DESCRIPTION"])[0]
303 - except KeyError:
304 - print("emerge: search: aux_get() failed, skipping")
305 - continue
306 - if self.searchre.search(full_desc):
307 - self.matches["desc"].append([full_package,masked])
308 +
309 + self.matches["desc"].append((full_package, masked))
310
311 self.sdict = self.setconfig.getSets()
312 for setname in self.sdict:
313 @@ -262,7 +278,7 @@ class search(object):
314 bold(self.searchkey) + " ]\n")
315 msg.append("[ Applications found : " + \
316 bold(str(self.mlen)) + " ]\n\n")
317 - vardb = self.vartree.dbapi
318 + vardb = self._vardb
319 metadata_keys = set(Package.metadata_keys)
320 metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
321 metadata_keys = tuple(metadata_keys)
322 @@ -372,7 +388,11 @@ class search(object):
323 # private interface
324 #
325 def getInstallationStatus(self,package):
326 - installed_package = self.vartree.dep_bestmatch(package)
327 + installed_package = self._vardb.match(package)
328 + if installed_package:
329 + installed_package = installed_package[-1]
330 + else:
331 + installed_package = ""
332 result = ""
333 version = self.getVersion(installed_package,search.VERSION_RELEASE)
334 if len(version) > 0:
335 @@ -392,3 +412,153 @@ class search(object):
336 result = ""
337 return result
338
339 +
340 +class IndexedPortdb(object):
341 + """
342 + A portdbapi interface that uses a package description index to
343 + improve performance. If the description index is missing for a
344 + particular repository, then all metadata for that repository is
345 + obtained using the normal pordbapi.aux_get method.
346 + """
347 + def __init__(self, portdb):
348 + self._portdb = portdb
349 + self.cpv_exists = portdb.cpv_exists
350 + self.getFetchMap = portdb.getFetchMap
351 + self.findname = portdb.findname
352 + self._aux_cache_keys = portdb._aux_cache_keys
353 + self._have_root_eclass_dir = portdb._have_root_eclass_dir
354 + self._cpv_sort_ascending = portdb._cpv_sort_ascending
355 + self._desc_cache = None
356 + self._cp_map = None
357 +
358 + def _init_index(self):
359 + cp_map = {}
360 + desc_cache = {}
361 + for repo_path in self._portdb.porttrees:
362 + outside_repo = os.path.join(self._portdb.depcachedir,
363 + repo_path.lstrip(os.sep))
364 + for parent_dir in (repo_path, outside_repo):
365 + file_path = os.path.join(parent_dir,
366 + "metadata", "pkg_desc_index")
367 +
368 + try:
369 + with io.open(file_path,
370 + encoding=_encodings["repo.content"]) as f:
371 + for line in f:
372 + pkgs, desc = line.split(":", 1)
373 + desc = desc.strip()
374 + pkgs = pkgs.split(",")
375 + if not pkgs[0]:
376 + continue
377 + try:
378 + pkg = _pkg_str(pkgs[0])
379 + except InvalidData:
380 + continue
381 + cp_list = cp_map.get(pkg.cp)
382 + if cp_list is None:
383 + cp_list = []
384 + cp_map[pkg.cp] = cp_list
385 + cp_list.append(pkg)
386 + for ver in pkgs[1:]:
387 + try:
388 + cp_list.append(
389 + _pkg_str(pkg.cp + "-" + ver))
390 + except InvalidData:
391 + pass
392 + for cpv in cp_list:
393 + desc_cache[cpv] = desc
394 + except IOError:
395 + pass
396 + else:
397 + break
398 + else:
399 + # No descriptions index was found, so populate
400 + # cp_map the slow way.
401 + for cp in self._portdb.cp_all(trees=[repo_path]):
402 + cp_list = cp_map.get(cp)
403 + if cp_list is None:
404 + cp_list = []
405 + cp_map[cp] = cp_list
406 + for cpv in self._portdb.cp_list(cp, mytree=repo_path):
407 + if cpv not in cp_list:
408 + cp_list.append(_pkg_str(cpv))
409 +
410 + self._desc_cache = desc_cache
411 + self._cp_map = cp_map
412 +
413 + def cp_all(self):
414 + if self._cp_map is None:
415 + self._init_index()
416 + return list(self._cp_map)
417 +
418 + def match(self, atom):
419 + if not isinstance(atom, Atom):
420 + atom = Atom(atom)
421 + cp_list = self._cp_map.get(atom.cp)
422 + if cp_list is None:
423 + return []
424 + self._portdb._cpv_sort_ascending(cp_list)
425 + return portage.match_from_list(atom, cp_list)
426 +
427 + def aux_get(self, cpv, attrs, myrepo = None):
428 + if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
429 + try:
430 + return [self._desc_cache[cpv]]
431 + except KeyError:
432 + pass
433 + return self._portdb.aux_get(cpv, attrs)
434 +
435 +
436 +class IndexedVardb(object):
437 + """
438 + A vardbapi interface that sacrifices validation in order to
439 + improve performance. It takes advantage of vardbdbapi._aux_cache,
440 + which is backed by vardb_metadata.pickle. Since _aux_cache is
441 + not updated for every single merge/unmerge (see
442 + _aux_cache_threshold), the list of packages is obtained directly
443 + from the real vardbapi instance. If a package is missing from
444 + _aux_cache, then its metadata is obtained using the normal
445 + (validated) vardbapi.aux_get method.
446 + """
447 + def __init__(self, vardb):
448 + self._vardb = vardb
449 + self._aux_cache_keys = vardb._aux_cache_keys
450 + self._cpv_sort_ascending = vardb._cpv_sort_ascending
451 + self._cp_map = {}
452 + self.cpv_exists = vardb.cpv_exists
453 +
454 + def cp_all(self):
455 + if self._cp_map:
456 + return list(self._cp_map)
457 + cp_map = self._cp_map
458 + for cpv in self._vardb.cpv_all():
459 + cp = portage.cpv_getkey(cpv)
460 + if cp is not None:
461 + cp_list = cp_map.get(cp)
462 + if cp_list is None:
463 + cp_list = []
464 + cp_map[cp] = cp_list
465 + cp_list.append(_pkg_str(cpv))
466 + return list(cp_map)
467 +
468 + def match(self, atom):
469 + if not isinstance(atom, Atom):
470 + atom = Atom(atom)
471 + cp_list = self._cp_map.get(atom.cp)
472 + if cp_list is None:
473 + return []
474 + self._vardb._cpv_sort_ascending(cp_list)
475 + return portage.match_from_list(atom, cp_list)
476 +
477 + def aux_get(self, cpv, attrs, myrepo = None):
478 + pkg_data = self._vardb._aux_cache["packages"].get(cpv)
479 + if not isinstance(pkg_data, tuple) or \
480 + len(pkg_data) != 2 or \
481 + not isinstance(pkg_data[1], dict):
482 + pkg_data = None
483 + if pkg_data is None:
484 + # It may be missing from _aux_cache due to
485 + # _aux_cache_threshold.
486 + return self._vardb.aux_get(cpv, attrs)
487 + metadata = pkg_data[1]
488 + return [metadata.get(k, "") for k in attrs]
489 --
490 2.0.4

Replies