Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Subject: [gentoo-portage-dev] [PATCH] emerge --search: use description index
Date: Sat, 18 Oct 2014 03:29:02
Message-Id: 5441DE73.3090601@gentoo.org
1 This adds an egencache --update-pkg-desc-index action which generates
2 a plain-text index of package names, versions, and descriptions. The
3 index can then be used to optimize emerge --search / --searchdesc
4 actions. If the package description index is missing from a particular
5 repository, then all metadata for that repository is obtained using the
6 normal portdbapi.aux_get method.
7
8 Searching of installed packages is optimized to take advantage of
9 vardbapi._aux_cache, which is backed by vardb_metadata.pickle.
10 See the IndexedVardb docstring for some more details.
11
12 X-Gentoo-Bug: 525718
13 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
14 ---
15 bin/egencache | 43 ++++++++++-
16 man/egencache.1 | 4 ++
17 man/portage.5 | 6 ++
18 pym/_emerge/search.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++----
19 4 files changed, 232 insertions(+), 17 deletions(-)
20
21 diff --git a/bin/egencache b/bin/egencache
22 index e366058..90d5e68 100755
23 --- a/bin/egencache
24 +++ b/bin/egencache
25 @@ -57,7 +57,7 @@ from portage.util._async.run_main_scheduler import run_main_scheduler
26 from portage.util._eventloop.global_event_loop import global_event_loop
27 from portage import cpv_getkey
28 from portage.dep import Atom, isjustname
29 -from portage.versions import pkgsplit, vercmp
30 +from portage.versions import pkgsplit, vercmp, _pkg_str
31
32 try:
33 from xml.etree import ElementTree
34 @@ -91,6 +91,9 @@ def parse_args(args):
35 actions.add_argument("--update-changelogs",
36 action="store_true",
37 help="update the ChangeLog files from SCM logs")
38 + actions.add_argument("--update-pkg-desc-index",
39 + action="store_true",
40 + help="update package description index")
41 actions.add_argument("--update-manifests",
42 action="store_true",
43 help="update manifests")
44 @@ -451,6 +454,35 @@ class GenCache(object):
45 if hasattr(trg_cache, '_prune_empty_dirs'):
46 trg_cache._prune_empty_dirs()
47
48 +class GenPkgDescIndex(object):
49 + def __init__(self, portdb, output_file):
50 + self.returncode = os.EX_OK
51 + self._portdb = portdb
52 + self._output_file = output_file
53 +
54 + def run(self):
55 +
56 + portage.util.ensure_dirs(os.path.dirname(self._output_file))
57 + f = portage.util.atomic_ofstream(self._output_file,
58 + encoding=_encodings["repo.content"])
59 +
60 + portdb = self._portdb
61 + for cp in portdb.cp_all():
62 + pkgs = portdb.cp_list(cp)
63 + if not pkgs:
64 + continue
65 + desc, = portdb.aux_get(pkgs[-1], ["DESCRIPTION"])
66 +
67 + if len(pkgs) == 1:
68 + output = "%s: %s\n" % (pkgs[0], desc)
69 + else:
70 + output = "%s,%s: %s\n" % (pkgs[0],
71 + ",".join(_pkg_str(cpv).version
72 + for cpv in pkgs[1:]), desc)
73 + f.write(output)
74 +
75 + f.close()
76 +
77 class GenUseLocalDesc(object):
78 def __init__(self, portdb, output=None,
79 preserve_comments=False):
80 @@ -893,7 +925,8 @@ def egencache_main(args):
81 local_config=False, env=env)
82
83 if not (options.update or options.update_use_local_desc or
84 - options.update_changelogs or options.update_manifests):
85 + options.update_changelogs or options.update_manifests or
86 + options.update_pkg_desc_index):
87 parser.error('No action specified')
88 return 1
89
90 @@ -1057,6 +1090,12 @@ def egencache_main(args):
91 else:
92 ret.append(scheduler.returncode)
93
94 + if options.update_pkg_desc_index:
95 + gen_index = GenPkgDescIndex(portdb, os.path.join(
96 + repo_config.location, "metadata", "pkg_desc_index"))
97 + gen_index.run()
98 + ret.append(gen_index.returncode)
99 +
100 if options.update_use_local_desc:
101 gen_desc = GenUseLocalDesc(portdb,
102 output=options.uld_output,
103 diff --git a/man/egencache.1 b/man/egencache.1
104 index f71feb3..3a3197f 100644
105 --- a/man/egencache.1
106 +++ b/man/egencache.1
107 @@ -19,6 +19,10 @@ for the details on package atom syntax.
108 .BR "\-\-update\-changelogs"
109 Update the ChangeLog files from SCM logs (supported only in git repos).
110 .TP
111 +.BR "\-\-update\-pkg\-desc\-index"
112 +Update the package description index which is located at
113 +\fImetadata/pkg_desc_index\fR in the repository.
114 +.TP
115 .BR "\-\-update\-use\-local\-desc"
116 Update the \fIprofiles/use.local.desc\fR file from metadata.xml.
117 .TP
118 diff --git a/man/portage.5 b/man/portage.5
119 index e399f0f..26856d1 100644
120 --- a/man/portage.5
121 +++ b/man/portage.5
122 @@ -75,6 +75,7 @@ user\-defined package sets
123 .BR /usr/portage/metadata/
124 .nf
125 layout.conf
126 +pkg_desc_index
127 .fi
128 .TP
129 .BR /usr/portage/profiles/
130 @@ -1110,6 +1111,11 @@ cache\-formats = md5-dict pms
131 profile\-formats = portage-2
132 .fi
133 .RE
134 +.TP
135 +.BR pkg_desc_index
136 +This is an index of packages and descriptions which may be generated
137 +by \fBegencache\fR(1) in order to optimize \fBemerge\fR(1) search
138 +actions.
139 .RE
140 .TP
141 .BR /usr/portage/profiles/
142 diff --git a/pym/_emerge/search.py b/pym/_emerge/search.py
143 index 4b0fd9f..bf15f11 100644
144 --- a/pym/_emerge/search.py
145 +++ b/pym/_emerge/search.py
146 @@ -3,13 +3,17 @@
147
148 from __future__ import print_function
149
150 +import io
151 import re
152 import portage
153 -from portage import os
154 +from portage import os, _encodings
155 from portage.dbapi.porttree import _parse_uri_map
156 +from portage.dep import Atom
157 +from portage.exception import InvalidData
158 from portage.localization import localized_size
159 from portage.output import bold, bold as white, darkgreen, green, red
160 from portage.util import writemsg_stdout
161 +from portage.versions import _pkg_str
162
163 from _emerge.Package import Package
164
165 @@ -30,7 +34,6 @@ class search(object):
166 The list of available and installed packages is created at object instantiation.
167 This makes successive searches faster."""
168 self.settings = root_config.settings
169 - self.vartree = root_config.trees["vartree"]
170 self.spinner = spinner
171 self.verbose = verbose
172 self.searchdesc = searchdesc
173 @@ -41,9 +44,9 @@ class search(object):
174
175 self._dbs = []
176
177 - portdb = root_config.trees["porttree"].dbapi
178 + portdb = IndexedPortdb(root_config.trees["porttree"].dbapi)
179 bindb = root_config.trees["bintree"].dbapi
180 - vardb = root_config.trees["vartree"].dbapi
181 + vardb = IndexedVardb(root_config.trees["vartree"].dbapi)
182
183 if not usepkgonly and portdb._have_root_eclass_dir:
184 self._dbs.append(portdb)
185 @@ -53,6 +56,7 @@ class search(object):
186
187 self._dbs.append(vardb)
188 self._portdb = portdb
189 + self._vardb = vardb
190
191 def _spinner_update(self):
192 if self.spinner:
193 @@ -97,7 +101,7 @@ class search(object):
194 return {}
195
196 def _visible(self, db, cpv, metadata):
197 - installed = db is self.vartree.dbapi
198 + installed = db is self._vardb
199 built = installed or db is not self._portdb
200 pkg_type = "ebuild"
201 if installed:
202 @@ -208,6 +212,20 @@ class search(object):
203 masked=1
204 self.matches["pkg"].append([package,masked])
205 elif self.searchdesc: # DESCRIPTION searching
206 + # Check for DESCRIPTION match first, so that we can skip
207 + # the expensive visibility check if it doesn't match.
208 + full_package = portage.best(
209 + self._xmatch("match-all", package))
210 + try:
211 + full_desc = self._aux_get(
212 + full_package, ["DESCRIPTION"])[0]
213 + except KeyError:
214 + portage.writemsg(
215 + "emerge: search: aux_get() failed, skipping\n",
216 + noiselevel=-1)
217 + continue
218 + if not self.searchre.search(full_desc):
219 + continue
220 full_package = self._xmatch("bestmatch-visible", package)
221 if not full_package:
222 #no match found; we don't want to query description
223 @@ -217,14 +235,8 @@ class search(object):
224 continue
225 else:
226 masked=1
227 - try:
228 - full_desc = self._aux_get(
229 - full_package, ["DESCRIPTION"])[0]
230 - except KeyError:
231 - print("emerge: search: aux_get() failed, skipping")
232 - continue
233 - if self.searchre.search(full_desc):
234 - self.matches["desc"].append([full_package,masked])
235 +
236 + self.matches["desc"].append((full_package, masked))
237
238 self.sdict = self.setconfig.getSets()
239 for setname in self.sdict:
240 @@ -262,7 +274,7 @@ class search(object):
241 bold(self.searchkey) + " ]\n")
242 msg.append("[ Applications found : " + \
243 bold(str(self.mlen)) + " ]\n\n")
244 - vardb = self.vartree.dbapi
245 + vardb = self._vardb
246 metadata_keys = set(Package.metadata_keys)
247 metadata_keys.update(["DESCRIPTION", "HOMEPAGE", "LICENSE", "SRC_URI"])
248 metadata_keys = tuple(metadata_keys)
249 @@ -372,7 +384,11 @@ class search(object):
250 # private interface
251 #
252 def getInstallationStatus(self,package):
253 - installed_package = self.vartree.dep_bestmatch(package)
254 + installed_package = self._vardb.match(package)
255 + if installed_package:
256 + installed_package = installed_package[-1]
257 + else:
258 + installed_package = ""
259 result = ""
260 version = self.getVersion(installed_package,search.VERSION_RELEASE)
261 if len(version) > 0:
262 @@ -392,3 +408,153 @@ class search(object):
263 result = ""
264 return result
265
266 +
267 +class IndexedPortdb(object):
268 + """
269 + A portdbapi interface that uses a package description index to
270 + improve performance. If the description index is missing for a
271 + particular repository, then all metadata for that repository is
272 + obtained using the normal portdbapi.aux_get method.
273 + """
274 + def __init__(self, portdb):
275 + self._portdb = portdb
276 + self.cpv_exists = portdb.cpv_exists
277 + self.getFetchMap = portdb.getFetchMap
278 + self.findname = portdb.findname
279 + self._aux_cache_keys = portdb._aux_cache_keys
280 + self._have_root_eclass_dir = portdb._have_root_eclass_dir
281 + self._cpv_sort_ascending = portdb._cpv_sort_ascending
282 + self._desc_cache = None
283 + self._cp_map = None
284 +
285 + def _init_index(self):
286 + cp_map = {}
287 + desc_cache = {}
288 + for repo_path in self._portdb.porttrees:
289 + outside_repo = os.path.join(self._portdb.depcachedir,
290 + repo_path.lstrip(os.sep))
291 + for parent_dir in (repo_path, outside_repo):
292 + file_path = os.path.join(parent_dir,
293 + "metadata", "pkg_desc_index")
294 +
295 + try:
296 + with io.open(file_path,
297 + encoding=_encodings["repo.content"]) as f:
298 + for line in f:
299 + pkgs, desc = line.split(":", 1)
300 + desc = desc.strip()
301 + pkgs = pkgs.split(",")
302 + if not pkgs[0]:
303 + continue
304 + try:
305 + pkg = _pkg_str(pkgs[0])
306 + except InvalidData:
307 + continue
308 + cp_list = cp_map.get(pkg.cp)
309 + if cp_list is None:
310 + cp_list = []
311 + cp_map[pkg.cp] = cp_list
312 + cp_list.append(pkg)
313 + for ver in pkgs[1:]:
314 + try:
315 + cp_list.append(
316 + _pkg_str(pkg.cp + "-" + ver))
317 + except InvalidData:
318 + pass
319 + for cpv in cp_list:
320 + desc_cache[cpv] = desc
321 + except IOError:
322 + pass
323 + else:
324 + break
325 + else:
326 + # No descriptions index was found, so populate
327 + # cp_map the slow way.
328 + for cp in self._portdb.cp_all(trees=[repo_path]):
329 + cp_list = cp_map.get(cp)
330 + if cp_list is None:
331 + cp_list = []
332 + cp_map[cp] = cp_list
333 + for cpv in self._portdb.cp_list(cp, mytree=repo_path):
334 + if cpv not in cp_list:
335 + cp_list.append(_pkg_str(cpv))
336 +
337 + self._desc_cache = desc_cache
338 + self._cp_map = cp_map
339 +
340 + def cp_all(self):
341 + if self._cp_map is None:
342 + self._init_index()
343 + return list(self._cp_map)
344 +
345 + def match(self, atom):
346 + if not isinstance(atom, Atom):
347 + atom = Atom(atom)
348 + cp_list = self._cp_map.get(atom.cp)
349 + if cp_list is None:
350 + return []
351 + self._portdb._cpv_sort_ascending(cp_list)
352 + return portage.match_from_list(atom, cp_list)
353 +
354 + def aux_get(self, cpv, attrs, myrepo = None):
355 + if len(attrs) == 1 and attrs[0] == "DESCRIPTION":
356 + try:
357 + return [self._desc_cache[cpv]]
358 + except KeyError:
359 + pass
360 + return self._portdb.aux_get(cpv, attrs)
361 +
362 +
363 +class IndexedVardb(object):
364 + """
365 + A vardbapi interface that sacrifices validation in order to
366 + improve performance. It takes advantage of vardbapi._aux_cache,
367 + which is backed by vardb_metadata.pickle. Since _aux_cache is
368 + not updated for every single merge/unmerge (see
369 + _aux_cache_threshold), the list of packages is obtained directly
370 + from the real vardbapi instance. If a package is missing from
371 + _aux_cache, then its metadata is obtained using the normal
372 + (validated) vardbapi.aux_get method.
373 + """
374 + def __init__(self, vardb):
375 + self._vardb = vardb
376 + self._aux_cache_keys = vardb._aux_cache_keys
377 + self._cpv_sort_ascending = vardb._cpv_sort_ascending
378 + self._cp_map = {}
379 + self.cpv_exists = vardb.cpv_exists
380 +
381 + def cp_all(self):
382 + if self._cp_map:
383 + return list(self._cp_map)
384 + cp_map = self._cp_map
385 + for cpv in self._vardb.cpv_all():
386 + cp = portage.cpv_getkey(cpv)
387 + if cp is not None:
388 + cp_list = cp_map.get(cp)
389 + if cp_list is None:
390 + cp_list = []
391 + cp_map[cp] = cp_list
392 + cp_list.append(_pkg_str(cpv))
393 + return list(cp_map)
394 +
395 + def match(self, atom):
396 + if not isinstance(atom, Atom):
397 + atom = Atom(atom)
398 + cp_list = self._cp_map.get(atom.cp)
399 + if cp_list is None:
400 + return []
401 + self._vardb._cpv_sort_ascending(cp_list)
402 + return portage.match_from_list(atom, cp_list)
403 +
404 + def aux_get(self, cpv, attrs, myrepo = None):
405 + pkg_data = self._vardb._aux_cache["packages"].get(cpv)
406 + if not isinstance(pkg_data, tuple) or \
407 + len(pkg_data) != 2 or \
408 + not isinstance(pkg_data[1], dict):
409 + pkg_data = None
410 + if pkg_data is None:
411 + # It may be missing from _aux_cache due to
412 + # _aux_cache_threshold.
413 + return self._vardb.aux_get(cpv, attrs)
414 + metadata = pkg_data[1]
415 + return [metadata.get(k, "") for k in attrs]
416 --
417 2.0.4

Replies

Subject Author
[gentoo-portage-dev] Re: [PATCH] emerge --search: use description index Zac Medico <zmedico@g.o>
[gentoo-portage-dev] Zac Medico <zmedico@g.o>