Gentoo Archives: gentoo-commits

From: Zac Medico <zmedico@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/portage:master commit in: pym/portage/dbapi/
Date: Sun, 15 Oct 2017 00:59:10
Message-Id: 1508028820.c5a2a0edc4f4b01b16a274268431fa21f7f678b2.zmedico@gentoo
1 commit: c5a2a0edc4f4b01b16a274268431fa21f7f678b2
2 Author: Daniel Robbins <drobbins <AT> funtoo <DOT> org>
3 AuthorDate: Sat Oct 14 23:38:05 2017 +0000
4 Commit: Zac Medico <zmedico <AT> gentoo <DOT> org>
5 CommitDate: Sun Oct 15 00:53:40 2017 +0000
6 URL: https://gitweb.gentoo.org/proj/portage.git/commit/?id=c5a2a0ed
7
8 portdbapi: factor out _better_cache class
9
10 Better_cache -- now even better :) This version only scans individual
11 categories on-demand. I have addressed concerns about PMS-compliance by
12 enhancing the documentation so that developers are aware of what
13 assumptions to make (and not make) when using better_cache.
14
15 Closes: https://github.com/gentoo/portage/pull/219
16
17 pym/portage/dbapi/porttree.py | 124 ++++++++++++++++++++++++------------------
18 1 file changed, 71 insertions(+), 53 deletions(-)
19
20 diff --git a/pym/portage/dbapi/porttree.py b/pym/portage/dbapi/porttree.py
21 index 53edcd18f..f5979d2d0 100644
22 --- a/pym/portage/dbapi/porttree.py
23 +++ b/pym/portage/dbapi/porttree.py
24 @@ -16,7 +16,7 @@ portage.proxy.lazyimport.lazyimport(globals(),
25 'portage.package.ebuild.doebuild:doebuild',
26 'portage.util:ensure_dirs,shlex_split,writemsg,writemsg_level',
27 'portage.util.listdir:listdir',
28 - 'portage.versions:best,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str',
29 + 'portage.versions:best,catsplit,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str',
30 )
31
32 from portage.cache import volatile
33 @@ -103,6 +103,68 @@ class _dummy_list(list):
34 except ValueError:
35 pass
36
37 +
38 +class _better_cache(object):
39 +
40 + """
41 + The purpose of better_cache is to locate catpkgs in repositories using ``os.listdir()`` as much as possible, which
42 + is less expensive IO-wise than exhaustively doing a stat on each repo for a particular catpkg. better_cache stores a
43 + list of repos in which particular catpkgs appear. Various dbapi methods use better_cache to locate repositories of
44 + interest related to a particular catpkg rather than performing an exhaustive scan of all repos/overlays.
45 +
46 + Better_cache.items data may look like this::
47 +
48 + { "sys-apps/portage" : [ repo1, repo2 ] }
49 +
50 + Without better_cache, Portage will get slower and slower (due to excessive IO) as more overlays are added.
51 +
52 + Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it to search for specific
53 + catpkgs listed in ebuilds. The likelihood of a false positive catpkg in our cache causing a problem is extremely
54 + low, because the user of our cache is passing us a catpkg that came from somewhere and has already undergone some
55 + validation, and even then will further interrogate the short-list of repos we return to gather more information
56 + on the catpkg.
57 +
58 + Thus, the code below is optimized for speed rather than painstaking correctness. I have added a note to
59 + ``dbapi.getRepositories()`` to ensure that developers are aware of this just in case.
60 +
61 + The better_cache has been redesigned to perform on-demand scans -- it will only scan a category at a time, as
62 + needed. This should further optimize IO performance by not scanning category directories that are not needed by
63 + Portage.
64 + """
65 +
66 + def __init__(self, repositories):
67 + self._items = collections.defaultdict(list)
68 + self._scanned_cats = set()
69 +
70 + # ordered list of all porttree locations we'll scan:
71 + self._repo_list = [repo for repo in reversed(list(repositories))
72 + if repo.location is not None]
73 +
74 + def __getitem__(self, catpkg):
75 + result = self._items.get(catpkg)
76 + if result is not None:
77 + return result
78 +
79 + cat, pkg = catsplit(catpkg)
80 + if cat not in self._scanned_cats:
81 + self._scan_cat(cat)
82 + return self._items[catpkg]
83 +
84 + def _scan_cat(self, cat):
85 + for repo in self._repo_list:
86 + cat_dir = repo.location + "/" + cat
87 + try:
88 + pkg_list = os.listdir(cat_dir)
89 + except OSError as e:
90 + if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE):
91 + raise
92 + continue
93 + for p in pkg_list:
94 + if os.path.isdir(cat_dir + "/" + p):
95 + self._items[cat + "/" + p].append(repo)
96 + self._scanned_cats.add(cat)
97 +
98 +
99 class portdbapi(dbapi):
100 """this tree will scan a portage directory located at root (passed to init)"""
101 portdbapi_instances = _dummy_list()
102 @@ -346,11 +408,14 @@ class portdbapi(dbapi):
103 return None
104
105 def getRepositories(self, catpkg=None):
106 +
107 """
108 With catpkg=None, this will return a complete list of repositories in this dbapi. With catpkg set to a value,
109 this method will return a short-list of repositories that contain this catpkg. Use this second approach if
110 possible, to avoid exhaustively searching all repos for a particular catpkg. It's faster for this method to
111 - find the catpkg than for you do it yourself.
112 + find the catpkg than for you to do it yourself. When specifying catpkg, you should have reasonable assurance that
113 + the category is valid and PMS-compliant as the caching mechanism we use does not perform validation checks for
114 + categories.
115
116 This function is required for GLEP 42 compliance.
117
118 @@ -358,7 +423,8 @@ class portdbapi(dbapi):
119 catpkg; if None, return a list of all repositories in this dbapi.
120 @return: a list of repositories.
121 """
122 - if catpkg is not None and self._better_cache is not None and catpkg in self._better_cache:
123 +
124 + if catpkg is not None and self._better_cache is not None:
125 return [repo.name for repo in self._better_cache[catpkg]]
126 return self._ordered_repo_name_list
127
128 @@ -796,12 +862,7 @@ class portdbapi(dbapi):
129 elif self._better_cache is None:
130 mytrees = self.porttrees
131 else:
132 - try:
133 - repos = self._better_cache[mycp]
134 - except KeyError:
135 - mytrees = []
136 - else:
137 - mytrees = [repo.location for repo in repos]
138 + mytrees = [repo.location for repo in self._better_cache[mycp]]
139 for oroot in mytrees:
140 try:
141 file_list = os.listdir(os.path.join(oroot, mycp))
142 @@ -850,50 +911,7 @@ class portdbapi(dbapi):
143 "minimum-all-ignore-profile", "minimum-visible"):
144 self.xcache[x]={}
145 self.frozen=1
146 - self._better_cache = better_cache = collections.defaultdict(list)
147 -
148 - # The purpose of self._better_cache is to perform an initial quick scan of all repositories
149 - # using os.listdir(), which is less expensive IO-wise than exhaustively doing a stat on each
150 - # repo. self._better_cache stores a list of repos in which particular catpkgs appear.
151 - #
152 - # For example, better_cache data may look like this:
153 - #
154 - # { "sys-apps/portage" : [ repo1, repo2 ] }
155 - #
156 - # Without this tweak, Portage will get slower and slower as more overlays are added.
157 - #
158 - # Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it
159 - # to search for specific catpkgs listed in ebuilds. The likelihood of a false positive catpkg
160 - # in our cache causing a problem is extremely low. Thus, the code below is optimized for
161 - # speed rather than painstaking correctness.
162 -
163 - valid_categories = self.settings.categories
164 - for repo_loc in reversed(self.porttrees):
165 - repo = self.repositories.get_repo_for_location(repo_loc)
166 - try:
167 - categories = os.listdir(repo_loc)
168 - except OSError as e:
169 - if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE):
170 - raise
171 - continue
172 -
173 - for cat in categories:
174 - if cat not in valid_categories:
175 - continue
176 - cat_dir = repo_loc + "/" + cat
177 - try:
178 - pkg_list = os.listdir(cat_dir)
179 - except OSError as e:
180 - if e.errno != errno.ENOTDIR:
181 - raise
182 - continue
183 -
184 - for p in pkg_list:
185 - catpkg_dir = cat_dir + "/" + p
186 - if not os.path.isdir(catpkg_dir):
187 - continue
188 - catpkg = cat + "/" + p
189 - better_cache[catpkg].append(repo)
190 + self._better_cache = _better_cache(self.repositories)
191
192 def melt(self):
193 self.xcache = {}