1 |
commit: c5a2a0edc4f4b01b16a274268431fa21f7f678b2 |
2 |
Author: Daniel Robbins <drobbins <AT> funtoo <DOT> org> |
3 |
AuthorDate: Sat Oct 14 23:38:05 2017 +0000 |
4 |
Commit: Zac Medico <zmedico <AT> gentoo <DOT> org> |
5 |
CommitDate: Sun Oct 15 00:53:40 2017 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/portage.git/commit/?id=c5a2a0ed |
7 |
|
8 |
portdbapi: factor out _better_cache class |
9 |
|
10 |
Better_cache -- now even better :) This version only scans individual |
11 |
categories on-demand. I have addressed concerns about PMS-compliance by |
12 |
enhancing the documentation so that developers are aware of what |
13 |
assumptions to make (and not make) when using better_cache. |
14 |
|
15 |
Closes: https://github.com/gentoo/portage/pull/219 |
16 |
|
17 |
pym/portage/dbapi/porttree.py | 124 ++++++++++++++++++++++++------------------ |
18 |
1 file changed, 71 insertions(+), 53 deletions(-) |
19 |
|
20 |
diff --git a/pym/portage/dbapi/porttree.py b/pym/portage/dbapi/porttree.py |
21 |
index 53edcd18f..f5979d2d0 100644 |
22 |
--- a/pym/portage/dbapi/porttree.py |
23 |
+++ b/pym/portage/dbapi/porttree.py |
24 |
@@ -16,7 +16,7 @@ portage.proxy.lazyimport.lazyimport(globals(), |
25 |
'portage.package.ebuild.doebuild:doebuild', |
26 |
'portage.util:ensure_dirs,shlex_split,writemsg,writemsg_level', |
27 |
'portage.util.listdir:listdir', |
28 |
- 'portage.versions:best,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str', |
29 |
+ 'portage.versions:best,catsplit,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str', |
30 |
) |
31 |
|
32 |
from portage.cache import volatile |
33 |
@@ -103,6 +103,68 @@ class _dummy_list(list): |
34 |
except ValueError: |
35 |
pass |
36 |
|
37 |
+ |
38 |
+class _better_cache(object): |
39 |
+ |
40 |
+ """ |
41 |
+ The purpose of better_cache is to locate catpkgs in repositories using ``os.listdir()`` as much as possible, which |
42 |
+ is less expensive IO-wise than exhaustively doing a stat on each repo for a particular catpkg. better_cache stores a |
43 |
+ list of repos in which particular catpkgs appear. Various dbapi methods use better_cache to locate repositories of |
44 |
+ interest related to particular catpkg rather than performing an exhaustive scan of all repos/overlays. |
45 |
+ |
46 |
+ Better_cache.items data may look like this:: |
47 |
+ |
48 |
+ { "sys-apps/portage" : [ repo1, repo2 ] } |
49 |
+ |
50 |
+ Without better_cache, Portage will get slower and slower (due to excessive IO) as more overlays are added. |
51 |
+ |
52 |
+ Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it to search for specific |
53 |
+ catpkgs listed in ebuilds. The likelihood of a false positive catpkg in our cache causing a problem is extremely |
54 |
+ low, because the user of our cache is passing us a catpkg that came from somewhere and has already undergone some |
55 |
+ validation, and even then will further interrogate the short-list of repos we return to gather more information |
56 |
+ on the catpkg. |
57 |
+ |
58 |
+ Thus, the code below is optimized for speed rather than painstaking correctness. I have added a note to |
59 |
+ ``dbapi.getRepositories()`` to ensure that developers are aware of this just in case. |
60 |
+ |
61 |
+ The better_cache has been redesigned to perform on-demand scans -- it will only scan a category at a time, as |
62 |
+ needed. This should further optimize IO performance by not scanning category directories that are not needed by |
63 |
+ Portage. |
64 |
+ """ |
65 |
+ |
66 |
+ def __init__(self, repositories): |
67 |
+ self._items = collections.defaultdict(list) |
68 |
+ self._scanned_cats = set() |
69 |
+ |
70 |
+ # ordered list of all porttree locations we'll scan: |
71 |
+ self._repo_list = [repo for repo in reversed(list(repositories)) |
72 |
+ if repo.location is not None] |
73 |
+ |
74 |
+ def __getitem__(self, catpkg): |
75 |
+ result = self._items.get(catpkg) |
76 |
+ if result is not None: |
77 |
+ return result |
78 |
+ |
79 |
+ cat, pkg = catsplit(catpkg) |
80 |
+ if cat not in self._scanned_cats: |
81 |
+ self._scan_cat(cat) |
82 |
+ return self._items[catpkg] |
83 |
+ |
84 |
+ def _scan_cat(self, cat): |
85 |
+ for repo in self._repo_list: |
86 |
+ cat_dir = repo.location + "/" + cat |
87 |
+ try: |
88 |
+ pkg_list = os.listdir(cat_dir) |
89 |
+ except OSError as e: |
90 |
+ if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE): |
91 |
+ raise |
92 |
+ continue |
93 |
+ for p in pkg_list: |
94 |
+ if os.path.isdir(cat_dir + "/" + p): |
95 |
+ self._items[cat + "/" + p].append(repo) |
96 |
+ self._scanned_cats.add(cat) |
97 |
+ |
98 |
+ |
99 |
class portdbapi(dbapi): |
100 |
"""this tree will scan a portage directory located at root (passed to init)""" |
101 |
portdbapi_instances = _dummy_list() |
102 |
@@ -346,11 +408,14 @@ class portdbapi(dbapi): |
103 |
return None |
104 |
|
105 |
def getRepositories(self, catpkg=None): |
106 |
+ |
107 |
""" |
108 |
With catpkg=None, this will return a complete list of repositories in this dbapi. With catpkg set to a value, |
109 |
this method will return a short-list of repositories that contain this catpkg. Use this second approach if |
110 |
possible, to avoid exhaustively searching all repos for a particular catpkg. It's faster for this method to |
111 |
- find the catpkg than for you do it yourself. |
112 |
+ find the catpkg than for you to do it yourself. When specifying catpkg, you should have reasonable assurance that |
113 |
+ the category is valid and PMS-compliant as the caching mechanism we use does not perform validation checks for |
114 |
+ categories. |
115 |
|
116 |
This function is required for GLEP 42 compliance. |
117 |
|
118 |
@@ -358,7 +423,8 @@ class portdbapi(dbapi): |
119 |
catpkg; if None, return a list of all Repositories that contain a particular catpkg. |
120 |
@return: a list of repositories. |
121 |
""" |
122 |
- if catpkg is not None and self._better_cache is not None and catpkg in self._better_cache: |
123 |
+ |
124 |
+ if catpkg is not None and self._better_cache is not None: |
125 |
return [repo.name for repo in self._better_cache[catpkg]] |
126 |
return self._ordered_repo_name_list |
127 |
|
128 |
@@ -796,12 +862,7 @@ class portdbapi(dbapi): |
129 |
elif self._better_cache is None: |
130 |
mytrees = self.porttrees |
131 |
else: |
132 |
- try: |
133 |
- repos = self._better_cache[mycp] |
134 |
- except KeyError: |
135 |
- mytrees = [] |
136 |
- else: |
137 |
- mytrees = [repo.location for repo in repos] |
138 |
+ mytrees = [repo.location for repo in self._better_cache[mycp]] |
139 |
for oroot in mytrees: |
140 |
try: |
141 |
file_list = os.listdir(os.path.join(oroot, mycp)) |
142 |
@@ -850,50 +911,7 @@ class portdbapi(dbapi): |
143 |
"minimum-all-ignore-profile", "minimum-visible"): |
144 |
self.xcache[x]={} |
145 |
self.frozen=1 |
146 |
- self._better_cache = better_cache = collections.defaultdict(list) |
147 |
- |
148 |
- # The purpose of self._better_cache is to perform an initial quick scan of all repositories |
149 |
- # using os.listdir(), which is less expensive IO-wise than exhaustively doing a stat on each |
150 |
- # repo. self._better_cache stores a list of repos in which particular catpkgs appear. |
151 |
- # |
152 |
- # For example, better_cache data may look like this: |
153 |
- # |
154 |
- # { "sys-apps/portage" : [ repo1, repo2 ] } |
155 |
- # |
156 |
- # Without this tweak, Portage will get slower and slower as more overlays are added. |
157 |
- # |
158 |
- # Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it |
159 |
- # to search for specific catpkgs listed in ebuilds. The likelihood of a false positive catpkg |
160 |
- # in our cache causing a problem is extremely low. Thus, the code below is optimized for |
161 |
- # speed rather than painstaking correctness. |
162 |
- |
163 |
- valid_categories = self.settings.categories |
164 |
- for repo_loc in reversed(self.porttrees): |
165 |
- repo = self.repositories.get_repo_for_location(repo_loc) |
166 |
- try: |
167 |
- categories = os.listdir(repo_loc) |
168 |
- except OSError as e: |
169 |
- if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE): |
170 |
- raise |
171 |
- continue |
172 |
- |
173 |
- for cat in categories: |
174 |
- if cat not in valid_categories: |
175 |
- continue |
176 |
- cat_dir = repo_loc + "/" + cat |
177 |
- try: |
178 |
- pkg_list = os.listdir(cat_dir) |
179 |
- except OSError as e: |
180 |
- if e.errno != errno.ENOTDIR: |
181 |
- raise |
182 |
- continue |
183 |
- |
184 |
- for p in pkg_list: |
185 |
- catpkg_dir = cat_dir + "/" + p |
186 |
- if not os.path.isdir(catpkg_dir): |
187 |
- continue |
188 |
- catpkg = cat + "/" + p |
189 |
- better_cache[catpkg].append(repo) |
190 |
+ self._better_cache = _better_cache(self.repositories) |
191 |
|
192 |
def melt(self): |
193 |
self.xcache = {} |