1 |
This IndexStreamIterator class can be used together with the |
2 |
pkg_desc_index_line_read function to read an index file incrementally |
3 |
as a stream. |
4 |
|
5 |
The MultiIterGroupBy class can be used to iterate over multiple |
6 |
IndexStreamIterator instances at once, incrementally grouping results |
7 |
for a particular package from multiple indices, while limiting the |
8 |
amount of any given index that must be in memory at once. |
9 |
|
10 |
Both of these classes are used by the IndexedPortdb class in the next |
11 |
patch of this series. |
12 |
|
13 |
X-Gentoo-Bug: 525718 |
14 |
X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718 |
15 |
--- |
16 |
This updated patch uses python's bisect module to optimize the search for |
17 |
completed groups. |
18 |
|
19 |
pym/portage/cache/index/IndexStreamIterator.py | 27 +++++++ |
20 |
pym/portage/util/iterators/MultiIterGroupBy.py | 97 ++++++++++++++++++++++++++ |
21 |
pym/portage/util/iterators/__init__.py | 2 + |
22 |
3 files changed, 126 insertions(+) |
23 |
create mode 100644 pym/portage/cache/index/IndexStreamIterator.py |
24 |
create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py |
25 |
create mode 100644 pym/portage/util/iterators/__init__.py |
26 |
|
27 |
diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py |
28 |
new file mode 100644 |
29 |
index 0000000..972aee1 |
30 |
--- /dev/null |
31 |
+++ b/pym/portage/cache/index/IndexStreamIterator.py |
32 |
@@ -0,0 +1,27 @@ |
33 |
+# Copyright 2014 Gentoo Foundation |
34 |
+# Distributed under the terms of the GNU General Public License v2 |
35 |
+ |
36 |
+class IndexStreamIterator(object): |
37 |
+ |
38 |
+ def __init__(self, f, parser): |
39 |
+ |
40 |
+ self.parser = parser |
41 |
+ self._file = f |
42 |
+ |
43 |
+ def close(self): |
44 |
+ |
45 |
+ if self._file is not None: |
46 |
+ self._file.close() |
47 |
+ self._file = None |
48 |
+ |
49 |
+ def __iter__(self): |
50 |
+ |
51 |
+ try: |
52 |
+ |
53 |
+ for line in self._file: |
54 |
+ node = self.parser(line) |
55 |
+ if node is not None: |
56 |
+ yield node |
57 |
+ |
58 |
+ finally: |
59 |
+ self.close() |
60 |
diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py |
61 |
new file mode 100644 |
62 |
index 0000000..f5f8278 |
63 |
--- /dev/null |
64 |
+++ b/pym/portage/util/iterators/MultiIterGroupBy.py |
65 |
@@ -0,0 +1,97 @@ |
66 |
+# Copyright 2014 Gentoo Foundation |
67 |
+# Distributed under the terms of the GNU General Public License v2 |
68 |
+ |
69 |
+import bisect |
70 |
+ |
71 |
+class MultiIterGroupBy(object): |
72 |
+ """ |
73 |
+ This class functions similarly to the itertools.groupby function, |
74 |
+ except that it takes multiple source iterators as input. The source |
75 |
+ iterators must yield objects in sorted order. A group is yielded as |
76 |
+ soon as the progress of all iterators reaches a state which |
77 |
+ guarantees that there can not be any remaining (unseen) elements of |
78 |
+ the group. This is useful for incremental display of grouped search |
79 |
+ results. |
80 |
+ """ |
81 |
+ |
82 |
+ def __init__(self, iterators, key = None): |
83 |
+ self._iterators = iterators |
84 |
+ self._key = key |
85 |
+ |
86 |
+ def __iter__(self): |
87 |
+ |
88 |
+ trackers = [] |
89 |
+ for iterator in self._iterators: |
90 |
+ trackers.append(_IteratorTracker(iterator)) |
91 |
+ |
92 |
+ key_map = {} |
93 |
+ key_list = [] |
94 |
+ eof = [] |
95 |
+ key_getter = self._key |
96 |
+ if key_getter is None: |
97 |
+ key_getter = lambda x: x |
98 |
+ min_progress = None |
99 |
+ |
100 |
+ while trackers: |
101 |
+ |
102 |
+ for tracker in trackers: |
103 |
+ |
104 |
+ if tracker.current is not None and \ |
105 |
+ tracker.current != min_progress: |
106 |
+ # The trackers are sorted by progress, so the |
107 |
+ # remaining trackers are guaranteed to have |
108 |
+ # sufficient progress. |
109 |
+ break |
110 |
+ |
111 |
+ # In order to avoid over-buffering (waste of memory), |
112 |
+ # only grab a single entry. |
113 |
+ try: |
114 |
+ entry = next(tracker.iterator) |
115 |
+ except StopIteration: |
116 |
+ eof.append(tracker) |
117 |
+ else: |
118 |
+ tracker.current = key_getter(entry) |
119 |
+ key_group = key_map.get(tracker.current) |
120 |
+ if key_group is None: |
121 |
+ key_group = [] |
122 |
+ key_map[tracker.current] = key_group |
123 |
+ bisect.insort(key_list, tracker.current) |
124 |
+ key_group.append(entry) |
125 |
+ |
126 |
+ if eof: |
127 |
+ for tracker in eof: |
128 |
+ trackers.remove(tracker) |
129 |
+ del eof[:] |
130 |
+ |
131 |
+ if trackers: |
132 |
+ trackers.sort() |
133 |
+ min_progress = trackers[0].current |
134 |
+ # yield if key <= min_progress |
135 |
+ i = bisect.bisect_right(key_list, min_progress) |
136 |
+ yield_these = key_list[:i] |
137 |
+ del key_list[:i] |
138 |
+ else: |
139 |
+ yield_these = key_list |
140 |
+ key_list = [] |
141 |
+ |
142 |
+ if yield_these: |
143 |
+ for k in yield_these: |
144 |
+ yield key_map.pop(k) |
145 |
+ |
146 |
+class _IteratorTracker(object): |
147 |
+ |
148 |
+ __slots__ = ('current', 'iterator') |
149 |
+ |
150 |
+ def __init__(self, iterator): |
151 |
+ |
152 |
+ self.iterator = iterator |
153 |
+ self.current = None |
154 |
+ |
155 |
+ def __lt__(self, other): |
156 |
+ if self.current is None: |
157 |
+ if other.current is None: |
158 |
+ return False |
159 |
+ else: |
160 |
+ return True |
161 |
+ return other.current is not None and \ |
162 |
+ self.current < other.current |
163 |
diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py |
164 |
new file mode 100644 |
165 |
index 0000000..7cd880e |
166 |
--- /dev/null |
167 |
+++ b/pym/portage/util/iterators/__init__.py |
168 |
@@ -0,0 +1,2 @@ |
169 |
+# Copyright 2014 Gentoo Foundation |
170 |
+# Distributed under the terms of the GNU General Public License v2 |
171 |
-- |
172 |
2.0.4 |