Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH 2/5 v4] Add IndexStreamIterator and MultiIterGroupBy.
Date: Mon, 03 Nov 2014 03:07:55
Message-Id: 1414984059-23038-1-git-send-email-zmedico@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy. by Zac Medico
1 This IndexStreamIterator class can be used together with the
2 pkg_desc_index_line_read function to read an index file incrementally
3 as a stream.
4
5 The MultiIterGroupBy class can be used to iterate over multiple
6 IndexStreamIterator instances at once, incrementally grouping results
7 for a particular package from multiple indices, while limiting the
8 amount of any given index that must be in memory at once.
9
10 Both of these classes are used by the IndexedPortdb class in the next
11 patch of this series.
12
13 X-Gentoo-Bug: 525718
14 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
15 ---
16 This updated patch uses python's bisect module to optimize the search for
17 completed groups.
18
19 pym/portage/cache/index/IndexStreamIterator.py | 27 +++++++
20 pym/portage/util/iterators/MultiIterGroupBy.py | 97 ++++++++++++++++++++++++++
21 pym/portage/util/iterators/__init__.py | 2 +
22 3 files changed, 126 insertions(+)
23 create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
24 create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
25 create mode 100644 pym/portage/util/iterators/__init__.py
26
27 diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
28 new file mode 100644
29 index 0000000..972aee1
30 --- /dev/null
31 +++ b/pym/portage/cache/index/IndexStreamIterator.py
32 @@ -0,0 +1,27 @@
33 +# Copyright 2014 Gentoo Foundation
34 +# Distributed under the terms of the GNU General Public License v2
35 +
36 +class IndexStreamIterator(object):
37 +
38 + def __init__(self, f, parser):
39 +
40 + self.parser = parser
41 + self._file = f
42 +
43 + def close(self):
44 +
45 + if self._file is not None:
46 + self._file.close()
47 + self._file = None
48 +
49 + def __iter__(self):
50 +
51 + try:
52 +
53 + for line in self._file:
54 + node = self.parser(line)
55 + if node is not None:
56 + yield node
57 +
58 + finally:
59 + self.close()
60 diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
61 new file mode 100644
62 index 0000000..f5f8278
63 --- /dev/null
64 +++ b/pym/portage/util/iterators/MultiIterGroupBy.py
65 @@ -0,0 +1,97 @@
66 +# Copyright 2014 Gentoo Foundation
67 +# Distributed under the terms of the GNU General Public License v2
68 +
69 +import bisect
70 +
71 +class MultiIterGroupBy(object):
72 + """
73 + This class functions similarly to the itertools.groupby function,
74 + except that it takes multiple source iterators as input. The source
75 + iterators must yield objects in sorted order. A group is yielded as
76 + soon as the progress of all iterators reaches a state which
77 + guarantees that there can not be any remaining (unseen) elements of
78 + the group. This is useful for incremental display of grouped search
79 + results.
80 + """
81 +
82 + def __init__(self, iterators, key = None):
83 + self._iterators = iterators
84 + self._key = key
85 +
86 + def __iter__(self):
87 +
88 + trackers = []
89 + for iterator in self._iterators:
90 + trackers.append(_IteratorTracker(iterator))
91 +
92 + key_map = {}
93 + key_list = []
94 + eof = []
95 + key_getter = self._key
96 + if key_getter is None:
97 + key_getter = lambda x: x
98 + min_progress = None
99 +
100 + while trackers:
101 +
102 + for tracker in trackers:
103 +
104 + if tracker.current is not None and \
105 + tracker.current != min_progress:
106 + # The trackers are sorted by progress, so the
107 + # remaining trackers are guaranteed to have
108 + # sufficient progress.
109 + break
110 +
111 + # In order to avoid over-buffering (waste of memory),
112 + # only grab a single entry.
113 + try:
114 + entry = next(tracker.iterator)
115 + except StopIteration:
116 + eof.append(tracker)
117 + else:
118 + tracker.current = key_getter(entry)
119 + key_group = key_map.get(tracker.current)
120 + if key_group is None:
121 + key_group = []
122 + key_map[tracker.current] = key_group
123 + bisect.insort(key_list, tracker.current)
124 + key_group.append(entry)
125 +
126 + if eof:
127 + for tracker in eof:
128 + trackers.remove(tracker)
129 + del eof[:]
130 +
131 + if trackers:
132 + trackers.sort()
133 + min_progress = trackers[0].current
134 + # yield if key <= min_progress
135 + i = bisect.bisect_right(key_list, min_progress)
136 + yield_these = key_list[:i]
137 + del key_list[:i]
138 + else:
139 + yield_these = key_list
140 + key_list = []
141 +
142 + if yield_these:
143 + for k in yield_these:
144 + yield key_map.pop(k)
145 +
146 +class _IteratorTracker(object):
147 +
148 + __slots__ = ('current', 'iterator')
149 +
150 + def __init__(self, iterator):
151 +
152 + self.iterator = iterator
153 + self.current = None
154 +
155 + def __lt__(self, other):
156 + if self.current is None:
157 + if other.current is None:
158 + return False
159 + else:
160 + return True
161 + return other.current is not None and \
162 + self.current < other.current
163 diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
164 new file mode 100644
165 index 0000000..7cd880e
166 --- /dev/null
167 +++ b/pym/portage/util/iterators/__init__.py
168 @@ -0,0 +1,2 @@
169 +# Copyright 2014 Gentoo Foundation
170 +# Distributed under the terms of the GNU General Public License v2
171 --
172 2.0.4