Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH 2/5 v3] Add IndexStreamIterator and MultiIterGroupBy.
Date: Sun, 02 Nov 2014 22:50:18
Message-Id: 1414968611-20093-1-git-send-email-zmedico@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH 2/5] Add IndexStreamIterator and MultiIterGroupBy. by Zac Medico
1 This IndexStreamIterator class can be used together with the
2 pkg_desc_index_line_read function to read an index file incrementally
3 as a stream.
4
5 The MultiIterGroupBy class can be used to iterate over multiple
6 IndexStreamIterator instances at once, incrementally grouping results
7 for a particular package from multiple indices, while limiting the
8 amount of any given index that must be in memory at once.
9
10 Both of these classes are used by the IndexedPortdb class in the next
11 patch of this series.
12
13 X-Gentoo-Bug: 525718
14 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=525718
15 ---
16 This updated patch cleans up the logic (possibly fixing bugs), and
17 optimizes it to avoid over-buffering (memory waste). Also, I added
18 a TODO note to use a binary search tree to optimize the search
19 for completed groups.
20
21 pym/portage/cache/index/IndexStreamIterator.py | 27 ++++++++
22 pym/portage/util/iterators/MultiIterGroupBy.py | 94 ++++++++++++++++++++++++++
23 pym/portage/util/iterators/__init__.py | 2 +
24 3 files changed, 123 insertions(+)
25 create mode 100644 pym/portage/cache/index/IndexStreamIterator.py
26 create mode 100644 pym/portage/util/iterators/MultiIterGroupBy.py
27 create mode 100644 pym/portage/util/iterators/__init__.py
28
29 diff --git a/pym/portage/cache/index/IndexStreamIterator.py b/pym/portage/cache/index/IndexStreamIterator.py
30 new file mode 100644
31 index 0000000..972aee1
32 --- /dev/null
33 +++ b/pym/portage/cache/index/IndexStreamIterator.py
34 @@ -0,0 +1,27 @@
35 +# Copyright 2014 Gentoo Foundation
36 +# Distributed under the terms of the GNU General Public License v2
37 +
38 +class IndexStreamIterator(object):
39 +
40 + def __init__(self, f, parser):
41 +
42 + self.parser = parser
43 + self._file = f
44 +
45 + def close(self):
46 +
47 + if self._file is not None:
48 + self._file.close()
49 + self._file = None
50 +
51 + def __iter__(self):
52 +
53 + try:
54 +
55 + for line in self._file:
56 + node = self.parser(line)
57 + if node is not None:
58 + yield node
59 +
60 + finally:
61 + self.close()
62 diff --git a/pym/portage/util/iterators/MultiIterGroupBy.py b/pym/portage/util/iterators/MultiIterGroupBy.py
63 new file mode 100644
64 index 0000000..2d8652e
65 --- /dev/null
66 +++ b/pym/portage/util/iterators/MultiIterGroupBy.py
67 @@ -0,0 +1,94 @@
68 +# Copyright 2014 Gentoo Foundation
69 +# Distributed under the terms of the GNU General Public License v2
70 +
71 +class MultiIterGroupBy(object):
72 + """
73 + This class functions similarly to the itertools.groupby function,
74 + except that it takes multiple source iterators as input. The source
75 + iterators must yield objects in sorted order. A group is yielded as
76 + soon as the progress of all iterators reaches a state which
77 + guarantees that there can not be any remaining (unseen) elements of
78 + the group. This is useful for incremental display of grouped search
79 + results.
80 + """
81 +
82 + def __init__(self, iterators, key = None):
83 + self._iterators = iterators
84 + self._key = key
85 +
86 + def __iter__(self):
87 +
88 + trackers = []
89 + for iterator in self._iterators:
90 + trackers.append(_IteratorTracker(iterator))
91 +
92 + key_map = {}
93 + eof = []
94 + key_getter = self._key
95 + if key_getter is None:
96 + key_getter = lambda x: x
97 + min_progress = None
98 +
99 + while trackers:
100 +
101 + for tracker in trackers:
102 +
103 + if tracker.current is not None and \
104 + tracker.current != min_progress:
105 + # The trackers are sorted by progress, so the
106 + # remaining trackers are guaranteed to have
107 + # sufficient progress.
108 + continue
109 +
110 + # In order to avoid over-buffering (waste of memory),
111 + # only grab a single entry.
112 + try:
113 + entry = next(tracker.iterator)
114 + except StopIteration:
115 + eof.append(tracker)
116 + else:
117 + tracker.current = key_getter(entry)
118 + key_group = key_map.get(tracker.current)
119 + if key_group is None:
120 + key_group = []
121 + key_map[tracker.current] = key_group
122 + key_group.append(entry)
123 +
124 + if eof:
125 + for tracker in eof:
126 + trackers.remove(tracker)
127 + del eof[:]
128 +
129 + if trackers:
130 + trackers.sort()
131 + min_progress = trackers[0].current
132 + yield_these = []
133 + # TODO: Use a binary search tree to optimize this loop.
134 + for k in key_map:
135 + if k <= min_progress:
136 + yield_these.append(k)
137 + else:
138 + yield_these = list(key_map)
139 +
140 + if yield_these:
141 + yield_these.sort()
142 + for k in yield_these:
143 + yield key_map.pop(k)
144 +
145 +class _IteratorTracker(object):
146 +
147 + __slots__ = ('current', 'iterator')
148 +
149 + def __init__(self, iterator):
150 +
151 + self.iterator = iterator
152 + self.current = None
153 +
154 + def __lt__(self, other):
155 + if self.current is None:
156 + if other.current is None:
157 + return False
158 + else:
159 + return True
160 + return other.current is not None and \
161 + self.current < other.current
162 diff --git a/pym/portage/util/iterators/__init__.py b/pym/portage/util/iterators/__init__.py
163 new file mode 100644
164 index 0000000..7cd880e
165 --- /dev/null
166 +++ b/pym/portage/util/iterators/__init__.py
167 @@ -0,0 +1,2 @@
168 +# Copyright 2014 Gentoo Foundation
169 +# Distributed under the terms of the GNU General Public License v2
170 --
171 2.0.4