1 |
Hi, |
2 |
|
3 |
On 29-06-2020 21:13:43 -0500, Sid Spry wrote: |
4 |
> Hello, |
5 |
> |
6 |
> I have some runnable pseudocode outlining a faster tree verification algorithm. |
7 |
> Before I create patches I'd like to see if there is any guidance on making the |
8 |
> changes as unobtrusive as possible. If the radical change in algorithm is |
9 |
> acceptable I can work on adding the changes. |
10 |
> |
11 |
> Instead of composing any kind of structured data out of the portage tree my |
12 |
> algorithm just lists all files and then optionally batches them out to threads. |
13 |
> There is a noticeable speedup by eliding the tree traversal operations which |
14 |
> can be seen when running the algorithm with a single thread and comparing it to |
15 |
> the current algorithm in gemato (which should still be discussed here?). |
16 |
|
17 |
I remember something that gemato used to use multiple threads, but |
18 |
because it totally saturated disk-IO, it was brought back to a single |
19 |
thread. People were complaining about unusable systems. |
20 |
|
21 |
In any case, can you share your performance results? What speedup did |
22 |
you see, on warm and hot FS caches? Which type of disk do you use? |
23 |
|
24 |
You could compare against qmanifest, which uses OpenMP-based |
25 |
paralllelism while verifying the tree. On SSDs this does help. |
26 |
|
27 |
Thanks, |
28 |
Fabian |
29 |
|
30 |
> |
31 |
> Some simple tests like counting all objects traversed and verified returns the |
32 |
> same(ish). Once it is put into portage it could be tested in detail. |
33 |
> |
34 |
> There is also my partial attempt at removing the brittle interface to GnuPG |
35 |
> (it's not as if the current code is badly designed, just that parsing the |
36 |
> output of GnuPG directly is likely not the best idea). |
37 |
> |
38 |
> Needs gemato, dnspython, and requests. Slightly better than random code because |
39 |
> I took inspiration from the existing gemato classes. |
40 |
> |
41 |
> ```python (veriftree.py) |
42 |
> #!/usr/bin/env python3 |
43 |
> import os, sys, zlib, hashlib, tempfile, shutil, timeit |
44 |
> import subprocess |
45 |
> from typing import List |
46 |
> from pprint import pprint |
47 |
> |
48 |
> from gemato.manifest import ( |
49 |
> ManifestFile, |
50 |
> ManifestFileEntry, |
51 |
> ) |
52 |
> from wkd import ( |
53 |
> check_domain_signature, |
54 |
> hash_localpart, |
55 |
> build_web_key_uri, |
56 |
> stream_to_file |
57 |
> ) |
58 |
> from fetchmedia import ( |
59 |
> OpenPGPEnvironment, |
60 |
> setup_verification_environment |
61 |
> ) |
62 |
> |
63 |
> # 0. Top level directory (repository) contains Manifest, a PGP signature of |
64 |
> # blake2b and sha512 hashes of Manifest.files.gz. |
65 |
> # 1. Manifest.files contains hashes of each category Manifest.gz. |
66 |
> # 2. The category Manifest contains hashes of each package Manifest. |
67 |
> # 3. The package Manifest contains hashes of each package file. |
68 |
> # Must be aware of PMS, e.g. aux tag specifies a file in files/. |
69 |
> |
70 |
> # 0. Check signature of repo Manifest. |
71 |
> # 1. Merge items in Manifest.files, each category Manifest, and each package |
72 |
> # Manifest into one big list. The path must be made absolute. |
73 |
> # 2. Distribute items to threads. |
74 |
> |
75 |
> # To check operation compare directory tree to files appearing in all |
76 |
> # ManifestRecords. |
77 |
> |
78 |
> class ManifestTree(object): |
79 |
> __slots__ = ['_directory', '_manifest_list', '_manifest_records', |
80 |
> '_manifest_results'] |
81 |
> |
82 |
> def __init__(self, directory: str): |
83 |
> self._directory = directory |
84 |
> # Tuples of (base_path, full_path). |
85 |
> self._manifest_list = [] |
86 |
> self._manifest_records = [] |
87 |
> self._manifest_results = [] |
88 |
> |
89 |
> def build_manifest_list(self): |
90 |
> for path, dirs, files in os.walk(self._directory): |
91 |
> #if 'glsa' in path or 'news' in path: |
92 |
> #if 'metadata' in path: |
93 |
> # continue # Skip the metadata directory for now. |
94 |
> # It contains a repository. Current algo barfs on Manifest |
95 |
> # containing only sig. |
96 |
> |
97 |
> if 'Manifest.files.gz' in files: |
98 |
> self._manifest_list += [(path, path + '/Manifest.files.gz')] |
99 |
> if 'Manifest.gz' in files: |
100 |
> self._manifest_list += [(path, path + '/Manifest.gz')] |
101 |
> |
102 |
> if path == self._directory: |
103 |
> continue # Skip the repo manifest. Order matters, fix eventually. |
104 |
> if 'Manifest' in files: |
105 |
> self._manifest_list += [(path, path + '/Manifest')] |
106 |
> |
107 |
> def parse_manifests(self): |
108 |
> td = tempfile.TemporaryDirectory(dir='./') |
109 |
> for manifest in self._manifest_list: |
110 |
> def inner(): |
111 |
> if manifest[1].endswith('.gz'): |
112 |
> name = 'Manifest.files' # Need to also handle Manifest.gz. |
113 |
> path = '{0}/{1}'.format(td.name, name) |
114 |
> subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}' |
115 |
> .format(manifest[1], path)]) |
116 |
> for line in open(path): |
117 |
> mr = ManifestRecord(line) |
118 |
> mr.make_absolute(manifest[0]) |
119 |
> self._manifest_records += [mr] |
120 |
> else: |
121 |
> for line in open(manifest[1]): |
122 |
> if line.startswith('-'): |
123 |
> return # Skip the signed manifest. |
124 |
> mr = ManifestRecord(line) |
125 |
> mr.make_absolute(manifest[0]) |
126 |
> self._manifest_records += [mr] |
127 |
> inner() |
128 |
> |
129 |
> def verify_manifests(self): |
130 |
> for record in self._manifest_records: |
131 |
> self._manifest_results += [record.verify()] |
132 |
> |
133 |
> |
134 |
> class ManifestRecord(object): |
135 |
> __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes'] |
136 |
> |
137 |
> def __init__(self, line: str=None): |
138 |
> self._tag = None |
139 |
> self._abs_path = None |
140 |
> self._path = None |
141 |
> self._size = None |
142 |
> self._hashes = [] |
143 |
> if line: |
144 |
> self.from_string(line) |
145 |
> |
146 |
> def from_string(self, line: str) -> None: |
147 |
> parts = line.split() |
148 |
> if len(parts) == 2: |
149 |
> self._tag = 'ignore' |
150 |
> return |
151 |
> self._tag = parts[0] |
152 |
> self._path = parts[1] |
153 |
> self._size = parts[2] |
154 |
> self._hashes = parts[3:] |
155 |
> |
156 |
> def make_absolute(self, abs_path: str) -> None: |
157 |
> self._abs_path = abs_path |
158 |
> try: |
159 |
> pass |
160 |
> #if 'md5-cache' in abs_path: |
161 |
> # print(abs_path + '/' + self._path) |
162 |
> except TypeError as exc: |
163 |
> return |
164 |
> |
165 |
> def verify(self) -> bool: |
166 |
> if self._tag == 'ignore': |
167 |
> return None |
168 |
> |
169 |
> # Where is best place to do this? Before? |
170 |
> if self._tag.lower() == 'aux': |
171 |
> self._path = self._abs_path + '/files/' + self._path |
172 |
> elif self._abs_path: |
173 |
> self._path = self._abs_path + '/' + self._path |
174 |
> |
175 |
> # Distfiles will not be present. |
176 |
> if self._tag.lower() == 'dist': |
177 |
> return None |
178 |
> |
179 |
> if not os.path.exists(self._path): |
180 |
> return False |
181 |
> |
182 |
> fd = open(self._path, 'rb') |
183 |
> sha512 = hashlib.sha512() |
184 |
> blake2b = hashlib.blake2b() |
185 |
> while True: |
186 |
> d = fd.read(8192) |
187 |
> if not d: |
188 |
> break |
189 |
> sha512.update(d) |
190 |
> blake2b.update(d) |
191 |
> rsha512 = sha512.hexdigest() |
192 |
> rblake2b = blake2b.hexdigest() |
193 |
> |
194 |
> if rblake2b != self._hashes[1]: |
195 |
> return False |
196 |
> |
197 |
> if rsha512 != self._hashes[3]: |
198 |
> return False |
199 |
> |
200 |
> return True |
201 |
> |
202 |
> def __repr__(self) -> str: |
203 |
> #return repr(self._hashes) |
204 |
> return '\t'.join([self._tag, self._size, self._path]) |
205 |
> |
206 |
> def main() -> int: |
207 |
> # Step 0: verify the repo manifest. |
208 |
> #publishers = ['infrastructure@g.o'] |
209 |
> #ev = setup_verification_environment(publishers) |
210 |
> #mf = ManifestFile() |
211 |
> #mf.load(open('/var/db/repos/gentoo/Manifest'), |
212 |
> # verify_openpgp=True, openpgp_env=ev) |
213 |
> #pprint(mf) |
214 |
> #pprint(mf.openpgp_signed) |
215 |
> #pprint(mf.openpgp_signature) |
216 |
> |
217 |
> # Step 1: merge manifests. |
218 |
> #mt = ManifestTree('/var/db/repos/gentoo') |
219 |
> #mt.build_manifest_list() |
220 |
> #mt.parse_manifests() |
221 |
> #mt.verify_manifests() |
222 |
> |
223 |
> glsa = ManifestTree('/var/db/repos/gentoo') |
224 |
> glsa.build_manifest_list() |
225 |
> glsa.parse_manifests() |
226 |
> |
227 |
> start = timeit.default_timer() |
228 |
> glsa.verify_manifests() |
229 |
> end = timeit.default_timer() |
230 |
> pprint(end - start) |
231 |
> |
232 |
> # Handled by checking for None. |
233 |
> #no_ignore = filter(lambda x: x._tag != 'ignore', glsa_manifest_results) |
234 |
> |
235 |
> |
236 |
> #pprint(glsa._manifest_results) |
237 |
> real_files = [x for x in filter(lambda x: x is not None, glsa._manifest_results)] |
238 |
> #pprint(real_files) |
239 |
> pprint(len(glsa._manifest_results)) |
240 |
> pprint(len(real_files)) |
241 |
> |
242 |
> all_files = [] |
243 |
> for path, dirs, files in os.walk('/var/db/repos/gentoo'): |
244 |
> pass |
245 |
> |
246 |
> return 0 |
247 |
> |
248 |
> if __name__ == '__main__': |
249 |
> sys.exit(main()) |
250 |
> ``` |
251 |
> |
252 |
> ```python (wkd.py, likely unneeded but I didn't want to redo these files yet) |
253 |
> #!/usr/bin/env python3 |
254 |
> import sys, hashlib |
255 |
> import dns |
256 |
> from dns import ( |
257 |
> name, query, dnssec, |
258 |
> message, resolver, rdatatype |
259 |
> ) |
260 |
> import shutil, requests |
261 |
> |
262 |
> def check_domain_signature(domain: str) -> bool: |
263 |
> response = dns.resolver.query(domain, dns.rdatatype.NS) |
264 |
> nsname = response.rrset[0] |
265 |
> response = dns.resolver.query(str(nsname), dns.rdatatype.A) |
266 |
> nsaddr = response.rrset[0].to_text() |
267 |
> |
268 |
> # DNSKEY |
269 |
> request = dns.message.make_query(domain, |
270 |
> dns.rdatatype.DNSKEY, want_dnssec=True) |
271 |
> response = dns.query.udp(request, nsaddr) |
272 |
> if response.rcode() != 0: |
273 |
> raise Exception('Unable to request dnskey.') |
274 |
> |
275 |
> answer = response.answer |
276 |
> if len(answer) != 2: |
277 |
> raise Exception('Malformed answer to dnskey query.') |
278 |
> |
279 |
> name = dns.name.from_text(domain) |
280 |
> try: |
281 |
> dns.dnssec.validate(answer[0], answer[1], {name: answer[0]}) |
282 |
> except dns.dnssec.ValidationFailure as exc: |
283 |
> # Validation failed. The raise causes python to abort with status 1. |
284 |
> #raise exc |
285 |
> return False |
286 |
> except AttributeError as exc: |
287 |
> # Validation may have failed; DNSKEY missing signer attribute. dig may report |
288 |
> # domain as valid. |
289 |
> # |
290 |
> # TODO: Additional state where subdomain of valid domain may fail with 3 nested |
291 |
> # KeyErrors. Avoid temptation to wildcard catch. Safer to put in process? |
292 |
> #raise exc |
293 |
> return False |
294 |
> else: |
295 |
> return True |
296 |
> |
297 |
> def hash_localpart(incoming: bytes) -> str: |
298 |
> '''Z-base32 the localpart of an e-mail address |
299 |
> |
300 |
> https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1 |
301 |
> describes why this is needed. |
302 |
> |
303 |
> See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a |
304 |
> description of the z-base32 scheme. |
305 |
> ''' |
306 |
> zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769" |
307 |
> |
308 |
> b = hashlib.sha1(incoming).digest() |
309 |
> ret = "" |
310 |
> assert(len(b) * 8 == 160) |
311 |
> for i in range(0, 160, 5): |
312 |
> byte = i // 8 |
313 |
> offset = i - byte * 8 |
314 |
> # offset | bits remaining in k+1 | right-shift k+1 |
315 |
> # 3 | 0 | x |
316 |
> # 4 | 1 | 7 |
317 |
> # 5 | 2 | 6 |
318 |
> # 6 | 3 | 5 |
319 |
> # 7 | 4 | 4 |
320 |
> if offset < 4: |
321 |
> n = (b[byte] >> (3 - offset)) |
322 |
> else: |
323 |
> n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - offset)) |
324 |
> |
325 |
> ret += zb32[n & 0b11111] |
326 |
> return ret |
327 |
> |
328 |
> def build_web_key_uri(address: str) -> str: |
329 |
> local, remote = address.split('@') |
330 |
> local = hash_localpart(local.encode('utf-8')) |
331 |
> return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \ |
332 |
> local |
333 |
> |
334 |
> def stream_to_file(uri: str, fname: str) -> None: |
335 |
> with requests.get(uri, verify=True, stream=True) as r: |
336 |
> from pprint import pprint |
337 |
> pprint(r.headers) |
338 |
> with open(fname, 'wb') as f: |
339 |
> shutil.copyfileobj(r.raw, f) |
340 |
> ``` |
341 |
> |
342 |
|
343 |
-- |
344 |
Fabian Groffen |
345 |
Gentoo on a different level |