1 |
Dnia June 30, 2020 6:20:37 AM UTC, Fabian Groffen <grobian@g.o> napisał(a): |
2 |
>Hi, |
3 |
> |
4 |
>On 29-06-2020 21:13:43 -0500, Sid Spry wrote: |
5 |
>> Hello, |
6 |
>> |
7 |
>> I have some runnable pseudocode outlining a faster tree verification |
8 |
>algorithm. |
9 |
>> Before I create patches I'd like to see if there is any guidance on |
10 |
>making the |
11 |
>> changes as unobtrusive as possible. If the radical change in |
12 |
>algorithm is |
13 |
>> acceptable I can work on adding the changes. |
14 |
>> |
15 |
>> Instead of composing any kind of structured data out of the portage |
16 |
>tree my |
17 |
>> algorithm just lists all files and then optionally batches them out |
18 |
>to threads. |
19 |
>> There is a noticeable speedup by eliding the tree traversal |
20 |
>operations which |
21 |
>> can be seen when running the algorithm with a single thread and |
22 |
>comparing it to |
23 |
>> the current algorithm in gemato (which should still be discussed |
24 |
>here?). |
25 |
> |
26 |
>I remember something that gemato used to use multiple threads, but |
27 |
>because it totally saturated disk-IO, it was brought back to a single |
28 |
>thread. People were complaining about unusable systems. |
29 |
|
30 |
No, it gave significant speedup even on spinning HDDs. However, it hung for some people due to a bug that I couldn't reproduce.
31 |
|
32 |
> |
33 |
>In any case, can you share your performance results? What speedup did |
34 |
>you see, on warm and hot FS caches? Which type of disk do you use? |
35 |
> |
36 |
>You could compare against qmanifest, which uses OpenMP-based |
37 |
>parallelism while verifying the tree. On SSDs this does help.
38 |
> |
39 |
>Thanks, |
40 |
>Fabian |
41 |
> |
42 |
>> |
43 |
>> Some simple tests like counting all objects traversed and verified |
44 |
>returns the |
45 |
>> same(ish). Once it is put into portage it could be tested in detail. |
46 |
>> |
47 |
>> There is also my partial attempt at removing the brittle interface to |
48 |
>GnuPG |
49 |
>> (it's not as if the current code is badly designed, just that parsing |
50 |
>the |
51 |
>> output of GnuPG directly is likely not the best idea). |
52 |
>> |
53 |
>> Needs gemato, dnspython, and requests. Slightly better than random |
54 |
>code because |
55 |
>> I took inspiration from the existing gemato classes. |
56 |
>> |
57 |
>> ```python (veriftree.py) |
58 |
>> #!/usr/bin/env python3 |
59 |
>> import os, sys, zlib, hashlib, tempfile, shutil, timeit |
60 |
>> import subprocess |
61 |
>> from typing import List |
62 |
>> from pprint import pprint |
63 |
>> |
64 |
>> from gemato.manifest import ( |
65 |
>> ManifestFile, |
66 |
>> ManifestFileEntry, |
67 |
>> ) |
68 |
>> from wkd import ( |
69 |
>> check_domain_signature, |
70 |
>> hash_localpart, |
71 |
>> build_web_key_uri, |
72 |
>> stream_to_file |
73 |
>> ) |
74 |
>> from fetchmedia import ( |
75 |
>> OpenPGPEnvironment, |
76 |
>> setup_verification_environment |
77 |
>> ) |
78 |
>> |
79 |
>> # 0. Top level directory (repository) contains Manifest, a PGP |
80 |
>signature of |
81 |
>> # blake2b and sha512 hashes of Manifest.files.gz. |
82 |
>> # 1. Manifest.files contains hashes of each category Manifest.gz. |
83 |
>> # 2. The category Manifest contains hashes of each package Manifest. |
84 |
>> # 3. The package Manifest contains hashes of each package file. |
85 |
>> # Must be aware of PMS, e.g. aux tag specifies a file in files/. |
86 |
>> |
87 |
>> # 0. Check signature of repo Manifest. |
88 |
>> # 1. Merge items in Manifest.files, each category Manifest, and each |
89 |
>package |
90 |
>> # Manifest into one big list. The path must be made absolute. |
91 |
>> # 2. Distribute items to threads. |
92 |
>> |
93 |
>> # To check operation compare directory tree to files appearing in all |
94 |
>> # ManifestRecords. |
95 |
>> |
96 |
>> class ManifestTree(object): |
97 |
>> __slots__ = ['_directory', '_manifest_list', '_manifest_records', |
98 |
>> '_manifest_results'] |
99 |
>> |
100 |
>> def __init__(self, directory: str): |
101 |
>> self._directory = directory |
102 |
>> # Tuples of (base_path, full_path). |
103 |
>> self._manifest_list = [] |
104 |
>> self._manifest_records = [] |
105 |
>> self._manifest_results = [] |
106 |
>> |
107 |
>> def build_manifest_list(self): |
108 |
>> for path, dirs, files in os.walk(self._directory): |
109 |
>> #if 'glsa' in path or 'news' in path: |
110 |
>> #if 'metadata' in path: |
111 |
>> # continue # Skip the metadata directory for now. |
112 |
>> # It contains a repository. Current algo barfs on |
113 |
>Manifest |
114 |
>> # containing only sig. |
115 |
>> |
116 |
>> if 'Manifest.files.gz' in files: |
117 |
>> self._manifest_list += [(path, path + |
118 |
>'/Manifest.files.gz')] |
119 |
>> if 'Manifest.gz' in files: |
120 |
>> self._manifest_list += [(path, path + |
121 |
>'/Manifest.gz')] |
122 |
>> |
123 |
>> if path == self._directory: |
124 |
>> continue # Skip the repo manifest. Order matters, fix |
125 |
>eventually. |
126 |
>> if 'Manifest' in files: |
127 |
>> self._manifest_list += [(path, path + '/Manifest')] |
128 |
>> |
129 |
>> def parse_manifests(self): |
130 |
>> td = tempfile.TemporaryDirectory(dir='./') |
131 |
>> for manifest in self._manifest_list: |
132 |
>> def inner(): |
133 |
>> if manifest[1].endswith('.gz'): |
134 |
>> name = 'Manifest.files' # Need to also handle |
135 |
>Manifest.gz. |
136 |
>> path = '{0}/{1}'.format(td.name, name) |
137 |
>> subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}' |
138 |
>> .format(manifest[1], path)]) |
139 |
>> for line in open(path): |
140 |
>> mr = ManifestRecord(line) |
141 |
>> mr.make_absolute(manifest[0]) |
142 |
>> self._manifest_records += [mr] |
143 |
>> else: |
144 |
>> for line in open(manifest[1]): |
145 |
>> if line.startswith('-'): |
146 |
>> return # Skip the signed manifest. |
147 |
>> mr = ManifestRecord(line) |
148 |
>> mr.make_absolute(manifest[0]) |
149 |
>> self._manifest_records += [mr] |
150 |
>> inner() |
151 |
>> |
152 |
>> def verify_manifests(self): |
153 |
>> for record in self._manifest_records: |
154 |
>> self._manifest_results += [record.verify()] |
155 |
>> |
156 |
>> |
157 |
>> class ManifestRecord(object): |
158 |
>> __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes'] |
159 |
>> |
160 |
>> def __init__(self, line: str=None): |
161 |
>> self._tag = None |
162 |
>> self._abs_path = None |
163 |
>> self._path = None |
164 |
>> self._size = None |
165 |
>> self._hashes = [] |
166 |
>> if line: |
167 |
>> self.from_string(line) |
168 |
>> |
169 |
>> def from_string(self, line: str) -> None: |
170 |
>> parts = line.split() |
171 |
>> if len(parts) == 2: |
172 |
>> self._tag = 'ignore' |
173 |
>> return |
174 |
>> self._tag = parts[0] |
175 |
>> self._path = parts[1] |
176 |
>> self._size = parts[2] |
177 |
>> self._hashes = parts[3:] |
178 |
>> |
179 |
>> def make_absolute(self, abs_path: str) -> None: |
180 |
>> self._abs_path = abs_path |
181 |
>> try: |
182 |
>> pass |
183 |
>> #if 'md5-cache' in abs_path: |
184 |
>> # print(abs_path + '/' + self._path) |
185 |
>> except TypeError as exc: |
186 |
>> return |
187 |
>> |
188 |
>> def verify(self) -> bool: |
189 |
>> if self._tag == 'ignore': |
190 |
>> return None |
191 |
>> |
192 |
>> # Where is best place to do this? Before? |
193 |
>> if self._tag.lower() == 'aux': |
194 |
>> self._path = self._abs_path + '/files/' + self._path |
195 |
>> elif self._abs_path: |
196 |
>> self._path = self._abs_path + '/' + self._path |
197 |
>> |
198 |
>> # Distfiles will not be present. |
199 |
>> if self._tag.lower() == 'dist': |
200 |
>> return None |
201 |
>> |
202 |
>> if not os.path.exists(self._path): |
203 |
>> return False |
204 |
>> |
205 |
>> fd = open(self._path, 'rb') |
206 |
>> sha512 = hashlib.sha512() |
207 |
>> blake2b = hashlib.blake2b() |
208 |
>> while True: |
209 |
>> d = fd.read(8192) |
210 |
>> if not d: |
211 |
>> break |
212 |
>> sha512.update(d) |
213 |
>> blake2b.update(d) |
214 |
>> rsha512 = sha512.hexdigest() |
215 |
>> rblake2b = blake2b.hexdigest() |
216 |
>> |
217 |
>> if rblake2b != self._hashes[1]: |
218 |
>> return False |
219 |
>> |
220 |
>> if rsha512 != self._hashes[3]: |
221 |
>> return False |
222 |
>> |
223 |
>> return True |
224 |
>> |
225 |
>> def __repr__(self) -> str: |
226 |
>> #return repr(self._hashes) |
227 |
>> return '\t'.join([self._tag, self._size, self._path]) |
228 |
>> |
229 |
>> def main() -> int: |
230 |
>> # Step 0: verify the repo manifest. |
231 |
>> #publishers = ['infrastructure@g.o'] |
232 |
>> #ev = setup_verification_environment(publishers) |
233 |
>> #mf = ManifestFile() |
234 |
>> #mf.load(open('/var/db/repos/gentoo/Manifest'), |
235 |
>> # verify_openpgp=True, openpgp_env=ev) |
236 |
>> #pprint(mf) |
237 |
>> #pprint(mf.openpgp_signed) |
238 |
>> #pprint(mf.openpgp_signature) |
239 |
>> |
240 |
>> # Step 1: merge manifests. |
241 |
>> #mt = ManifestTree('/var/db/repos/gentoo') |
242 |
>> #mt.build_manifest_list() |
243 |
>> #mt.parse_manifests() |
244 |
>> #mt.verify_manifests() |
245 |
>> |
246 |
>> glsa = ManifestTree('/var/db/repos/gentoo') |
247 |
>> glsa.build_manifest_list() |
248 |
>> glsa.parse_manifests() |
249 |
>> |
250 |
>> start = timeit.default_timer() |
251 |
>> glsa.verify_manifests() |
252 |
>> end = timeit.default_timer() |
253 |
>> pprint(end - start) |
254 |
>> |
255 |
>> # Handled by checking for None. |
256 |
>> #no_ignore = filter(lambda x: x._tag != 'ignore', |
257 |
>glsa_manifest_results) |
258 |
>> |
259 |
>> |
260 |
>> #pprint(glsa._manifest_results) |
261 |
>> real_files = [x for x in filter(lambda x: x is not None, |
262 |
>glsa._manifest_results)] |
263 |
>> #pprint(real_files) |
264 |
>> pprint(len(glsa._manifest_results)) |
265 |
>> pprint(len(real_files)) |
266 |
>> |
267 |
>> all_files = [] |
268 |
>> for path, dirs, files in os.walk('/var/db/repos/gentoo'): |
269 |
>> pass |
270 |
>> |
271 |
>> return 0 |
272 |
>> |
273 |
>> if __name__ == '__main__': |
274 |
>> sys.exit(main()) |
275 |
>> ``` |
276 |
>> |
277 |
>> ```python (wkd.py, likely unneeded but I didn't want to redo these |
278 |
>files yet) |
279 |
>> #!/usr/bin/env python3 |
280 |
>> import sys, hashlib |
281 |
>> import dns |
282 |
>> from dns import ( |
283 |
>> name, query, dnssec, |
284 |
>> message, resolver, rdatatype |
285 |
>> ) |
286 |
>> import shutil, requests |
287 |
>> |
288 |
>> def check_domain_signature(domain: str) -> bool: |
289 |
>> response = dns.resolver.query(domain, dns.rdatatype.NS) |
290 |
>> nsname = response.rrset[0] |
291 |
>> response = dns.resolver.query(str(nsname), dns.rdatatype.A) |
292 |
>> nsaddr = response.rrset[0].to_text() |
293 |
>> |
294 |
>> # DNSKEY |
295 |
>> request = dns.message.make_query(domain, |
296 |
>> dns.rdatatype.DNSKEY, want_dnssec=True) |
297 |
>> response = dns.query.udp(request, nsaddr) |
298 |
>> if response.rcode() != 0: |
299 |
>> raise Exception('Unable to request dnskey.') |
300 |
>> |
301 |
>> answer = response.answer |
302 |
>> if len(answer) != 2: |
303 |
>> raise Exception('Malformed answer to dnskey query.') |
304 |
>> |
305 |
>> name = dns.name.from_text(domain) |
306 |
>> try: |
307 |
>> dns.dnssec.validate(answer[0], answer[1], {name: answer[0]}) |
308 |
>> except dns.dnssec.ValidationFailure as exc: |
309 |
>> # Validation failed. The raise causes python to abort with |
310 |
>status 1. |
311 |
>> #raise exc |
312 |
>> return False |
313 |
>> except AttributeError as exc: |
314 |
>> # Validation may have failed; DNSKEY missing signer |
315 |
>attribute. dig may report |
316 |
>> # domain as valid. |
317 |
>> # |
318 |
>> # TODO: Additional state where subdomain of valid domain may |
319 |
>fail with 3 nested |
320 |
>> # KeyErrors. Avoid temptation to wildcard catch. Safer to put |
321 |
>in process? |
322 |
>> #raise exc |
323 |
>> return False |
324 |
>> else: |
325 |
>> return True |
326 |
>> |
327 |
>> def hash_localpart(incoming: bytes) -> str: |
328 |
>> '''Z-base32 the localpart of an e-mail address |
329 |
>> |
330 |
>> |
331 |
>https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1 |
332 |
>> describes why this is needed. |
333 |
>> |
334 |
>> See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a |
335 |
>> description of the z-base32 scheme. |
336 |
>> ''' |
337 |
>> zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769" |
338 |
>> |
339 |
>> b = hashlib.sha1(incoming).digest() |
340 |
>> ret = "" |
341 |
>> assert(len(b) * 8 == 160) |
342 |
>> for i in range(0, 160, 5): |
343 |
>> byte = i // 8 |
344 |
>> offset = i - byte * 8 |
345 |
>> # offset | bits remaining in k+1 | right-shift k+1 |
346 |
>> # 3 | 0 | x |
347 |
>> # 4 | 1 | 7 |
348 |
>> # 5 | 2 | 6 |
349 |
>> # 6 | 3 | 5 |
350 |
>> # 7 | 4 | 4 |
351 |
>> if offset < 4: |
352 |
>> n = (b[byte] >> (3 - offset)) |
353 |
>> else: |
354 |
>> n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - |
355 |
>offset)) |
356 |
>> |
357 |
>> ret += zb32[n & 0b11111] |
358 |
>> return ret |
359 |
>> |
360 |
>> def build_web_key_uri(address: str) -> str: |
361 |
>> local, remote = address.split('@') |
362 |
>> local = hash_localpart(local.encode('utf-8')) |
363 |
>> return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \ |
364 |
>> local |
365 |
>> |
366 |
>> def stream_to_file(uri: str, fname: str) -> None: |
367 |
>> with requests.get(uri, verify=True, stream=True) as r: |
368 |
>> from pprint import pprint |
369 |
>> pprint(r.headers) |
370 |
>> with open(fname, 'wb') as f: |
371 |
>> shutil.copyfileobj(r.raw, f) |
372 |
>> ``` |
373 |
>> |
374 |
|
375 |
|
376 |
-- |
377 |
Best regards, |
378 |
Michał Górny |