1 |
Dnia June 30, 2020 2:13:43 AM UTC, Sid Spry <sid@××××.us> napisał(a): |
2 |
>Hello, |
3 |
> |
4 |
>I have some runnable pseudocode outlining a faster tree verification |
5 |
>algorithm. |
6 |
>Before I create patches I'd like to see if there is any guidance on |
7 |
>making the |
8 |
>changes as unobtrusive as possible. If the radical change in algorithm |
9 |
>is |
10 |
>acceptable I can work on adding the changes. |
11 |
> |
12 |
>Instead of composing any kind of structured data out of the portage |
13 |
>tree my |
14 |
>algorithm just lists all files and then optionally batches them out to |
15 |
>threads. |
16 |
>There is a noticeable speedup by eliding the tree traversal operations |
17 |
>which |
18 |
>can be seen when running the algorithm with a single thread and |
19 |
>comparing it to |
20 |
>the current algorithm in gemato (which should still be discussed |
21 |
>here?). |
22 |
|
23 |
Without reading the code: does your algorithm correctly detect extraneous files? |
24 |
|
25 |
> |
26 |
>Some simple tests like counting all objects traversed and verified |
27 |
>returns the |
28 |
>same(ish). Once it is put into portage it could be tested in detail. |
29 |
> |
30 |
>There is also my partial attempt at removing the brittle interface to |
31 |
>GnuPG |
32 |
>(it's not as if the current code is badly designed, just that parsing |
33 |
>the |
34 |
>output of GnuPG directly is likely not the best idea). |
35 |
|
36 |
The 'brittle interface' is well-defined machine-readable output. |
37 |
|
38 |
> |
39 |
>Needs gemato, dnspython, and requests. Slightly better than random code |
40 |
>because |
41 |
>I took inspiration from the existing gemato classes. |
42 |
|
43 |
The code makes a lot of brittle assumptions about the structure. The GLEP was specifically designed to avoid that and let us adjust the structure in the future to meet our needs. |
44 |
|
45 |
> |
46 |
>```python (veriftree.py) |
47 |
>#!/usr/bin/env python3 |
48 |
>import os, sys, zlib, hashlib, tempfile, shutil, timeit |
49 |
>import subprocess |
50 |
>from typing import List |
51 |
>from pprint import pprint |
52 |
> |
53 |
>from gemato.manifest import ( |
54 |
> ManifestFile, |
55 |
> ManifestFileEntry, |
56 |
>) |
57 |
>from wkd import ( |
58 |
> check_domain_signature, |
59 |
> hash_localpart, |
60 |
> build_web_key_uri, |
61 |
> stream_to_file |
62 |
>) |
63 |
>from fetchmedia import ( |
64 |
> OpenPGPEnvironment, |
65 |
> setup_verification_environment |
66 |
>) |
67 |
> |
68 |
># 0. Top level directory (repository) contains Manifest, a PGP |
69 |
>signature of |
70 |
># blake2b and sha512 hashes of Manifest.files.gz. |
71 |
># 1. Manifest.files contains hashes of each category Manifest.gz. |
72 |
># 2. The category Manifest contains hashes of each package Manifest. |
73 |
># 3. The package Manifest contains hashes of each package file. |
74 |
># Must be aware of PMS, e.g. aux tag specifies a file in files/. |
75 |
> |
76 |
># 0. Check signature of repo Manifest. |
77 |
># 1. Merge items in Manifest.files, each category Manifest, and each |
78 |
>package |
79 |
># Manifest into one big list. The path must be made absolute. |
80 |
># 2. Distribute items to threads. |
81 |
> |
82 |
># To check operation compare directory tree to files appearing in all |
83 |
># ManifestRecords. |
84 |
> |
85 |
>class ManifestTree(object): |
86 |
> __slots__ = ['_directory', '_manifest_list', '_manifest_records', |
87 |
> '_manifest_results'] |
88 |
> |
89 |
> def __init__(self, directory: str): |
90 |
> self._directory = directory |
91 |
> # Tuples of (base_path, full_path). |
92 |
> self._manifest_list = [] |
93 |
> self._manifest_records = [] |
94 |
> self._manifest_results = [] |
95 |
> |
96 |
> def build_manifest_list(self): |
97 |
> for path, dirs, files in os.walk(self._directory): |
98 |
> #if 'glsa' in path or 'news' in path: |
99 |
> #if 'metadata' in path: |
100 |
> # continue # Skip the metadata directory for now. |
101 |
> # It contains a repository. Current algo barfs on Manifest |
102 |
> # containing only sig. |
103 |
> |
104 |
> if 'Manifest.files.gz' in files: |
105 |
> self._manifest_list += [(path, path + '/Manifest.files.gz')] |
106 |
> if 'Manifest.gz' in files: |
107 |
> self._manifest_list += [(path, path + '/Manifest.gz')] |
108 |
> |
109 |
> if path == self._directory: |
110 |
> continue # Skip the repo manifest. Order matters, fix eventually. |
111 |
> if 'Manifest' in files: |
112 |
> self._manifest_list += [(path, path + '/Manifest')] |
113 |
> |
114 |
> def parse_manifests(self): |
115 |
> td = tempfile.TemporaryDirectory(dir='./') |
116 |
> for manifest in self._manifest_list: |
117 |
> def inner(): |
118 |
> if manifest[1].endswith('.gz'): |
119 |
> name = 'Manifest.files' # Need to also handle Manifest.gz. |
120 |
> path = '{0}/{1}'.format(td.name, name) |
121 |
> subprocess.run(['sh', '-c', 'gunzip -c {0} > {1}' |
122 |
> .format(manifest[1], path)]) |
123 |
> for line in open(path): |
124 |
> mr = ManifestRecord(line) |
125 |
> mr.make_absolute(manifest[0]) |
126 |
> self._manifest_records += [mr] |
127 |
> else: |
128 |
> for line in open(manifest[1]): |
129 |
> if line.startswith('-'): |
130 |
> return # Skip the signed manifest. |
131 |
> mr = ManifestRecord(line) |
132 |
> mr.make_absolute(manifest[0]) |
133 |
> self._manifest_records += [mr] |
134 |
> inner() |
135 |
> |
136 |
> def verify_manifests(self): |
137 |
> for record in self._manifest_records: |
138 |
> self._manifest_results += [record.verify()] |
139 |
> |
140 |
> |
141 |
>class ManifestRecord(object): |
142 |
> __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes'] |
143 |
> |
144 |
> def __init__(self, line: str=None): |
145 |
> self._tag = None |
146 |
> self._abs_path = None |
147 |
> self._path = None |
148 |
> self._size = None |
149 |
> self._hashes = [] |
150 |
> if line: |
151 |
> self.from_string(line) |
152 |
> |
153 |
> def from_string(self, line: str) -> None: |
154 |
> parts = line.split() |
155 |
> if len(parts) == 2: |
156 |
> self._tag = 'ignore' |
157 |
> return |
158 |
> self._tag = parts[0] |
159 |
> self._path = parts[1] |
160 |
> self._size = parts[2] |
161 |
> self._hashes = parts[3:] |
162 |
> |
163 |
> def make_absolute(self, abs_path: str) -> None: |
164 |
> self._abs_path = abs_path |
165 |
> try: |
166 |
> pass |
167 |
> #if 'md5-cache' in abs_path: |
168 |
> # print(abs_path + '/' + self._path) |
169 |
> except TypeError as exc: |
170 |
> return |
171 |
> |
172 |
> def verify(self) -> bool: |
173 |
> if self._tag == 'ignore': |
174 |
> return None |
175 |
> |
176 |
> # Where is best place to do this? Before? |
177 |
> if self._tag.lower() == 'aux': |
178 |
> self._path = self._abs_path + '/files/' + self._path |
179 |
> elif self._abs_path: |
180 |
> self._path = self._abs_path + '/' + self._path |
181 |
> |
182 |
> # Distfiles will not be present. |
183 |
> if self._tag.lower() == 'dist': |
184 |
> return None |
185 |
> |
186 |
> if not os.path.exists(self._path): |
187 |
> return False |
188 |
> |
189 |
> fd = open(self._path, 'rb') |
190 |
> sha512 = hashlib.sha512() |
191 |
> blake2b = hashlib.blake2b() |
192 |
> while True: |
193 |
> d = fd.read(8192) |
194 |
> if not d: |
195 |
> break |
196 |
> sha512.update(d) |
197 |
> blake2b.update(d) |
198 |
> rsha512 = sha512.hexdigest() |
199 |
> rblake2b = blake2b.hexdigest() |
200 |
> |
201 |
> if rblake2b != self._hashes[1]: |
202 |
> return False |
203 |
> |
204 |
> if rsha512 != self._hashes[3]: |
205 |
> return False |
206 |
> |
207 |
> return True |
208 |
> |
209 |
> def __repr__(self) -> str: |
210 |
> #return repr(self._hashes) |
211 |
> return '\t'.join([self._tag, self._size, self._path]) |
212 |
> |
213 |
>def main() -> int: |
214 |
> # Step 0: verify the repo manifest. |
215 |
> #publishers = ['infrastructure@g.o'] |
216 |
> #ev = setup_verification_environment(publishers) |
217 |
> #mf = ManifestFile() |
218 |
> #mf.load(open('/var/db/repos/gentoo/Manifest'), |
219 |
> # verify_openpgp=True, openpgp_env=ev) |
220 |
> #pprint(mf) |
221 |
> #pprint(mf.openpgp_signed) |
222 |
> #pprint(mf.openpgp_signature) |
223 |
> |
224 |
> # Step 1: merge manifests. |
225 |
> #mt = ManifestTree('/var/db/repos/gentoo') |
226 |
> #mt.build_manifest_list() |
227 |
> #mt.parse_manifests() |
228 |
> #mt.verify_manifests() |
229 |
> |
230 |
> glsa = ManifestTree('/var/db/repos/gentoo') |
231 |
> glsa.build_manifest_list() |
232 |
> glsa.parse_manifests() |
233 |
> |
234 |
> start = timeit.default_timer() |
235 |
> glsa.verify_manifests() |
236 |
> end = timeit.default_timer() |
237 |
> pprint(end - start) |
238 |
> |
239 |
> # Handled by checking for None. |
240 |
>#no_ignore = filter(lambda x: x._tag != 'ignore', |
241 |
>glsa_manifest_results) |
242 |
> |
243 |
> |
244 |
> #pprint(glsa._manifest_results) |
245 |
>real_files = [x for x in filter(lambda x: x is not None, |
246 |
>glsa._manifest_results)] |
247 |
> #pprint(real_files) |
248 |
> pprint(len(glsa._manifest_results)) |
249 |
> pprint(len(real_files)) |
250 |
> |
251 |
> all_files = [] |
252 |
> for path, dirs, files in os.walk('/var/db/repos/gentoo'): |
253 |
> pass |
254 |
> |
255 |
> return 0 |
256 |
> |
257 |
>if __name__ == '__main__': |
258 |
> sys.exit(main()) |
259 |
>``` |
260 |
> |
261 |
>```python (wkd.py, likely unneeded but I didn't want to redo these |
262 |
>files yet) |
263 |
>#!/usr/bin/env python3 |
264 |
>import sys, hashlib |
265 |
>import dns |
266 |
>from dns import ( |
267 |
> name, query, dnssec, |
268 |
> message, resolver, rdatatype |
269 |
>) |
270 |
>import shutil, requests |
271 |
> |
272 |
>def check_domain_signature(domain: str) -> bool: |
273 |
> response = dns.resolver.query(domain, dns.rdatatype.NS) |
274 |
> nsname = response.rrset[0] |
275 |
> response = dns.resolver.query(str(nsname), dns.rdatatype.A) |
276 |
> nsaddr = response.rrset[0].to_text() |
277 |
> |
278 |
> # DNSKEY |
279 |
> request = dns.message.make_query(domain, |
280 |
> dns.rdatatype.DNSKEY, want_dnssec=True) |
281 |
> response = dns.query.udp(request, nsaddr) |
282 |
> if response.rcode() != 0: |
283 |
> raise Exception('Unable to request dnskey.') |
284 |
> |
285 |
> answer = response.answer |
286 |
> if len(answer) != 2: |
287 |
> raise Exception('Malformed answer to dnskey query.') |
288 |
> |
289 |
> name = dns.name.from_text(domain) |
290 |
> try: |
291 |
> dns.dnssec.validate(answer[0], answer[1], {name: answer[0]}) |
292 |
> except dns.dnssec.ValidationFailure as exc: |
293 |
> # Validation failed. The raise causes python to abort with status 1. |
294 |
> #raise exc |
295 |
> return False |
296 |
> except AttributeError as exc: |
297 |
># Validation may have failed; DNSKEY missing signer attribute. dig may |
298 |
>report |
299 |
> # domain as valid. |
300 |
> # |
301 |
># TODO: Additional state where subdomain of valid domain may fail with |
302 |
>3 nested |
303 |
># KeyErrors. Avoid temptation to wildcard catch. Safer to put in |
304 |
>process? |
305 |
> #raise exc |
306 |
> return False |
307 |
> else: |
308 |
> return True |
309 |
> |
310 |
>def hash_localpart(incoming: bytes) -> str: |
311 |
> '''Z-base32 the localpart of an e-mail address |
312 |
> |
313 |
>https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1 |
314 |
> describes why this is needed. |
315 |
> |
316 |
> See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a |
317 |
> description of the z-base32 scheme. |
318 |
> ''' |
319 |
> zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769" |
320 |
> |
321 |
> b = hashlib.sha1(incoming).digest() |
322 |
> ret = "" |
323 |
> assert(len(b) * 8 == 160) |
324 |
> for i in range(0, 160, 5): |
325 |
> byte = i // 8 |
326 |
> offset = i - byte * 8 |
327 |
> # offset | bits remaining in k+1 | right-shift k+1 |
328 |
> # 3 | 0 | x |
329 |
> # 4 | 1 | 7 |
330 |
> # 5 | 2 | 6 |
331 |
> # 6 | 3 | 5 |
332 |
> # 7 | 4 | 4 |
333 |
> if offset < 4: |
334 |
> n = (b[byte] >> (3 - offset)) |
335 |
> else: |
336 |
> n = (b[byte] << (offset - 3)) + (b[byte + 1] >> (11 - offset)) |
337 |
> |
338 |
> ret += zb32[n & 0b11111] |
339 |
> return ret |
340 |
> |
341 |
>def build_web_key_uri(address: str) -> str: |
342 |
> local, remote = address.split('@') |
343 |
> local = hash_localpart(local.encode('utf-8')) |
344 |
> return 'https://' + remote + '/.well-known/openpgpkey/hu/' + \ |
345 |
> local |
346 |
> |
347 |
>def stream_to_file(uri: str, fname: str) -> None: |
348 |
> with requests.get(uri, verify=True, stream=True) as r: |
349 |
> from pprint import pprint |
350 |
> pprint(r.headers) |
351 |
> with open(fname, 'wb') as f: |
352 |
> shutil.copyfileobj(r.raw, f) |
353 |
>``` |
354 |
|
355 |
|
356 |
-- |
357 |
Best regards, |
358 |
Michał Górny |