1 |
Hello, |
2 |
|
3 |
I have some runnable pseudocode outlining a faster tree verification algorithm. |
4 |
Before I create patches I'd like to see if there is any guidance on making the |
5 |
changes as unobtrusive as possible. If the radical change in algorithm is |
6 |
acceptable I can work on adding the changes. |
7 |
|
8 |
Instead of composing any kind of structured data out of the portage tree, my
algorithm just lists all files and then optionally batches them out to threads.
10 |
There is a noticeable speedup by eliding the tree traversal operations which |
11 |
can be seen when running the algorithm with a single thread and comparing it to |
12 |
the current algorithm in gemato (which should still be discussed here?). |
13 |
|
14 |
Some simple tests, like counting all objects traversed and verified, return
the same(ish) counts. Once it is put into portage it could be tested in detail.
16 |
|
17 |
There is also my partial attempt at removing the brittle interface to GnuPG |
18 |
(it's not as if the current code is badly designed, just that parsing the |
19 |
output of GnuPG directly is likely not the best idea). |
20 |
|
21 |
Needs gemato, dnspython, and requests. Slightly better than random code because |
22 |
I took inspiration from the existing gemato classes. |
23 |
|
24 |
```python (veriftree.py) |
25 |
#!/usr/bin/env python3 |
26 |
import os, sys, zlib, hashlib, tempfile, shutil, timeit |
27 |
import subprocess |
28 |
from typing import List |
29 |
from pprint import pprint |
30 |
|
31 |
from gemato.manifest import ( |
32 |
ManifestFile, |
33 |
ManifestFileEntry, |
34 |
) |
35 |
from wkd import ( |
36 |
check_domain_signature, |
37 |
hash_localpart, |
38 |
build_web_key_uri, |
39 |
stream_to_file |
40 |
) |
41 |
from fetchmedia import ( |
42 |
OpenPGPEnvironment, |
43 |
setup_verification_environment |
44 |
) |
45 |
|
46 |
# 0. Top level directory (repository) contains Manifest, a PGP signature of |
47 |
# blake2b and sha512 hashes of Manifest.files.gz. |
48 |
# 1. Manifest.files contains hashes of each category Manifest.gz. |
49 |
# 2. The category Manifest contains hashes of each package Manifest. |
50 |
# 3. The package Manifest contains hashes of each package file. |
51 |
# Must be aware of PMS, e.g. aux tag specifies a file in files/. |
52 |
|
53 |
# 0. Check signature of repo Manifest. |
54 |
# 1. Merge items in Manifest.files, each category Manifest, and each package |
55 |
# Manifest into one big list. The path must be made absolute. |
56 |
# 2. Distribute items to threads. |
57 |
|
58 |
# To check operation compare directory tree to files appearing in all |
59 |
# ManifestRecords. |
60 |
|
61 |
class ManifestTree(object):
    """Collect, parse, and verify every Manifest in a portage repository.

    Walks the repository once, recording the path of every Manifest file,
    flattens their entries into a single list of ManifestRecord objects,
    and verifies each record against the on-disk file.
    """

    __slots__ = ['_directory', '_manifest_list', '_manifest_records',
                 '_manifest_results']

    def __init__(self, directory: str):
        self._directory = directory
        # Tuples of (base_path, full_path).
        self._manifest_list = []
        self._manifest_records = []
        self._manifest_results = []

    def build_manifest_list(self):
        """Record every Manifest, Manifest.gz, and Manifest.files.gz path."""
        for path, dirs, files in os.walk(self._directory):
            if 'Manifest.files.gz' in files:
                self._manifest_list.append((path, path + '/Manifest.files.gz'))
            if 'Manifest.gz' in files:
                self._manifest_list.append((path, path + '/Manifest.gz'))

            if path == self._directory:
                continue  # Skip the repo manifest. Order matters, fix eventually.
            if 'Manifest' in files:
                self._manifest_list.append((path, path + '/Manifest'))

    def parse_manifests(self):
        """Flatten all listed manifests into one list of ManifestRecords.

        Gzipped manifests are read with the stdlib gzip module instead of
        shelling out to gunzip: no subprocess, no temp file, and no shell
        command assembled from a filesystem path.
        """
        import gzip  # local import: keeps the module import line untouched

        for base_path, manifest_path in self._manifest_list:
            if manifest_path.endswith('.gz'):
                with gzip.open(manifest_path, 'rt') as fp:
                    for line in fp:
                        record = ManifestRecord(line)
                        record.make_absolute(base_path)
                        self._manifest_records.append(record)
            else:
                with open(manifest_path) as fp:
                    for line in fp:
                        if line.startswith('-'):
                            break  # Clearsigned manifest; skip it entirely.
                        record = ManifestRecord(line)
                        record.make_absolute(base_path)
                        self._manifest_records.append(record)

    def verify_manifests(self):
        """Verify each parsed record, collecting one result per record."""
        for record in self._manifest_records:
            self._manifest_results.append(record.verify())
115 |
|
116 |
|
117 |
class ManifestRecord(object):
    """A single entry (one line) of a gentoo Manifest file.

    A full entry looks like:

        DATA app-foo/bar-1.ebuild 1234 BLAKE2B <hex> SHA512 <hex>

    i.e. a tag, a path relative to the Manifest, a size, and then
    alternating hash-name/hash-value pairs.
    """

    __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes']

    def __init__(self, line: str = None):
        self._tag = None        # e.g. DATA, EBUILD, AUX, DIST, or 'ignore'
        self._abs_path = None   # directory of the Manifest that listed us
        self._path = None       # path relative to the Manifest
        self._size = None       # declared size in bytes (kept as str)
        self._hashes = []       # flat [name, value, name, value, ...] list
        if line:
            self.from_string(line)

    def from_string(self, line: str) -> None:
        """Parse one Manifest line; too-short lines are tagged 'ignore'.

        The original check only caught exactly-two-token lines; blank or
        one-token lines would raise IndexError below, so treat anything
        with fewer than three tokens as ignorable.
        """
        parts = line.split()
        if len(parts) < 3:
            self._tag = 'ignore'
            return
        self._tag = parts[0]
        self._path = parts[1]
        self._size = parts[2]
        self._hashes = parts[3:]

    def make_absolute(self, abs_path: str) -> None:
        """Remember the directory of the Manifest this entry came from."""
        self._abs_path = abs_path

    def verify(self) -> bool:
        """Hash the on-disk file and compare with the declared hashes.

        Returns True/False for checkable entries, or None for entries
        that cannot be checked (ignored lines and DIST entries).
        """
        if self._tag == 'ignore':
            return None

        tag = self._tag.lower()

        # Distfiles will not be present in the tree.
        if tag == 'dist':
            return None

        # Build the full path locally instead of mutating self._path:
        # the old in-place update double-prepended the base path when
        # verify() was called more than once.
        if tag == 'aux':
            # PMS: aux entries name files under files/.
            path = self._abs_path + '/files/' + self._path
        elif self._abs_path:
            path = self._abs_path + '/' + self._path
        else:
            path = self._path

        if not os.path.exists(path):
            return False

        sha512 = hashlib.sha512()
        blake2b = hashlib.blake2b()
        with open(path, 'rb') as fd:
            while True:
                chunk = fd.read(8192)
                if not chunk:
                    break
                sha512.update(chunk)
                blake2b.update(chunk)

        # Look the declared hashes up by name instead of assuming the
        # fixed order BLAKE2B, SHA512 at indices 1 and 3.
        declared = dict(zip(self._hashes[0::2], self._hashes[1::2]))
        expected_blake2b = declared.get('BLAKE2B')
        expected_sha512 = declared.get('SHA512')

        if expected_blake2b is not None and \
                blake2b.hexdigest() != expected_blake2b:
            return False
        if expected_sha512 is not None and \
                sha512.hexdigest() != expected_sha512:
            return False

        return True

    def __repr__(self) -> str:
        # str() guards against None fields on 'ignore' records, which
        # previously made join() raise TypeError.
        return '\t'.join(str(part)
                         for part in (self._tag, self._size, self._path))
188 |
|
189 |
def main() -> int:
    """Build, parse, and verify every manifest in the gentoo repository,
    timing the verification pass and printing summary counts.

    Returns 0 so the process exit status reports success.
    """
    # Step 0 (verifying the repo Manifest's OpenPGP signature via
    # setup_verification_environment) is disabled for now.

    # Step 1: merge all manifests into one flat record list.
    tree = ManifestTree('/var/db/repos/gentoo')
    tree.build_manifest_list()
    tree.parse_manifests()

    # Step 2: verify, timing only the verification pass.
    start = timeit.default_timer()
    tree.verify_manifests()
    end = timeit.default_timer()
    pprint(end - start)

    # None marks entries that could not be checked (ignored lines and
    # DIST entries); everything else corresponds to a real on-disk file.
    real_files = [result for result in tree._manifest_results
                  if result is not None]
    pprint(len(tree._manifest_results))
    pprint(len(real_files))

    # NOTE(review): the original ended with a full os.walk of the repo
    # whose body was `pass` -- a complete tree traversal doing nothing.
    # Removed; reinstate a real comparison of the directory tree against
    # the ManifestRecords when that check is implemented.

    return 0


if __name__ == '__main__':
    sys.exit(main())
233 |
``` |
234 |
|
235 |
```python (wkd.py, likely unneeded but I didn't want to redo these files yet) |
236 |
#!/usr/bin/env python3 |
237 |
import sys, hashlib |
238 |
import dns |
239 |
from dns import ( |
240 |
name, query, dnssec, |
241 |
message, resolver, rdatatype |
242 |
) |
243 |
import shutil, requests |
244 |
|
245 |
def check_domain_signature(domain: str) -> bool:
    """Return True when *domain*'s DNSKEY RRset validates under DNSSEC.

    Resolves the domain's first nameserver, queries it directly for the
    DNSKEY RRset with DNSSEC data, and validates the RRset against its
    own keys. Returns False when validation fails; raises Exception on
    malformed responses.
    """
    # NOTE(review): dns.resolver.query is deprecated in dnspython >= 2.0
    # in favour of dns.resolver.resolve -- confirm the targeted version.
    response = dns.resolver.query(domain, dns.rdatatype.NS)
    nsname = response.rrset[0]
    response = dns.resolver.query(str(nsname), dns.rdatatype.A)
    nsaddr = response.rrset[0].to_text()

    # Ask the authoritative server for the DNSKEY RRset plus signatures.
    request = dns.message.make_query(domain,
            dns.rdatatype.DNSKEY, want_dnssec=True)
    response = dns.query.udp(request, nsaddr)
    if response.rcode() != 0:
        raise Exception('Unable to request dnskey.')

    answer = response.answer
    if len(answer) != 2:
        raise Exception('Malformed answer to dnskey query.')

    # Distinct local name: plain `name` would shadow the dns.name module
    # imported at the top of the file.
    domain_name = dns.name.from_text(domain)
    try:
        dns.dnssec.validate(answer[0], answer[1], {domain_name: answer[0]})
    except dns.dnssec.ValidationFailure:
        # Validation failed. Re-raising here would abort with status 1,
        # so report False instead.
        return False
    except AttributeError:
        # Validation may have failed; DNSKEY missing signer attribute.
        # dig may still report the domain as valid.
        #
        # TODO: a subdomain of a valid domain may fail with 3 nested
        # KeyErrors. Avoid the temptation to wildcard catch.
        return False
    else:
        return True
279 |
|
280 |
def hash_localpart(incoming: bytes) -> str:
    '''Z-base32 the localpart of an e-mail address

    https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1
    describes why this is needed.

    See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a
    description of the z-base32 scheme.
    '''
    zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769"

    digest = hashlib.sha1(incoming).digest()
    # Treat the 160-bit digest as a single big-endian integer and peel
    # off five bits at a time, most-significant group first. 160 / 5
    # gives exactly 32 output characters, so no padding is ever needed.
    value = int.from_bytes(digest, 'big')
    encoded = []
    for shift in range(155, -1, -5):
        encoded.append(zb32[(value >> shift) & 0b11111])
    return ''.join(encoded)
310 |
|
311 |
def build_web_key_uri(address: str) -> str:
    '''Build the Web Key Directory "direct method" URI for *address*.

    The localpart is z-base32 encoded per the WKD draft; the domain is
    used verbatim as the host.
    '''
    local, remote = address.split('@')
    encoded_local = hash_localpart(local.encode('utf-8'))
    return f'https://{remote}/.well-known/openpgpkey/hu/{encoded_local}'
316 |
|
317 |
def stream_to_file(uri: str, fname: str) -> None:
    """Stream the resource at *uri* into the local file *fname*.

    Uses a streamed GET so large responses are never held fully in
    memory; the leftover debug dump of the response headers has been
    removed.
    """
    with requests.get(uri, verify=True, stream=True) as r:
        # NOTE(review): r.raw bypasses requests' content decoding -- if
        # the server answers with Content-Encoding: gzip, the file will
        # hold the compressed bytes. Confirm that is intended here.
        with open(fname, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
323 |
``` |