Gentoo Archives: gentoo-portage-dev

From: Sid Spry <sid@××××.us>
To: gentoo-portage-dev@l.g.o
Subject: [gentoo-portage-dev] Speeding up Tree Verification
Date: Tue, 30 Jun 2020 02:14:18
Message-Id: 8cef84eb-e8f4-445b-9cb5-193c5f995894@www.fastmail.com
1 Hello,
2
3 I have some runnable pseudocode outlining a faster tree verification algorithm.
4 Before I create patches I'd like to see if there is any guidance on making the
5 changes as unobtrusive as possible. If the radical change in algorithm is
6 acceptable I can work on adding the changes.
7
8 Instead of composing any kind of structured data out of the portage tree my
9 algorithm just lists all files and then optionally batches them out to threads.
10 There is a noticeable speedup by eliding the tree traversal operations which
11 can be seen when running the algorithm with a single thread and comparing it to
12 the current algorithm in gemato (which should still be discussed here?).
13
14 Some simple tests, like counting all objects traversed and verified, return the
15 same(ish) results. Once it is put into portage it could be tested in detail.
16
17 There is also my partial attempt at removing the brittle interface to GnuPG
18 (it's not as if the current code is badly designed, just that parsing the
19 output of GnuPG directly is likely not the best idea).
20
21 Needs gemato, dnspython, and requests. Slightly better than random code because
22 I took inspiration from the existing gemato classes.
23
24 ```python (veriftree.py)
25 #!/usr/bin/env python3
26 import os, sys, zlib, hashlib, tempfile, shutil, timeit
27 import subprocess
28 from typing import List
29 from pprint import pprint
30
31 from gemato.manifest import (
32 ManifestFile,
33 ManifestFileEntry,
34 )
35 from wkd import (
36 check_domain_signature,
37 hash_localpart,
38 build_web_key_uri,
39 stream_to_file
40 )
41 from fetchmedia import (
42 OpenPGPEnvironment,
43 setup_verification_environment
44 )
45
46 # 0. Top level directory (repository) contains Manifest, a PGP signature of
47 # blake2b and sha512 hashes of Manifest.files.gz.
48 # 1. Manifest.files contains hashes of each category Manifest.gz.
49 # 2. The category Manifest contains hashes of each package Manifest.
50 # 3. The package Manifest contains hashes of each package file.
51 # Must be aware of PMS, e.g. aux tag specifies a file in files/.
52
53 # 0. Check signature of repo Manifest.
54 # 1. Merge items in Manifest.files, each category Manifest, and each package
55 # Manifest into one big list. The path must be made absolute.
56 # 2. Distribute items to threads.
57
58 # To check operation compare directory tree to files appearing in all
59 # ManifestRecords.
60
class ManifestTree(object):
    """Flat view of every Manifest record found below one directory.

    Usage is three explicit steps: ``build_manifest_list()`` locates the
    Manifest files, ``parse_manifests()`` turns each of their lines into a
    ``ManifestRecord``, and ``verify_manifests()`` stores the outcome of
    ``ManifestRecord.verify()`` for every record.
    """
    __slots__ = ['_directory', '_manifest_list', '_manifest_records',
                 '_manifest_results']

    def __init__(self, directory: str):
        self._directory = directory
        # Tuples of (base_path, full_path).
        self._manifest_list = []
        # One ManifestRecord per parsed Manifest line.
        self._manifest_records = []
        # One verify() result per record, in the same order.
        self._manifest_results = []

    def build_manifest_list(self):
        """Walk the directory and remember every Manifest file found.

        Compressed manifests (``Manifest.files.gz``, ``Manifest.gz``) are
        recorded at every level.  A plain ``Manifest`` is recorded
        everywhere except in the top-level directory itself.
        """
        for path, _dirs, files in os.walk(self._directory):
            present = set(files)
            for name in ('Manifest.files.gz', 'Manifest.gz'):
                if name in present:
                    self._manifest_list.append(
                        (path, os.path.join(path, name)))

            if path == self._directory:
                continue  # Skip the repo manifest. Order matters, fix eventually.
            if 'Manifest' in present:
                self._manifest_list.append(
                    (path, os.path.join(path, 'Manifest')))

    def _collect_records(self, base_path, lines, skip_signed):
        """Append one record per line of *lines*, anchored at *base_path*.

        With *skip_signed* set, reading stops at the first line that starts
        with ``-``; the rest of that manifest is not parsed.
        """
        for line in lines:
            if skip_signed and line.startswith('-'):
                return  # Skip the signed manifest.
            record = ManifestRecord(line)
            record.make_absolute(base_path)
            self._manifest_records.append(record)

    def parse_manifests(self):
        """Parse every manifest located by ``build_manifest_list()``.

        ``.gz`` manifests are decompressed in-process and streamed line by
        line, so no shell, temporary directory or scratch file is involved.
        """
        # Function-scope import keeps this block self-contained.
        import gzip

        for base_path, full_path in self._manifest_list:
            if full_path.endswith('.gz'):
                with gzip.open(full_path, 'rt') as stream:
                    self._collect_records(base_path, stream, False)
            else:
                with open(full_path) as stream:
                    self._collect_records(base_path, stream, True)

    def verify_manifests(self):
        """Verify every parsed record and append the results in order."""
        self._manifest_results.extend(
            record.verify() for record in self._manifest_records)
115
116
class ManifestRecord(object):
    """One line of a Manifest file.

    A full record is parsed as ``TAG PATH SIZE`` followed by the remaining
    tokens, which are treated as alternating hash-name / hash-value pairs
    (e.g. ``BLAKE2B <hex> SHA512 <hex>``).  Lines with fewer than three
    tokens carry no file to check and are tagged ``'ignore'``.
    """
    __slots__ = ['_tag', '_abs_path', '_path', '_size', '_hashes']

    # Hash names this class can check, mapped to their hashlib constructors.
    _DIGESTS = (('BLAKE2B', hashlib.blake2b), ('SHA512', hashlib.sha512))

    def __init__(self, line: str=None):
        self._tag = None
        self._abs_path = None
        self._path = None
        self._size = None
        self._hashes = []
        if line:
            self.from_string(line)

    def from_string(self, line: str) -> None:
        """Populate the record from one whitespace-separated Manifest line."""
        parts = line.split()
        # Blank lines and short lines have no path/size/hash triple to
        # verify; mark them instead of failing on the missing fields.
        if len(parts) < 3:
            self._tag = 'ignore'
            return
        self._tag = parts[0]
        self._path = parts[1]
        self._size = parts[2]
        self._hashes = parts[3:]

    def make_absolute(self, abs_path: str) -> None:
        """Remember the directory that this record's path is relative to."""
        self._abs_path = abs_path

    def _target_path(self) -> str:
        """Return the on-disk path of the file, without touching the record."""
        relative = self._path
        if self._tag.lower() == 'aux':
            # AUX entries name files below the package's files/ directory.
            relative = 'files/' + relative
        if self._abs_path:
            return self._abs_path + '/' + relative
        return relative

    def verify(self) -> bool:
        """Check the file named by this record against its recorded hashes.

        Returns ``None`` when there is nothing to check (``ignore`` records,
        records never populated, and ``DIST`` entries, whose files are not
        present in the tree).  Returns ``False`` when the file is missing,
        when no BLAKE2B/SHA512 hash is recorded, or when any recorded
        BLAKE2B/SHA512 hash differs.  Returns ``True`` otherwise.

        The record itself is left unchanged, so repeated calls give the
        same answer.
        """
        if self._tag is None or self._tag == 'ignore':
            return None

        # Distfiles will not be present.
        if self._tag.lower() == 'dist':
            return None

        target = self._target_path()
        if not os.path.exists(target):
            return False

        # Look hashes up by name so the order in the Manifest is irrelevant.
        expected = dict(zip(self._hashes[0::2], self._hashes[1::2]))
        digesters = {name: factory()
                     for name, factory in self._DIGESTS if name in expected}
        if not digesters:
            return False  # Nothing we know how to check.

        with open(target, 'rb') as fd:
            while True:
                chunk = fd.read(8192)
                if not chunk:
                    break
                for digester in digesters.values():
                    digester.update(chunk)

        return all(digester.hexdigest() == expected[name].lower()
                   for name, digester in digesters.items())

    def __repr__(self) -> str:
        # str() keeps this usable for 'ignore' records, whose fields are None.
        return '\t'.join(str(field)
                         for field in (self._tag, self._size, self._path))
188
def main(repo_path: str = '/var/db/repos/gentoo') -> int:
    """Time a full verification pass over *repo_path* and print the counts.

    Prints, in order: the seconds spent in ``verify_manifests()``, the
    total number of records, and the number of records that produced a
    real True/False result (``None`` marks ignored and DIST entries).

    Step 0, OpenPGP verification of the top-level Manifest, is not wired
    in yet.  The intended shape is ``setup_verification_environment()``
    followed by ``ManifestFile.load(..., verify_openpgp=True,
    openpgp_env=...)``.

    Returns the process exit status, which is always 0.
    """
    # Step 1: merge manifests.
    tree = ManifestTree(repo_path)
    tree.build_manifest_list()
    tree.parse_manifests()

    # Only the hashing pass is timed; discovery and parsing are excluded.
    start = timeit.default_timer()
    tree.verify_manifests()
    end = timeit.default_timer()
    pprint(end - start)

    # None results are records with nothing to check.
    real_files = [result for result in tree._manifest_results
                  if result is not None]
    pprint(len(tree._manifest_results))
    pprint(len(real_files))

    # TODO: to check operation, compare the directory tree to the files
    # appearing in all ManifestRecords.

    return 0


if __name__ == '__main__':
    sys.exit(main())
233 ```
234
235 ```python (wkd.py, likely unneeded but I didn't want to redo these files yet)
236 #!/usr/bin/env python3
237 import sys, hashlib
238 import dns
239 from dns import (
240 name, query, dnssec,
241 message, resolver, rdatatype
242 )
243 import shutil, requests
244
def check_domain_signature(domain: str) -> bool:
    '''Report whether the DNSKEY answer for *domain* validates.

    The first NS record of the domain is resolved to an address and asked
    over UDP for the DNSKEY records with DNSSEC data.  The first answer
    section is then validated against the second, using the first section
    itself as the key set.  No other keys are consulted here.

    Returns True when validation succeeds and False when it raises
    ValidationFailure or AttributeError.  Raises Exception when the query
    returns a non-zero rcode or the answer does not have exactly two
    sections.
    '''
    ns_answer = dns.resolver.query(domain, dns.rdatatype.NS)
    ns_host = str(ns_answer.rrset[0])
    addr_answer = dns.resolver.query(ns_host, dns.rdatatype.A)
    ns_address = addr_answer.rrset[0].to_text()

    # DNSKEY
    dnskey_request = dns.message.make_query(
        domain, dns.rdatatype.DNSKEY, want_dnssec=True)
    reply = dns.query.udp(dnskey_request, ns_address)
    if reply.rcode() != 0:
        raise Exception('Unable to request dnskey.')

    sections = reply.answer
    if len(sections) != 2:
        raise Exception('Malformed answer to dnskey query.')

    owner = dns.name.from_text(domain)
    try:
        dns.dnssec.validate(sections[0], sections[1], {owner: sections[0]})
    except (dns.dnssec.ValidationFailure, AttributeError):
        # ValidationFailure: the signature did not validate.
        # AttributeError: validation may have failed; DNSKEY missing signer
        # attribute. dig may report the domain as valid.
        #
        # TODO: Additional state where subdomain of valid domain may fail
        # with 3 nested KeyErrors. Avoid temptation to wildcard catch.
        # Safer to put in process?
        return False
    return True
279
def hash_localpart(incoming: bytes) -> str:
    '''Z-base32 the localpart of an e-mail address

    https://tools.ietf.org/html/draft-koch-openpgp-webkey-service-08#section-3.1
    describes why this is needed.

    See https://tools.ietf.org/html/rfc6189#section-5.1.6 for a
    description of the z-base32 scheme.

    ASCII upper-case letters are lower-cased before hashing, as the Web
    Key Directory draft requires; other bytes are hashed unchanged.  The
    result is always 32 characters (160 SHA-1 bits, five bits per
    character).
    '''
    zb32 = "ybndrfg8ejkmcpqxot1uwisza345h769"

    # bytes.lower() only touches ASCII letters, which is what the draft asks for.
    digest = hashlib.sha1(incoming.lower()).digest()
    total_bits = len(digest) * 8
    value = int.from_bytes(digest, 'big')

    # Take five bits at a time, most significant group first.
    return ''.join(zb32[(value >> shift) & 0b11111]
                   for shift in range(total_bits - 5, -1, -5))
310
def build_web_key_uri(address: str) -> str:
    '''Return the Web Key Directory URI for an e-mail address.

    The address is split at its last '@'.  The local part is UTF-8
    encoded, its ASCII letters are lower-cased, and the result is hashed
    with hash_localpart().  The domain part is used as given.

    Raises ValueError when the address has no '@' or when either side of
    it is empty.
    '''
    local, separator, remote = address.rpartition('@')
    if not (separator and local and remote):
        raise ValueError('Not an e-mail address: {0!r}'.format(address))

    # Lower-case after encoding so only ASCII letters are affected.
    hashed = hash_localpart(local.encode('utf-8').lower())
    return 'https://' + remote + '/.well-known/openpgpkey/hu/' + hashed
316
def stream_to_file(uri: str, fname: str) -> None:
    '''Download *uri* and write the response body to *fname*.

    The response headers are pretty-printed to stdout as a diagnostic.
    An HTTP error status raises (via raise_for_status()) before *fname*
    is opened, so an error page is never written out as if it were the
    requested file.
    '''
    with requests.get(uri, verify=True, stream=True) as r:
        from pprint import pprint
        pprint(r.headers)
        r.raise_for_status()
        # r.raw is the undecoded stream; ask it to undo any
        # Content-Encoding so the file holds the real payload.
        r.raw.decode_content = True
        with open(fname, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
323 ```

Replies