Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH v4] portage.cache: write md5 instead of mtime (bug 568934)
Date: Tue, 12 Jul 2016 17:18:47
Message-Id: 1468343898-6266-1-git-send-email-zmedico@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH] portage.cache: write md5 instead of mtime (bug 568934) by Zac Medico
1 Change cache modules to write md5 in cache entries, instead of mtime.
2 Since portage-2.2.27, the relevant cache modules have had the ability
3 to read cache entries containing either md5 or mtime; therefore, this
4 change is backward-compatible with portage-2.2.27 and later.
5
6 Also fix the reconstruct_eclasses function to raise CacheCorruption
7 when the specified chf_type is md5 and the cache entry contains mtime
8 data, and optimize __getitem__ to skip reconstruct_eclasses calls when
9 the entry appears to have a different chf_type.
10
11 X-Gentoo-Bug: 568934
12 X-Gentoo-Bug-url: https://bugs.gentoo.org/show_bug.cgi?id=568934
13 ---
14 [PATCH v4] adds some comments to clarify the purposes of the __getitem__
15 optimization and the _md5_deserializer validation
16
17 pym/portage/cache/anydbm.py | 4 ++--
18 pym/portage/cache/flat_hash.py | 4 ++--
19 pym/portage/cache/sqlite.py | 4 ++--
20 pym/portage/cache/template.py | 36 ++++++++++++++++++++++++++++++++----
21 4 files changed, 38 insertions(+), 10 deletions(-)
22
23 diff --git a/pym/portage/cache/anydbm.py b/pym/portage/cache/anydbm.py
24 index 80d24e5..88d85b0 100644
25 --- a/pym/portage/cache/anydbm.py
26 +++ b/pym/portage/cache/anydbm.py
27 @@ -36,8 +36,8 @@ from portage.cache import cache_errors
28
29 class database(fs_template.FsBased):
30
31 - validation_chf = 'mtime'
32 - chf_types = ('mtime', 'md5')
33 + validation_chf = 'md5'
34 + chf_types = ('md5', 'mtime')
35
36 autocommits = True
37 cleanse_keys = True
38 diff --git a/pym/portage/cache/flat_hash.py b/pym/portage/cache/flat_hash.py
39 index cca0f10..3a899c0 100644
40 --- a/pym/portage/cache/flat_hash.py
41 +++ b/pym/portage/cache/flat_hash.py
42 @@ -163,5 +163,5 @@ class md5_database(database):
43
44
45 class mtime_md5_database(database):
46 - validation_chf = 'mtime'
47 - chf_types = ('mtime', 'md5')
48 + validation_chf = 'md5'
49 + chf_types = ('md5', 'mtime')
50 diff --git a/pym/portage/cache/sqlite.py b/pym/portage/cache/sqlite.py
51 index 32e4076..69150f6 100644
52 --- a/pym/portage/cache/sqlite.py
53 +++ b/pym/portage/cache/sqlite.py
54 @@ -18,8 +18,8 @@ if sys.hexversion >= 0x3000000:
55
56 class database(fs_template.FsBased):
57
58 - validation_chf = 'mtime'
59 - chf_types = ('mtime', 'md5')
60 + validation_chf = 'md5'
61 + chf_types = ('md5', 'mtime')
62
63 autocommits = False
64 synchronous = False
65 diff --git a/pym/portage/cache/template.py b/pym/portage/cache/template.py
66 index a7c6de0..8662d85 100644
67 --- a/pym/portage/cache/template.py
68 +++ b/pym/portage/cache/template.py
69 @@ -54,6 +54,15 @@ class database(object):
70
71 if self.serialize_eclasses and "_eclasses_" in d:
72 for chf_type in chf_types:
73 + if '_%s_' % chf_type not in d:
74 + # Skip the reconstruct_eclasses call, since it's
75 + # a waste of time if it contains a different chf_type
76 + # than the current one. In the past, it was possible
77 + # for reconstruct_eclasses called with chf_type='md5'
78 + # to "successfully" return invalid data here, because
79 + # it was unable to distinguish between md5 data and
80 + # mtime data.
81 + continue
82 try:
83 d["_eclasses_"] = reconstruct_eclasses(cpv, d["_eclasses_"],
84 chf_type, paths=self.store_eclass_paths)
85 @@ -62,6 +71,9 @@ class database(object):
86 raise
87 else:
88 break
89 + else:
90 + raise cache_errors.CacheCorruption(cpv,
91 + 'entry does not contain a recognized chf_type')
92
93 elif "_eclasses_" not in d:
94 d["_eclasses_"] = {}
95 @@ -310,6 +322,23 @@ def serialize_eclasses(eclass_dict, chf_type='mtime', paths=True):
96 for k, v in sorted(eclass_dict.items(), key=_keysorter))
97
98
99 +def _md5_deserializer(md5):
100 + """
101 + Without this validation, it's possible for reconstruct_eclasses to
102 + mistakenly interpret mtime data as md5 data, and return an invalid
103 + data structure containing strings where ints are expected.
104 + """
105 + if len(md5) != 32:
106 + raise ValueError('expected 32 hex digits')
107 + return md5
108 +
109 +
110 +_chf_deserializers = {
111 + 'md5': _md5_deserializer,
112 + 'mtime': long,
113 +}
114 +
115 +
116 def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
117 """returns a dict when handed a string generated by serialize_eclasses"""
118 eclasses = eclass_string.rstrip().lstrip().split("\t")
119 @@ -317,9 +346,7 @@ def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
120 # occasionally this occurs in the fs backends. they suck.
121 return {}
122
123 - converter = _unicode
124 - if chf_type == 'mtime':
125 - converter = long
126 + converter = _chf_deserializers.get(chf_type, lambda x: x)
127
128 if paths:
129 if len(eclasses) % 3 != 0:
130 @@ -340,6 +367,7 @@ def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
131 raise cache_errors.CacheCorruption(cpv,
132 "_eclasses_ was of invalid len %i" % len(eclasses))
133 except ValueError:
134 - raise cache_errors.CacheCorruption(cpv, "_eclasses_ mtime conversion to long failed")
135 + raise cache_errors.CacheCorruption(cpv,
136 + "_eclasses_ not valid for chf_type {}".format(chf_type))
137 del eclasses
138 return d
139 --
140 2.7.4

Replies