On Thu, 2019-10-03 at 21:58 -0700, Alec Warner wrote:
> On Thu, Oct 3, 2019 at 7:52 AM Michał Górny <mgorny@g.o> wrote:
> 
> > Add support for the subset of GLEP 75 needed by Gentoo Infra. This
> > includes fetching and parsing layout.conf, and support for the flat
> > layout and the filename-hash layout with cutoffs that are multiples of 4.
> >
> > Bug: https://bugs.gentoo.org/646898
> > Signed-off-by: Michał Górny <mgorny@g.o>
> > ---
> >  lib/portage/package/ebuild/fetch.py | 113 +++++++++++++++++++++++++++-
> >  1 file changed, 109 insertions(+), 4 deletions(-)
> >
> > diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
> > index 227bf45ae..692efcc01 100644
> > --- a/lib/portage/package/ebuild/fetch.py
> > +++ b/lib/portage/package/ebuild/fetch.py
> > @@ -7,12 +7,15 @@ __all__ = ['fetch']
> >
> >  import errno
> >  import io
> > +import itertools
> > +import json
> >  import logging
> >  import random
> >  import re
> >  import stat
> >  import sys
> >  import tempfile
> > +import time
> >
> >  from collections import OrderedDict
> >
> > @@ -27,14 +30,17 @@ portage.proxy.lazyimport.lazyimport(globals(),
> >  	'portage.package.ebuild.doebuild:doebuild_environment,' + \
> >  		'_doebuild_spawn',
> >  	'portage.package.ebuild.prepare_build_dirs:prepare_build_dirs',
> > +	'portage.util.configparser:SafeConfigParser,read_configs,NoOptionError',
> > +	'portage.util._urlopen:urlopen',
> >  )
> >
> >  from portage import os, selinux, shutil, _encodings, \
> >  	_movefile, _shell_quote, _unicode_encode
> >  from portage.checksum import (get_valid_checksum_keys, perform_md5, verify_all,
> > -	_filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter)
> > +	_filter_unaccelarated_hashes, _hash_filter, _apply_hash_filter,
> > +	checksum_str)
> >  from portage.const import BASH_BINARY, CUSTOM_MIRRORS_FILE, \
> > -	GLOBAL_CONFIG_PATH
> > +	GLOBAL_CONFIG_PATH, CACHE_PATH
> >  from portage.data import portage_gid, portage_uid, secpass, userpriv_groups
> >  from portage.exception import FileNotFound, OperationNotPermitted, \
> >  	PortageException, TryAgain
> > @@ -253,6 +259,104 @@ _size_suffix_map = {
> >  	'Y' : 80,
> >  }
> >
> > +
> > +def filename_hash_path(filename, algo, cutoffs):
> > +	"""
> > +	Get directory path for filename in filename-hash mirror structure.
> > +
> > +	@param filename: Filename to fetch
> > +	@param algo: Hash algorithm
> > +	@param cutoffs: Cutoff values (n:n...)
> > +	@return: Directory path
> > +	"""
> > +
> > +	fnhash = checksum_str(filename.encode('utf8'), algo)
> > +	ret = ''
> > +	for c in cutoffs.split(':'):
> > +		c = int(c) // 4
> > +		ret += fnhash[:c] + '/'
> >
> 
> When making a path, please use os.path.join()

This is a URL, not a path.

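To illustrate, here is a minimal standalone sketch of the same slicing
using hashlib directly (SHA1 and cutoffs "8:8" are made-up example
values, not something the patch mandates):

    import hashlib

    def hash_path_demo(filename, cutoffs="8:8"):
        # same logic as filename_hash_path() above, with hashlib
        # standing in for portage's checksum_str()
        fnhash = hashlib.sha1(filename.encode('utf8')).hexdigest()
        ret = ''
        for c in cutoffs.split(':'):
            c = int(c) // 4  # cutoffs are in bits; a hex digit is 4 bits
            ret += fnhash[:c] + '/'
            fnhash = fnhash[c:]
        return ret

With cutoffs "8:8" every filename maps to two 2-hex-digit components,
so the result is always of the shape "ab/cd/" (digits hypothetical)
and gets spliced between '/distfiles/' and the filename.
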
> 
> 
> > +		fnhash = fnhash[c:]
> > +	return ret
> > +
> > +
> > +def get_mirror_url(mirror_url, filename, eroot):
> > +	"""
> > +	Get correct fetch URL for a given file, accounting for mirror
> > +	layout configuration.
> > +
> > +	@param mirror_url: Base URL to the mirror (without '/distfiles')
> > +	@param filename: Filename to fetch
> > +	@param eroot: EROOT to use for the cache file
> > +	@return: Full URL to fetch
> > +	"""
> > +
> > +	cache_file = os.path.join(eroot, CACHE_PATH, 'mirror-metadata.json')
> > +	try:
> > +		with open(cache_file, 'r') as f:
> > +			cache = json.load(f)
> > +	except (IOError, ValueError):
> > +		cache = {}
> >
> 
> I'm a bit worried that we are opening this cache file off of disk every
> time we call get_mirror_url(). Can we just cache the contents in memory
> between calls; or, even better, pass the cache in as an argument rather
> than having it contained in get_mirror_url?

We could, but this is not a bottleneck. That's premature optimization,
the way I see it.

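For the record, if profiling ever showed the repeated open() to
matter, an in-memory memo would be a small change along these lines
(get_layout() and _fetch_and_cache_layout() are hypothetical names
for a split-out of the code below, not part of the patch):

    import time

    # mirror_url -> (timestamp, layout), shared between calls
    _layout_memo = {}

    def get_layout(mirror_url, eroot):
        # consult the in-memory memo first; hit the on-disk JSON
        # cache (and the network) only on a miss or after expiry
        ts, layout = _layout_memo.get(mirror_url, (0, None))
        if ts < time.time() - 86400:
            ts, layout = _fetch_and_cache_layout(mirror_url, eroot)
            _layout_memo[mirror_url] = (ts, layout)
        return layout
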
> 
> 
> > +
> > +	ts, layout = cache.get(mirror_url, (0, None))
> > +	# refresh at least daily
> > +	if ts < time.time() - 86400:
> > +		# the default
> > +		layout = ('flat',)
> > +
> > +		try:
> > +			f = urlopen(mirror_url + '/distfiles/layout.conf')
> > +			try:
> > +				data = io.StringIO(f.read().decode('utf8'))
> > +			finally:
> > +				f.close()
> > +			cp = SafeConfigParser()
> > +			read_configs(cp, [data])
> > +
> > +			for i in itertools.count():
> > +				try:
> > +					val = tuple(cp.get('structure', '%d' % i).split())
> > +					if val == ('flat',):
> > +						pass
> > +					elif val[0] == 'filename-hash' and len(val) == 3:
> > +						if val[1] not in get_valid_checksum_keys():
> > +							continue
> > +						# validate cutoffs
> > +						cutoffs_good = False
> > +						for c in val[2].split(':'):
> > +							try:
> > +								c = int(c)
> > +							except ValueError:
> > +								break
> > +							else:
> > +								if c % 4 != 0:
> > +									break
> > +						else:
> > +							cutoffs_good = True
> > +						if not cutoffs_good:
> > +							continue
> > +					else:
> > +						# (skip unsupported variant)
> > +						continue
> > +					layout = val
> > +					break
> > +				except NoOptionError:
> > +					break
> > +		except IOError:
> > +			pass
> > +
> > +		cache[mirror_url] = (time.time(), layout)
> > +		with open(cache_file, 'w') as f:
> > +			json.dump(cache, f)
> > +
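For context, the parsing loop above expects a layout.conf of the
shape defined by GLEP 75, e.g. (example values):

    [structure]
    0=filename-hash BLAKE2B 8
    1=flat

The numbered keys are tried in order and the first supported entry
wins; a missing or unreadable file means the default flat layout.
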
> > +	if layout[0] == 'flat':
> > +		return mirror_url + "/distfiles/" + filename
> > +	elif layout[0] == 'filename-hash':
> > +		return (mirror_url + "/distfiles/" +
> > +				filename_hash_path(filename, *layout[1:]) +
> > +				filename)
> > +	else:
> > +		raise AssertionError("get_mirror_url() got unknown layout type")
> >
> 
> raise AssertionError("get_mirror_url() got unknown layout type %s, wanted
> one of %s" % (layout[0], ('flat', 'filename-hash')))
> 
> E.g. if you got an unknown thing, it's nice to print what you got and
> what you wanted, so callers can fix it.

Assertions are not for callers. They merely catch the case where half
of the functions was updated and the other half was left behind.

> 
> 
> > +
> > +
> >  def fetch(myuris, mysettings, listonly=0, fetchonly=0,
> >  	locks_in_subdir=".locks", use_locks=1, try_mirrors=1, digests=None,
> >  	allow_missing_digests=True):
> > @@ -434,8 +538,9 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
> >  	for myfile, myuri in file_uri_tuples:
> >  		if myfile not in filedict:
> >  			filedict[myfile]=[]
> > -		for y in range(0,len(locations)):
> > -			filedict[myfile].append(locations[y]+"/distfiles/"+myfile)
> > +		for l in locations:
> > +			filedict[myfile].append(get_mirror_url(l, myfile,
> > +				mysettings["EROOT"]))
> >  		if myuri is None:
> >  			continue
> >  		if myuri[:9]=="mirror://":
> > --
> > 2.23.0
> > 
> > 
> > 
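For completeness, the user-visible effect is only in the URLs tried:
for a mirror whose layout.conf selects, say, filename-hash with
cutoffs "8:8", the per-file entry changes from

    https://mirror.example/distfiles/foo.tar.gz

to something of the shape

    https://mirror.example/distfiles/ab/cd/foo.tar.gz

(host and digits hypothetical), while flat mirrors keep the old URLs.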

-- 
Best regards,
Michał Górny