Gentoo Archives: gentoo-portage-dev

From: "Michał Górny" <mgorny@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: Re: [gentoo-portage-dev] [PATCH] movefile: support in-kernel file copying on Linux (bug 607868)
Date: Thu, 02 Mar 2017 16:25:21
Message-Id: 1488471898.1711.2.camel@gentoo.org
In Reply to: [gentoo-portage-dev] [PATCH] movefile: support in-kernel file copying on Linux (bug 607868) by Zac Medico
1 W dniu 01.03.2017, śro o godzinie 19∶03 -0800, użytkownik Zac Medico
2 napisał:
3 > Perform in-kernel file copying when possible, and also support
4 > reflinks and sparse files. If the optimized implementation
5 > fails at runtime, gracefully fall back to shutil.copyfile.
6 >
7 > Compile-time and run-time fallbacks are implemented, so that
8 > any incompatibilities will be handled gracefully. For example,
9 > if the code is compiled on a system that supports the
10 > copy_file_range syscall, but at run-time an older kernel that
11 > does not support this syscall is detected, it will be handled
12 > gracefully.
13 >
14 > X-Gentoo-Bug: 607868
15 > X-Gentoo-Bug-Url: https://bugs.gentoo.org/show_bug.cgi?id=607868
16 > ---
17 > pym/portage/tests/util/file_copy/__init__.py | 0
18 > pym/portage/tests/util/file_copy/__test__.py | 0
19 > pym/portage/tests/util/file_copy/test_copyfile.py | 68 +++++++
20 > pym/portage/util/file_copy/__init__.py | 78 ++++++++
21 > pym/portage/util/movefile.py | 4 +-
22 > setup.py | 9 +
23 > src/portage_util_file_copy_reflink_linux.c | 225 ++++++++++++++++++++++
24 > 7 files changed, 383 insertions(+), 1 deletion(-)
25 > create mode 100644 pym/portage/tests/util/file_copy/__init__.py
26 > create mode 100644 pym/portage/tests/util/file_copy/__test__.py
27 > create mode 100644 pym/portage/tests/util/file_copy/test_copyfile.py
28 > create mode 100644 pym/portage/util/file_copy/__init__.py
29 > create mode 100644 src/portage_util_file_copy_reflink_linux.c
30 >
31 > diff --git a/pym/portage/tests/util/file_copy/__init__.py b/pym/portage/tests/util/file_copy/__init__.py
32 > new file mode 100644
33 > index 0000000..e69de29
34 > diff --git a/pym/portage/tests/util/file_copy/__test__.py b/pym/portage/tests/util/file_copy/__test__.py
35 > new file mode 100644
36 > index 0000000..e69de29
37 > diff --git a/pym/portage/tests/util/file_copy/test_copyfile.py b/pym/portage/tests/util/file_copy/test_copyfile.py
38 > new file mode 100644
39 > index 0000000..987a701
40 > --- /dev/null
41 > +++ b/pym/portage/tests/util/file_copy/test_copyfile.py
42 > @@ -0,0 +1,68 @@
43 > +# Copyright 2017 Gentoo Foundation
44 > +# Distributed under the terms of the GNU General Public License v2
45 > +
46 > +import shutil
47 > +import tempfile
48 > +
49 > +from portage import os
50 > +from portage.tests import TestCase
51 > +from portage.checksum import perform_md5
52 > +from portage.util.file_copy import copyfile
53 > +
54 > +
55 > +class CopyFileTestCase(TestCase):
56 > +
57 > + def testCopyFile(self):
58 > +
59 > + tempdir = tempfile.mkdtemp()
60 > + try:
61 > + src_path = os.path.join(tempdir, 'src')
62 > + dest_path = os.path.join(tempdir, 'dest')
63 > + content = b'foo'
64 > +
65 > + with open(src_path, 'wb') as f:
66 > + f.write(content)
67 > +
68 > + copyfile(src_path, dest_path)
69 > +
70 > + self.assertEqual(perform_md5(src_path), perform_md5(dest_path))
71 > + finally:
72 > + shutil.rmtree(tempdir)
73 > +
74 > +
75 > +class CopyFileSparseTestCase(TestCase):
76 > +
77 > + def testCopyFileSparse(self):
78 > +
79 > + # This test is expected to fail on platforms where we have
80 > + # not implemented sparse copy, so set the todo flag in order
81 > + # to tolerate failures.
82 > + self.todo = True
83 > +
84 > + tempdir = tempfile.mkdtemp()
85 > + try:
86 > + src_path = os.path.join(tempdir, 'src')
87 > + dest_path = os.path.join(tempdir, 'dest')
88 > + content = b'foo'
89 > +
90 > + # Use seek to create some sparse blocks. Don't make these
91 > + # files too big, in case the filesystem doesn't support
92 > + # sparse files.
93 > + with open(src_path, 'wb') as f:
94 > + f.write(content)
95 > + f.seek(2**18, 1)
96 > + f.write(content)
97 > + f.seek(2**19, 1)
98 > + f.write(content)
99 > +
100 > + copyfile(src_path, dest_path)
101 > +
102 > + self.assertEqual(perform_md5(src_path), perform_md5(dest_path))
103 > +
104 > + # If sparse blocks were preserved, then both files should
105 > + # consume the same number of blocks.
106 > + self.assertEqual(
107 > + os.stat(src_path).st_blocks,
108 > + os.stat(dest_path).st_blocks)
109 > + finally:
110 > + shutil.rmtree(tempdir)
111 > diff --git a/pym/portage/util/file_copy/__init__.py b/pym/portage/util/file_copy/__init__.py
112 > new file mode 100644
113 > index 0000000..5c7aff1
114 > --- /dev/null
115 > +++ b/pym/portage/util/file_copy/__init__.py
116 > @@ -0,0 +1,78 @@
117 > +# Copyright 2017 Gentoo Foundation
118 > +# Distributed under the terms of the GNU General Public License v2
119 > +
120 > +import os
121 > +import shutil
122 > +import tempfile
123 > +
124 > +try:
125 > + from portage.util.file_copy.reflink_linux import file_copy as _file_copy
126 > +except ImportError:
127 > + _file_copy = None
128 > +
129 > +
130 > +_copyfile = None
131 > +
132 > +
133 > +def _optimized_copyfile(src, dst):
134 > + with open(src, 'rb', buffering=0) as src_file, \
135 > + open(dst, 'wb', buffering=0) as dst_file:
136 > + _file_copy(src_file.fileno(), dst_file.fileno())
137 > +
138 > +
139 > +def _test_optimized_copyfile():
140 > + """
141 > + Test if _optimized_copyfile works. It will fail for Linux versions
142 > + from 2.6.0 to 2.6.32, because sendfile does not support writing
143 > + to regular files. It will also fail for Linux versions less than
144 > + 3.1 if portage was compiled 3.1 or later, due to missing support
145 > + for lseek SEEK_DATA/SEEK_HOLE.
146
147 I don't really like the idea of relying on a one-time attempt to determine
148 whether a complex mechanism lacking a proper fallback will work. But I
149 guess you're only aiming at catching corner cases here.
150
151 > + """
152 > + tempdir = tempfile.mkdtemp()
153 > + try:
154 > + src_path = os.path.join(tempdir, 'src')
155 > + dest_path = os.path.join(tempdir, 'dest')
156 > + content = b'foo'
157 > +
158 > + with open(src_path, 'wb') as f:
159 > + f.write(content)
160 > +
161 > + try:
162 > + _optimized_copyfile(src_path, dest_path)
163 > + except Exception:
164 > + return False
165 > +
166 > + with open(dest_path, 'rb') as dest_file:
167 > + if content != dest_file.read():
168 > + return False
169 > + finally:
170 > + shutil.rmtree(tempdir)
171 > +
172 > + return True
173 > +
174 > +
175 > +def copyfile(src, dst):
176 > + """
177 > + Copy the contents (no metadata) of the file named src to a file
178 > + named dst.
179 > +
180 > + If possible, copying is done within the kernel, and uses
181 > + "copy acceleration" techniques (such as reflinks). This also
182 > + supports sparse files.
183 > +
184 > + @param src: path of source file
185 > + @type src: str
186 > + @param dst: path of destination file
187 > + @type dst: str
188 > + """
189 > + global _copyfile
190 > +
191 > + if _copyfile is None:
192 > + if _file_copy is None:
193 > + _copyfile = shutil.copyfile
194 > + elif _test_optimized_copyfile():
195 > + _copyfile = _optimized_copyfile
196 > + else:
197 > + _copyfile = shutil.copyfile
198
199 This logic looks a bit roundabout. Wouldn't it be simpler if
200 _test_optimized_copyfile() just returned False if _file_copy is None?
201
202 > +
203 > + _copyfile(src, dst)
204 > diff --git a/pym/portage/util/movefile.py b/pym/portage/util/movefile.py
205 > index 4be1c3b..88b35d3 100644
206 > --- a/pym/portage/util/movefile.py
207 > +++ b/pym/portage/util/movefile.py
208 > @@ -23,6 +23,8 @@ from portage.localization import _
209 > from portage.process import spawn
210 > from portage.util import writemsg
211 > from portage.util._xattr import xattr
212 > +from portage.util.file_copy import copyfile
213 > +
214 >
215 > def _apply_stat(src_stat, dest):
216 > _os.chown(dest, src_stat.st_uid, src_stat.st_gid)
217 > @@ -114,7 +116,7 @@ def movefile(src, dest, newmtime=None, sstat=None, mysettings=None,
218 > _copyfile = selinux.copyfile
219 > _rename = selinux.rename
220 > else:
221 > - _copyfile = _shutil.copyfile
222 > + _copyfile = copyfile
223 > _rename = _os.rename
224 >
225 > lchown = _unicode_func_wrapper(portage.data.lchown, encoding=encoding)
226 > diff --git a/setup.py b/setup.py
227 > index a346bd4..b624767 100755
228 > --- a/setup.py
229 > +++ b/setup.py
230 > @@ -23,6 +23,7 @@ import collections
231 > import glob
232 > import os
233 > import os.path
234 > +import platform
235 > import re
236 > import subprocess
237 > import sys
238 > @@ -54,6 +55,14 @@ x_c_helpers = {
239 > ],
240 > }
241 >
242 > +if platform.system() == 'Linux':
243 > + x_c_helpers.update({
244 > + 'portage.util.file_copy.reflink_linux': [
245 > + 'src/portage_util_file_copy_reflink_linux.c',
246 > + ],
247 > + })
248 > +
249 > +
250 > class x_build(build):
251 > """ Build command with extra build_man call. """
252 >
253 > diff --git a/src/portage_util_file_copy_reflink_linux.c b/src/portage_util_file_copy_reflink_linux.c
254 > new file mode 100644
255 > index 0000000..7139c7a
256 > --- /dev/null
257 > +++ b/src/portage_util_file_copy_reflink_linux.c
258 > @@ -0,0 +1,225 @@
259 > +/* Copyright 2017 Gentoo Foundation
260 > + * Distributed under the terms of the GNU General Public License v2
261 > + */
262 > +
263 > +#include <Python.h>
264 > +#include <errno.h>
265 > +#include <stdlib.h>
266 > +#include <ctype.h>
267 > +#include <sys/sendfile.h>
268 > +#include <sys/stat.h>
269 > +#include <sys/syscall.h>
270 > +#include <sys/types.h>
271 > +#include <unistd.h>
272 > +
273 > +static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *);
274 > +
275 > +static PyMethodDef reflink_linuxMethods[] = {
276 > + {
277 > + "file_copy",
278 > + _reflink_linux_file_copy,
279 > + METH_VARARGS,
280 > + "Copy between two file descriptors, "
281 > + "with reflink and sparse file support."
282 > + },
283 > + {NULL, NULL, 0, NULL}
284 > +};
285 > +
286 > +#if PY_MAJOR_VERSION >= 3
287 > +static struct PyModuleDef moduledef = {
288 > + PyModuleDef_HEAD_INIT,
289 > + "reflink_linux", /* m_name */
290 > + "Module for reflink_linux copy operations", /* m_doc */
291 > + -1, /* m_size */
292 > + reflink_linuxMethods, /* m_methods */
293 > + NULL, /* m_reload */
294 > + NULL, /* m_traverse */
295 > + NULL, /* m_clear */
296 > + NULL, /* m_free */
297 > +};
298 > +
299 > +PyMODINIT_FUNC
300 > +PyInit_reflink_linux(void)
301 > +{
302 > + PyObject *m;
303 > + m = PyModule_Create(&moduledef);
304 > + return m;
305 > +}
306 > +#else
307 > +PyMODINIT_FUNC
308 > +initreflink_linux(void)
309 > +{
310 > + Py_InitModule("reflink_linux", reflink_linuxMethods);
311 > +}
312 > +#endif
313 > +
314 > +
315 > +static ssize_t
316 > +cfr_wrapper(int fd_out, int fd_in, loff_t *off_out, size_t len)
317 > +{
318 > +#ifdef __NR_copy_file_range
319 > + return syscall(__NR_copy_file_range, fd_in, NULL, fd_out,
320 > + off_out, len, 0);
321 > +#else
322 > + errno = ENOSYS;
323 > + return -1;
324 > +#endif
325 > +}
326 > +
327 > +
328 > +static PyObject *
329 > +_reflink_linux_file_copy(PyObject *self, PyObject *args)
330 > +{
331 > + int eintr_retry, error, fd_in, fd_out;
332 > + off_t offset_out;
333 > + ssize_t copyfunc_ret;
334 > +#ifdef SEEK_DATA
335 > + /* Linux 3.1 and later support SEEK_DATA (for sparse file support).
336 > + * This code uses copy_file_range if possible, and falls back to
337 > + * sendfile for cross-device or when the copy_file_range syscall
338 > + * is not available (less than Linux 4.5). This will fail for
339 > + * Linux less than 3.1, which does not support the lseek SEEK_DATA
340 > + * parameter. The caller should perform a runtime test to verify
341 > + * that this function works with the running kernel.
342 > + */
343 > + off_t offset_data, offset_hole;
344 > + ssize_t (*copyfunc)(int, int, loff_t *, size_t);
345 > +
346 > + if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
347 > + return NULL;
348 > +
349 > + eintr_retry = 1;
350 > + offset_out = 0;
351 > + offset_data = 0;
352 > + copyfunc = cfr_wrapper;
353 > +
354 > + while (eintr_retry) {
355 > +
356 > + error = 0;
357 > +
358 > + Py_BEGIN_ALLOW_THREADS
359 > +
360 > + while (1) {
361 > + /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support,
362 > + * as suggested in the copy_file_range man page.
363 > + */
364 > + offset_data = lseek(fd_in, offset_out, SEEK_DATA);
365 > + if (offset_data < 0) {
366 > + if (errno == ENXIO) {
367 > + /* EOF */
368 > + break;
369 > + }
370 > + error = 1;
371 > + break;
372 > + }
373 > +
374 > + /* Create sparse empty blocks in the output file, up
375 > + * until the next location that will contain data.
376 > + */
377 > + if (offset_data > offset_out) {
378 > + offset_out = lseek(fd_out, offset_data, SEEK_SET);
379 > + if (offset_out < 0) {
380 > + error = 1;
381 > + break;
382 > + }
383 > + }
384 > +
385 > + /* Locate the next hole, so that we know when to
386 > + * stop copying. There is an implicit hole at the
387 > + * end of the file.
388 > + */
389 > + offset_hole = lseek(fd_in, offset_data, SEEK_HOLE);
390 > + if (offset_hole < 0) {
391 > + error = 1;
392 > + break;
393 > + }
394 > +
395 > + /* Revert SEEK_HOLE offset change, since we're going
396 > + * to copy the data that comes before the hole.
397 > + */
398 > + if (lseek(fd_in, offset_out, SEEK_SET) < 0) {
399 > + error = 1;
400 > + break;
401 > + }
402 > +
403 > + copyfunc_ret = copyfunc(fd_out,
404 > + fd_in,
405 > + &offset_out,
406 > + offset_hole - offset_data);
407 > +
408 > + if (copyfunc_ret < 0) {
409 > + if ((errno == EXDEV || errno == ENOSYS) &&
410 > + copyfunc == cfr_wrapper) {
411 > + /* Use sendfile instead of copy_file_range for
412 > + * cross-device copies, or when the copy_file_range
413 > + * syscall is not available (less than Linux 4.5).
414 > + */
415 > + copyfunc = sendfile;
416 > + copyfunc_ret = copyfunc(fd_out,
417 > + fd_in,
418 > + &offset_out,
419 > + offset_hole - offset_data);
420 > +
421 > + if (copyfunc_ret < 0) {
422
423 I think you still should have a proper fallback for sendfile() refusing
424 to do its job.
425
426 > + error = 1;
427 > + break;
428 > + }
429 > + }
430 > + else {
431 > + error = 1;
432 > + break;
433 > + }
434 > + }
435 > + }
436 > +#else
437 > + /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range,
438 > + * so just use sendfile for in-kernel copy. This will fail for Linux
439 > + * versions from 2.6.0 to 2.6.32, because sendfile does not support
440 > + * writing to regular files. The caller should perform a runtime
441 > + * test to verify that this function works with the running kernel.
442 > + */
443 > + struct stat sb;
444 > + int stat_acquired;
445 > +
446 > + if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
447 > + return NULL;
448 > +
449 > + eintr_retry = 1;
450 > + offset_out = 0;
451 > + stat_acquired = 0;
452 > +
453 > + while (eintr_retry) {
454 > +
455 > + error = 0;
456 > +
457 > + Py_BEGIN_ALLOW_THREADS
458 > +
459 > + if (!stat_acquired && fstat(fd_in, &sb) < 0) {
460 > + error = 1;
461 > + }
462 > + else {
463 > + stat_acquired = 1;
464 > + while (offset_out < sb.st_size) {
465 > + copyfunc_ret = sendfile(fd_out,
466 > + fd_in,
467 > + &offset_out,
468 > + sb.st_size - offset_out);
469 > +
470 > + if (copyfunc_ret < 0) {
471
472 Likewise, especially since old kernels may refuse file-to-file copy here.
473
474 > + error = 1;
475 > + break;
476 > + }
477 > + }
478 > + }
479 > +#endif
480 > + Py_END_ALLOW_THREADS
481 > +
482 > + if (!(error && errno == EINTR && PyErr_CheckSignals() == 0))
483 > + eintr_retry = 0;
484 > + }
485 > +
486 > + if (error)
487 > + return PyErr_SetFromErrno(PyExc_OSError);
488 > +
489 > + return Py_BuildValue("i", offset_out);
490 > +}
491
492 --
493 Best regards,
494 Michał Górny

Attachments

File name MIME type
signature.asc application/pgp-signature

Replies