Gentoo Archives: gentoo-portage-dev

From: Zac Medico <zmedico@g.o>
To: gentoo-portage-dev@l.g.o
Cc: Zac Medico <zmedico@g.o>
Subject: [gentoo-portage-dev] [PATCH] movefile: support in-kernel file copying on Linux (bug 607868)
Date: Thu, 02 Mar 2017 03:04:52
Message-Id: 20170302030336.22956-1-zmedico@gentoo.org
1 Perform in-kernel file copying when possible, and also support
2 reflinks and sparse files. If the optimized implementation
3 fails at runtime, gracefully fall back to shutil.copyfile.
4
5 Compile-time and run-time fallbacks are implemented, so that
6 any incompatibilities will be handled gracefully. For example,
7 if the code is compiled on a system that supports the
8 copy_file_range syscall, but at run-time an older kernel that
9 does not support this syscall is detected, it will be handled
10 gracefully.
11
12 X-Gentoo-Bug: 607868
13 X-Gentoo-Bug-Url: https://bugs.gentoo.org/show_bug.cgi?id=607868
14 ---
15 pym/portage/tests/util/file_copy/__init__.py | 0
16 pym/portage/tests/util/file_copy/__test__.py | 0
17 pym/portage/tests/util/file_copy/test_copyfile.py | 68 +++++++
18 pym/portage/util/file_copy/__init__.py | 78 ++++++++
19 pym/portage/util/movefile.py | 4 +-
20 setup.py | 9 +
21 src/portage_util_file_copy_reflink_linux.c | 225 ++++++++++++++++++++++
22 7 files changed, 383 insertions(+), 1 deletion(-)
23 create mode 100644 pym/portage/tests/util/file_copy/__init__.py
24 create mode 100644 pym/portage/tests/util/file_copy/__test__.py
25 create mode 100644 pym/portage/tests/util/file_copy/test_copyfile.py
26 create mode 100644 pym/portage/util/file_copy/__init__.py
27 create mode 100644 src/portage_util_file_copy_reflink_linux.c
28
29 diff --git a/pym/portage/tests/util/file_copy/__init__.py b/pym/portage/tests/util/file_copy/__init__.py
30 new file mode 100644
31 index 0000000..e69de29
32 diff --git a/pym/portage/tests/util/file_copy/__test__.py b/pym/portage/tests/util/file_copy/__test__.py
33 new file mode 100644
34 index 0000000..e69de29
35 diff --git a/pym/portage/tests/util/file_copy/test_copyfile.py b/pym/portage/tests/util/file_copy/test_copyfile.py
36 new file mode 100644
37 index 0000000..987a701
38 --- /dev/null
39 +++ b/pym/portage/tests/util/file_copy/test_copyfile.py
40 @@ -0,0 +1,68 @@
41 +# Copyright 2017 Gentoo Foundation
42 +# Distributed under the terms of the GNU General Public License v2
43 +
44 +import shutil
45 +import tempfile
46 +
47 +from portage import os
48 +from portage.tests import TestCase
49 +from portage.checksum import perform_md5
50 +from portage.util.file_copy import copyfile
51 +
52 +
53 +class CopyFileTestCase(TestCase):
54 +
55 + def testCopyFile(self):
56 +
57 + tempdir = tempfile.mkdtemp()
58 + try:
59 + src_path = os.path.join(tempdir, 'src')
60 + dest_path = os.path.join(tempdir, 'dest')
61 + content = b'foo'
62 +
63 + with open(src_path, 'wb') as f:
64 + f.write(content)
65 +
66 + copyfile(src_path, dest_path)
67 +
68 + self.assertEqual(perform_md5(src_path), perform_md5(dest_path))
69 + finally:
70 + shutil.rmtree(tempdir)
71 +
72 +
73 +class CopyFileSparseTestCase(TestCase):
74 +
75 + def testCopyFileSparse(self):
76 +
77 + # This test is expected to fail on platforms where we have
78 + # not implemented sparse copy, so set the todo flag in order
79 + # to tolerate failures.
80 + self.todo = True
81 +
82 + tempdir = tempfile.mkdtemp()
83 + try:
84 + src_path = os.path.join(tempdir, 'src')
85 + dest_path = os.path.join(tempdir, 'dest')
86 + content = b'foo'
87 +
88 + # Use seek to create some sparse blocks. Don't make these
89 + # files too big, in case the filesystem doesn't support
90 + # sparse files.
91 + with open(src_path, 'wb') as f:
92 + f.write(content)
93 + f.seek(2**18, 1)
94 + f.write(content)
95 + f.seek(2**19, 1)
96 + f.write(content)
97 +
98 + copyfile(src_path, dest_path)
99 +
100 + self.assertEqual(perform_md5(src_path), perform_md5(dest_path))
101 +
102 + # If sparse blocks were preserved, then both files should
103 + # consume the same number of blocks.
104 + self.assertEqual(
105 + os.stat(src_path).st_blocks,
106 + os.stat(dest_path).st_blocks)
107 + finally:
108 + shutil.rmtree(tempdir)
109 diff --git a/pym/portage/util/file_copy/__init__.py b/pym/portage/util/file_copy/__init__.py
110 new file mode 100644
111 index 0000000..5c7aff1
112 --- /dev/null
113 +++ b/pym/portage/util/file_copy/__init__.py
114 @@ -0,0 +1,78 @@
115 +# Copyright 2017 Gentoo Foundation
116 +# Distributed under the terms of the GNU General Public License v2
117 +
118 +import os
119 +import shutil
120 +import tempfile
121 +
122 +try:
123 + from portage.util.file_copy.reflink_linux import file_copy as _file_copy
124 +except ImportError:
125 + _file_copy = None
126 +
127 +
128 +_copyfile = None
129 +
130 +
131 +def _optimized_copyfile(src, dst):
132 + with open(src, 'rb', buffering=0) as src_file, \
133 + open(dst, 'wb', buffering=0) as dst_file:
134 + _file_copy(src_file.fileno(), dst_file.fileno())
135 +
136 +
137 +def _test_optimized_copyfile():
138 + """
139 + Test if _optimized_copyfile works. It will fail for Linux versions
140 + from 2.6.0 to 2.6.32, because sendfile does not support writing
141 + to regular files. It will also fail for Linux versions less than
142 + 3.1 if portage was compiled on Linux 3.1 or later, due to missing support
143 + for lseek SEEK_DATA/SEEK_HOLE.
144 + """
145 + tempdir = tempfile.mkdtemp()
146 + try:
147 + src_path = os.path.join(tempdir, 'src')
148 + dest_path = os.path.join(tempdir, 'dest')
149 + content = b'foo'
150 +
151 + with open(src_path, 'wb') as f:
152 + f.write(content)
153 +
154 + try:
155 + _optimized_copyfile(src_path, dest_path)
156 + except Exception:
157 + return False
158 +
159 + with open(dest_path, 'rb') as dest_file:
160 + if content != dest_file.read():
161 + return False
162 + finally:
163 + shutil.rmtree(tempdir)
164 +
165 + return True
166 +
167 +
168 +def copyfile(src, dst):
169 + """
170 + Copy the contents (no metadata) of the file named src to a file
171 + named dst.
172 +
173 + If possible, copying is done within the kernel, and uses
174 + "copy acceleration" techniques (such as reflinks). This also
175 + supports sparse files.
176 +
177 + @param src: path of source file
178 + @type src: str
179 + @param dst: path of destination file
180 + @type dst: str
181 + """
182 + global _copyfile
183 +
184 + if _copyfile is None:
185 + if _file_copy is None:
186 + _copyfile = shutil.copyfile
187 + elif _test_optimized_copyfile():
188 + _copyfile = _optimized_copyfile
189 + else:
190 + _copyfile = shutil.copyfile
191 +
192 + _copyfile(src, dst)
193 diff --git a/pym/portage/util/movefile.py b/pym/portage/util/movefile.py
194 index 4be1c3b..88b35d3 100644
195 --- a/pym/portage/util/movefile.py
196 +++ b/pym/portage/util/movefile.py
197 @@ -23,6 +23,8 @@ from portage.localization import _
198 from portage.process import spawn
199 from portage.util import writemsg
200 from portage.util._xattr import xattr
201 +from portage.util.file_copy import copyfile
202 +
203
204 def _apply_stat(src_stat, dest):
205 _os.chown(dest, src_stat.st_uid, src_stat.st_gid)
206 @@ -114,7 +116,7 @@ def movefile(src, dest, newmtime=None, sstat=None, mysettings=None,
207 _copyfile = selinux.copyfile
208 _rename = selinux.rename
209 else:
210 - _copyfile = _shutil.copyfile
211 + _copyfile = copyfile
212 _rename = _os.rename
213
214 lchown = _unicode_func_wrapper(portage.data.lchown, encoding=encoding)
215 diff --git a/setup.py b/setup.py
216 index a346bd4..b624767 100755
217 --- a/setup.py
218 +++ b/setup.py
219 @@ -23,6 +23,7 @@ import collections
220 import glob
221 import os
222 import os.path
223 +import platform
224 import re
225 import subprocess
226 import sys
227 @@ -54,6 +55,14 @@ x_c_helpers = {
228 ],
229 }
230
231 +if platform.system() == 'Linux':
232 + x_c_helpers.update({
233 + 'portage.util.file_copy.reflink_linux': [
234 + 'src/portage_util_file_copy_reflink_linux.c',
235 + ],
236 + })
237 +
238 +
239 class x_build(build):
240 """ Build command with extra build_man call. """
241
242 diff --git a/src/portage_util_file_copy_reflink_linux.c b/src/portage_util_file_copy_reflink_linux.c
243 new file mode 100644
244 index 0000000..7139c7a
245 --- /dev/null
246 +++ b/src/portage_util_file_copy_reflink_linux.c
247 @@ -0,0 +1,225 @@
248 +/* Copyright 2017 Gentoo Foundation
249 + * Distributed under the terms of the GNU General Public License v2
250 + */
251 +
252 +#include <Python.h>
253 +#include <errno.h>
254 +#include <stdlib.h>
255 +#include <ctype.h>
256 +#include <sys/sendfile.h>
257 +#include <sys/stat.h>
258 +#include <sys/syscall.h>
259 +#include <sys/types.h>
260 +#include <unistd.h>
261 +
262 +static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *);
263 +
264 +static PyMethodDef reflink_linuxMethods[] = {
265 + {
266 + "file_copy",
267 + _reflink_linux_file_copy,
268 + METH_VARARGS,
269 + "Copy between two file descriptors, "
270 + "with reflink and sparse file support."
271 + },
272 + {NULL, NULL, 0, NULL}
273 +};
274 +
275 +#if PY_MAJOR_VERSION >= 3
276 +static struct PyModuleDef moduledef = {
277 + PyModuleDef_HEAD_INIT,
278 + "reflink_linux", /* m_name */
279 + "Module for reflink_linux copy operations", /* m_doc */
280 + -1, /* m_size */
281 + reflink_linuxMethods, /* m_methods */
282 + NULL, /* m_reload */
283 + NULL, /* m_traverse */
284 + NULL, /* m_clear */
285 + NULL, /* m_free */
286 +};
287 +
288 +PyMODINIT_FUNC
289 +PyInit_reflink_linux(void)
290 +{
291 + PyObject *m;
292 + m = PyModule_Create(&moduledef);
293 + return m;
294 +}
295 +#else
296 +PyMODINIT_FUNC
297 +initreflink_linux(void)
298 +{
299 + Py_InitModule("reflink_linux", reflink_linuxMethods);
300 +}
301 +#endif
302 +
303 +
304 +static ssize_t
305 +cfr_wrapper(int fd_out, int fd_in, loff_t *off_out, size_t len)
306 +{
307 +#ifdef __NR_copy_file_range
308 + return syscall(__NR_copy_file_range, fd_in, NULL, fd_out,
309 + off_out, len, 0);
310 +#else
311 + errno = ENOSYS;
312 + return -1;
313 +#endif
314 +}
315 +
316 +
317 +static PyObject *
318 +_reflink_linux_file_copy(PyObject *self, PyObject *args)
319 +{
320 + int eintr_retry, error, fd_in, fd_out;
321 + off_t offset_out;
322 + ssize_t copyfunc_ret;
323 +#ifdef SEEK_DATA
324 + /* Linux 3.1 and later support SEEK_DATA (for sparse file support).
325 + * This code uses copy_file_range if possible, and falls back to
326 + * sendfile for cross-device or when the copy_file_range syscall
327 + * is not available (less than Linux 4.5). This will fail for
328 + * Linux less than 3.1, which does not support the lseek SEEK_DATA
329 + * parameter. The caller should perform a runtime test to verify
330 + * that this function works with the running kernel.
331 + */
332 + off_t offset_data, offset_hole;
333 + ssize_t (*copyfunc)(int, int, loff_t *, size_t);
334 +
335 + if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
336 + return NULL;
337 +
338 + eintr_retry = 1;
339 + offset_out = 0;
340 + offset_data = 0;
341 + copyfunc = cfr_wrapper;
342 +
343 + while (eintr_retry) {
344 +
345 + error = 0;
346 +
347 + Py_BEGIN_ALLOW_THREADS
348 +
349 + while (1) {
350 + /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support,
351 + * as suggested in the copy_file_range man page.
352 + */
353 + offset_data = lseek(fd_in, offset_out, SEEK_DATA);
354 + if (offset_data < 0) {
355 + if (errno == ENXIO) {
356 + /* EOF */
357 + break;
358 + }
359 + error = 1;
360 + break;
361 + }
362 +
363 + /* Create sparse empty blocks in the output file, up
364 + * until the next location that will contain data.
365 + */
366 + if (offset_data > offset_out) {
367 + offset_out = lseek(fd_out, offset_data, SEEK_SET);
368 + if (offset_out < 0) {
369 + error = 1;
370 + break;
371 + }
372 + }
373 +
374 + /* Locate the next hole, so that we know when to
375 + * stop copying. There is an implicit hole at the
376 + * end of the file.
377 + */
378 + offset_hole = lseek(fd_in, offset_data, SEEK_HOLE);
379 + if (offset_hole < 0) {
380 + error = 1;
381 + break;
382 + }
383 +
384 + /* Revert SEEK_HOLE offset change, since we're going
385 + * to copy the data that comes before the hole.
386 + */
387 + if (lseek(fd_in, offset_out, SEEK_SET) < 0) {
388 + error = 1;
389 + break;
390 + }
391 +
392 + copyfunc_ret = copyfunc(fd_out,
393 + fd_in,
394 + &offset_out,
395 + offset_hole - offset_data);
396 +
397 + if (copyfunc_ret < 0) {
398 + if ((errno == EXDEV || errno == ENOSYS) &&
399 + copyfunc == cfr_wrapper) {
400 + /* Use sendfile instead of copy_file_range for
401 + * cross-device copies, or when the copy_file_range
402 + * syscall is not available (less than Linux 4.5).
403 + */
404 + copyfunc = sendfile;
405 + copyfunc_ret = copyfunc(fd_out,
406 + fd_in,
407 + &offset_out,
408 + offset_hole - offset_data);
409 +
410 + if (copyfunc_ret < 0) {
411 + error = 1;
412 + break;
413 + }
414 + }
415 + else {
416 + error = 1;
417 + break;
418 + }
419 + }
420 + }
421 +#else
422 + /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range,
423 + * so just use sendfile for in-kernel copy. This will fail for Linux
424 + * versions from 2.6.0 to 2.6.32, because sendfile does not support
425 + * writing to regular files. The caller should perform a runtime
426 + * test to verify that this function works with the running kernel.
427 + */
428 + struct stat sb;
429 + int stat_acquired;
430 +
431 + if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
432 + return NULL;
433 +
434 + eintr_retry = 1;
435 + offset_out = 0;
436 + stat_acquired = 0;
437 +
438 + while (eintr_retry) {
439 +
440 + error = 0;
441 +
442 + Py_BEGIN_ALLOW_THREADS
443 +
444 + if (!stat_acquired && fstat(fd_in, &sb) < 0) {
445 + error = 1;
446 + }
447 + else {
448 + stat_acquired = 1;
449 + while (offset_out < sb.st_size) {
450 + copyfunc_ret = sendfile(fd_out,
451 + fd_in,
452 + &offset_out,
453 + sb.st_size - offset_out);
454 +
455 + if (copyfunc_ret < 0) {
456 + error = 1;
457 + break;
458 + }
459 + }
460 + }
461 +#endif
462 + Py_END_ALLOW_THREADS
463 +
464 + if (!(error && errno == EINTR && PyErr_CheckSignals() == 0))
465 + eintr_retry = 0;
466 + }
467 +
468 + if (error)
469 + return PyErr_SetFromErrno(PyExc_OSError);
470 +
471 + return Py_BuildValue("i", offset_out);
472 +}
473 --
474 2.10.2

Replies