1 |
Perform in-kernel file copying when possible, and also support |
2 |
reflinks and sparse files. If the optimized implementation |
3 |
fails at runtime, gracefully fall back to a plain read/write |
4 |
loop. |
5 |
|
6 |
Compile-time and run-time fallbacks are implemented, so that |
7 |
any incompatibilities will be handled gracefully. For example, |
8 |
if the code is compiled on a system that supports the |
9 |
copy_file_range syscall, but at run-time an older kernel that |
10 |
does not support this syscall is detected, it will be handled |
11 |
gracefully. There are similar fallbacks for lack of lseek |
12 |
SEEK_DATA and sendfile support. |
13 |
|
14 |
X-Gentoo-Bug: 607868 |
15 |
X-Gentoo-Bug-Url: https://bugs.gentoo.org/show_bug.cgi?id=607868 |
16 |
--- |
17 |
[PATCH v2] adds a native fallback that will work on any kernel, |
18 |
using a plain read/write loop. |
19 |
|
20 |
pym/portage/tests/util/file_copy/__init__.py | 0 |
21 |
pym/portage/tests/util/file_copy/__test__.py | 0 |
22 |
pym/portage/tests/util/file_copy/test_copyfile.py | 68 +++++ |
23 |
pym/portage/util/file_copy/__init__.py | 45 ++++ |
24 |
pym/portage/util/movefile.py | 4 +- |
25 |
setup.py | 9 + |
26 |
src/portage_util_file_copy_reflink_linux.c | 298 ++++++++++++++++++++++ |
27 |
7 files changed, 423 insertions(+), 1 deletion(-) |
28 |
create mode 100644 pym/portage/tests/util/file_copy/__init__.py |
29 |
create mode 100644 pym/portage/tests/util/file_copy/__test__.py |
30 |
create mode 100644 pym/portage/tests/util/file_copy/test_copyfile.py |
31 |
create mode 100644 pym/portage/util/file_copy/__init__.py |
32 |
create mode 100644 src/portage_util_file_copy_reflink_linux.c |
33 |
|
34 |
diff --git a/pym/portage/tests/util/file_copy/__init__.py b/pym/portage/tests/util/file_copy/__init__.py |
35 |
new file mode 100644 |
36 |
index 0000000..e69de29 |
37 |
diff --git a/pym/portage/tests/util/file_copy/__test__.py b/pym/portage/tests/util/file_copy/__test__.py |
38 |
new file mode 100644 |
39 |
index 0000000..e69de29 |
40 |
diff --git a/pym/portage/tests/util/file_copy/test_copyfile.py b/pym/portage/tests/util/file_copy/test_copyfile.py |
41 |
new file mode 100644 |
42 |
index 0000000..94fb4d7 |
43 |
--- /dev/null |
44 |
+++ b/pym/portage/tests/util/file_copy/test_copyfile.py |
45 |
@@ -0,0 +1,68 @@ |
46 |
+# Copyright 2017 Gentoo Foundation |
47 |
+# Distributed under the terms of the GNU General Public License v2 |
48 |
+ |
49 |
+import shutil |
50 |
+import tempfile |
51 |
+ |
52 |
+from portage import os |
53 |
+from portage.tests import TestCase |
54 |
+from portage.checksum import perform_md5 |
55 |
+from portage.util.file_copy import copyfile |
56 |
+ |
57 |
+ |
58 |
+class CopyFileTestCase(TestCase): |
59 |
+ |
60 |
+ def testCopyFile(self): |
61 |
+ |
62 |
+ tempdir = tempfile.mkdtemp() |
63 |
+ try: |
64 |
+ src_path = os.path.join(tempdir, 'src') |
65 |
+ dest_path = os.path.join(tempdir, 'dest') |
66 |
+ content = b'foo' |
67 |
+ |
68 |
+ with open(src_path, 'wb') as f: |
69 |
+ f.write(content) |
70 |
+ |
71 |
+ copyfile(src_path, dest_path) |
72 |
+ |
73 |
+ self.assertEqual(perform_md5(src_path), perform_md5(dest_path)) |
74 |
+ finally: |
75 |
+ shutil.rmtree(tempdir) |
76 |
+ |
77 |
+ |
78 |
+class CopyFileSparseTestCase(TestCase): |
79 |
+ |
80 |
+ def testCopyFileSparse(self): |
81 |
+ |
82 |
+ tempdir = tempfile.mkdtemp() |
83 |
+ try: |
84 |
+ src_path = os.path.join(tempdir, 'src') |
85 |
+ dest_path = os.path.join(tempdir, 'dest') |
86 |
+ content = b'foo' |
87 |
+ |
88 |
+ # Use seek to create some sparse blocks. Don't make these |
89 |
+ # files too big, in case the filesystem doesn't support |
90 |
+ # sparse files. |
91 |
+ with open(src_path, 'wb') as f: |
92 |
+ f.write(content) |
93 |
+ f.seek(2**18, 1) |
94 |
+ f.write(content) |
95 |
+ f.seek(2**19, 1) |
96 |
+ f.write(content) |
97 |
+ |
98 |
+ copyfile(src_path, dest_path) |
99 |
+ |
100 |
+ self.assertEqual(perform_md5(src_path), perform_md5(dest_path)) |
101 |
+ |
102 |
+ # This last part of the test is expected to fail when sparse |
103 |
+ # copy is not implemented, so set the todo flag in order |
104 |
+ # to tolerate failures. |
105 |
+ self.todo = True |
106 |
+ |
107 |
+ # If sparse blocks were preserved, then both files should |
108 |
+ # consume the same number of blocks. |
109 |
+ self.assertEqual( |
110 |
+ os.stat(src_path).st_blocks, |
111 |
+ os.stat(dest_path).st_blocks) |
112 |
+ finally: |
113 |
+ shutil.rmtree(tempdir) |
114 |
diff --git a/pym/portage/util/file_copy/__init__.py b/pym/portage/util/file_copy/__init__.py |
115 |
new file mode 100644 |
116 |
index 0000000..34c46ad |
117 |
--- /dev/null |
118 |
+++ b/pym/portage/util/file_copy/__init__.py |
119 |
@@ -0,0 +1,45 @@ |
120 |
+# Copyright 2017 Gentoo Foundation |
121 |
+# Distributed under the terms of the GNU General Public License v2 |
122 |
+ |
123 |
+import os |
124 |
+import shutil |
125 |
+import tempfile |
126 |
+ |
127 |
+try: |
128 |
+ from portage.util.file_copy.reflink_linux import file_copy as _file_copy |
129 |
+except ImportError: |
130 |
+ _file_copy = None |
131 |
+ |
132 |
+ |
133 |
+_copyfile = None |
134 |
+ |
135 |
+ |
136 |
+def _optimized_copyfile(src, dst): |
137 |
+ with open(src, 'rb', buffering=0) as src_file, \ |
138 |
+ open(dst, 'wb', buffering=0) as dst_file: |
139 |
+ _file_copy(src_file.fileno(), dst_file.fileno()) |
140 |
+ |
141 |
+ |
142 |
+def copyfile(src, dst): |
143 |
+ """ |
144 |
+ Copy the contents (no metadata) of the file named src to a file |
145 |
+ named dst. |
146 |
+ |
147 |
+ If possible, copying is done within the kernel, and uses |
148 |
+ "copy acceleration" techniques (such as reflinks). This also |
149 |
+ supports sparse files. |
150 |
+ |
151 |
+ @param src: path of source file |
152 |
+ @type src: str |
153 |
+ @param dst: path of destination file |
154 |
+ @type dst: str |
155 |
+ """ |
156 |
+ global _copyfile |
157 |
+ |
158 |
+ if _copyfile is None: |
159 |
+ if _file_copy is None: |
160 |
+ _copyfile = shutil.copyfile |
161 |
+ else: |
162 |
+ _copyfile = _optimized_copyfile |
163 |
+ |
164 |
+ _copyfile(src, dst) |
165 |
diff --git a/pym/portage/util/movefile.py b/pym/portage/util/movefile.py |
166 |
index 4be1c3b..88b35d3 100644 |
167 |
--- a/pym/portage/util/movefile.py |
168 |
+++ b/pym/portage/util/movefile.py |
169 |
@@ -23,6 +23,8 @@ from portage.localization import _ |
170 |
from portage.process import spawn |
171 |
from portage.util import writemsg |
172 |
from portage.util._xattr import xattr |
173 |
+from portage.util.file_copy import copyfile |
174 |
+ |
175 |
|
176 |
def _apply_stat(src_stat, dest): |
177 |
_os.chown(dest, src_stat.st_uid, src_stat.st_gid) |
178 |
@@ -114,7 +116,7 @@ def movefile(src, dest, newmtime=None, sstat=None, mysettings=None, |
179 |
_copyfile = selinux.copyfile |
180 |
_rename = selinux.rename |
181 |
else: |
182 |
- _copyfile = _shutil.copyfile |
183 |
+ _copyfile = copyfile |
184 |
_rename = _os.rename |
185 |
|
186 |
lchown = _unicode_func_wrapper(portage.data.lchown, encoding=encoding) |
187 |
diff --git a/setup.py b/setup.py |
188 |
index a346bd4..b624767 100755 |
189 |
--- a/setup.py |
190 |
+++ b/setup.py |
191 |
@@ -23,6 +23,7 @@ import collections |
192 |
import glob |
193 |
import os |
194 |
import os.path |
195 |
+import platform |
196 |
import re |
197 |
import subprocess |
198 |
import sys |
199 |
@@ -54,6 +55,14 @@ x_c_helpers = { |
200 |
], |
201 |
} |
202 |
|
203 |
+if platform.system() == 'Linux': |
204 |
+ x_c_helpers.update({ |
205 |
+ 'portage.util.file_copy.reflink_linux': [ |
206 |
+ 'src/portage_util_file_copy_reflink_linux.c', |
207 |
+ ], |
208 |
+ }) |
209 |
+ |
210 |
+ |
211 |
class x_build(build): |
212 |
""" Build command with extra build_man call. """ |
213 |
|
214 |
diff --git a/src/portage_util_file_copy_reflink_linux.c b/src/portage_util_file_copy_reflink_linux.c |
215 |
new file mode 100644 |
216 |
index 0000000..fde3cd3 |
217 |
--- /dev/null |
218 |
+++ b/src/portage_util_file_copy_reflink_linux.c |
219 |
@@ -0,0 +1,298 @@ |
220 |
+/* Copyright 2017 Gentoo Foundation |
221 |
+ * Distributed under the terms of the GNU General Public License v2 |
222 |
+ */ |
223 |
+ |
224 |
+#include <Python.h> |
225 |
+#include <errno.h> |
226 |
+#include <stdlib.h> |
227 |
+#include <ctype.h> |
228 |
+#include <sys/sendfile.h> |
229 |
+#include <sys/stat.h> |
230 |
+#include <sys/syscall.h> |
231 |
+#include <sys/types.h> |
232 |
+#include <unistd.h> |
233 |
+ |
234 |
+static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *); |
235 |
+ |
236 |
+static PyMethodDef reflink_linuxMethods[] = { |
237 |
+ { |
238 |
+ "file_copy", |
239 |
+ _reflink_linux_file_copy, |
240 |
+ METH_VARARGS, |
241 |
+ "Copy between two file descriptors, " |
242 |
+ "with reflink and sparse file support." |
243 |
+ }, |
244 |
+ {NULL, NULL, 0, NULL} |
245 |
+}; |
246 |
+ |
247 |
+#if PY_MAJOR_VERSION >= 3 |
248 |
+static struct PyModuleDef moduledef = { |
249 |
+ PyModuleDef_HEAD_INIT, |
250 |
+ "reflink_linux", /* m_name */ |
251 |
+ "Module for reflink_linux copy operations", /* m_doc */ |
252 |
+ -1, /* m_size */ |
253 |
+ reflink_linuxMethods, /* m_methods */ |
254 |
+ NULL, /* m_reload */ |
255 |
+ NULL, /* m_traverse */ |
256 |
+ NULL, /* m_clear */ |
257 |
+ NULL, /* m_free */ |
258 |
+}; |
259 |
+ |
260 |
+PyMODINIT_FUNC |
261 |
+PyInit_reflink_linux(void) |
262 |
+{ |
263 |
+ PyObject *m; |
264 |
+ m = PyModule_Create(&moduledef); |
265 |
+ return m; |
266 |
+} |
267 |
+#else |
268 |
+PyMODINIT_FUNC |
269 |
+initreflink_linux(void) |
270 |
+{ |
271 |
+ Py_InitModule("reflink_linux", reflink_linuxMethods); |
272 |
+} |
273 |
+#endif |
274 |
+ |
275 |
+ |
276 |
+static ssize_t |
277 |
+cfr_wrapper(int fd_out, int fd_in, loff_t *off_out, size_t len) |
278 |
+{ |
279 |
+#ifdef __NR_copy_file_range |
280 |
+ return syscall(__NR_copy_file_range, fd_in, NULL, fd_out, |
281 |
+ off_out, len, 0); |
282 |
+#else |
283 |
+ /* This is how it fails at runtime when the syscall is not supported. */ |
284 |
+ errno = ENOSYS; |
285 |
+ return -1; |
286 |
+#endif |
287 |
+} |
288 |
+ |
289 |
+ |
290 |
+static off_t |
291 |
+do_lseek(int fd_out, int fd_in, loff_t *off_out) { |
292 |
+#ifdef SEEK_DATA |
293 |
+ /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support, |
294 |
+ * as suggested in the copy_file_range man page. |
295 |
+ */ |
296 |
+ off_t offset_data, offset_hole; |
297 |
+ |
298 |
+ offset_data = lseek(fd_in, *off_out, SEEK_DATA); |
299 |
+ if (offset_data < 0) { |
300 |
+ if (errno == ENXIO) { |
301 |
+ return 0; |
302 |
+ } |
303 |
+ return -1; |
304 |
+ } |
305 |
+ |
306 |
+ /* Create sparse empty blocks in the output file, up |
307 |
+ * until the next location that will contain data. |
308 |
+ */ |
309 |
+ if (offset_data > *off_out) { |
310 |
+ if (lseek(fd_out, offset_data, SEEK_SET) < 0) { |
311 |
+ return -1; |
312 |
+ } |
313 |
+ *off_out = offset_data; |
314 |
+ } |
315 |
+ |
316 |
+ /* Locate the next hole, so that we know when to |
317 |
+ * stop copying. There is an implicit hole at the |
318 |
+ * end of the file. |
319 |
+ */ |
320 |
+ offset_hole = lseek(fd_in, offset_data, SEEK_HOLE); |
321 |
+ if (offset_hole < 0) { |
322 |
+ return -1; |
323 |
+ } |
324 |
+ |
325 |
+ /* Revert SEEK_HOLE offset change, since we're going |
326 |
+ * to copy the data that comes before the hole. |
327 |
+ */ |
328 |
+ if (lseek(fd_in, *off_out, SEEK_SET) < 0) { |
329 |
+ return -1; |
330 |
+ } |
331 |
+ |
332 |
+ return offset_hole - offset_data; |
333 |
+#else |
334 |
+ /* This is how it fails at runtime when lseek SEEK_DATA is not supported. */ |
335 |
+ errno = EINVAL; |
336 |
+ return -1; |
337 |
+#endif |
338 |
+} |
339 |
+ |
340 |
+ |
341 |
+static PyObject * |
342 |
+_reflink_linux_file_copy(PyObject *self, PyObject *args) |
343 |
+{ |
344 |
+ int eintr_retry, error, fd_in, fd_out, stat_in_acquired, stat_out_acquired; |
345 |
+ int lseek_works, sendfile_works; |
346 |
+ off_t offset_out, len; |
347 |
+ ssize_t buf_bytes, buf_offset, copyfunc_ret; |
348 |
+ struct stat stat_in, stat_out; |
349 |
+ char* buf; |
350 |
+ ssize_t (*copyfunc)(int, int, loff_t *, size_t); |
351 |
+ |
352 |
+ if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out)) |
353 |
+ return NULL; |
354 |
+ |
355 |
+ eintr_retry = 1; |
356 |
+ offset_out = 0; |
357 |
+ stat_in_acquired = 0; |
358 |
+ stat_out_acquired = 0; |
359 |
+ buf = NULL; |
360 |
+ buf_bytes = 0; |
361 |
+ buf_offset = 0; |
362 |
+ copyfunc = cfr_wrapper; |
363 |
+ lseek_works = 1; |
364 |
+ sendfile_works = 1; |
365 |
+ |
366 |
+ while (eintr_retry) { |
367 |
+ |
368 |
+ Py_BEGIN_ALLOW_THREADS |
369 |
+ |
370 |
+ /* Linux 3.1 and later support SEEK_DATA (for sparse file support). |
371 |
+ * This code uses copy_file_range if possible, and falls back to |
372 |
+ * sendfile for cross-device or when the copy_file_range syscall |
373 |
+ * is not available (less than Linux 4.5). This will fail for |
374 |
+ * Linux less than 3.1, which does not support the lseek SEEK_DATA |
375 |
+ * parameter. |
376 |
+ */ |
377 |
+ if (sendfile_works && lseek_works) { |
378 |
+ error = 0; |
379 |
+ |
380 |
+ while (1) { |
381 |
+ len = do_lseek(fd_out, fd_in, &offset_out); |
382 |
+ if (!len) { |
383 |
+ /* EOF */ |
384 |
+ break; |
385 |
+ } |
386 |
+ else if (len < 0) { |
387 |
+ if (errno == EINVAL && !offset_out) { |
388 |
+ lseek_works = 0; |
389 |
+ } |
390 |
+ error = 1; |
391 |
+ break; |
392 |
+ } |
393 |
+ |
394 |
+ copyfunc_ret = copyfunc(fd_out, |
395 |
+ fd_in, |
396 |
+ &offset_out, |
397 |
+ len); |
398 |
+ |
399 |
+ if (copyfunc_ret < 0) { |
400 |
+ if ((errno == EXDEV || errno == ENOSYS) && |
401 |
+ copyfunc == cfr_wrapper) { |
402 |
+ /* Use sendfile instead of copy_file_range for |
403 |
+ * cross-device copies, or when the copy_file_range |
404 |
+ * syscall is not available (less than Linux 4.5). |
405 |
+ */ |
406 |
+ copyfunc = sendfile; |
407 |
+ copyfunc_ret = copyfunc(fd_out, |
408 |
+ fd_in, |
409 |
+ &offset_out, |
410 |
+ len); |
411 |
+ |
412 |
+ if (copyfunc_ret < 0) { |
413 |
+ /* On Linux, if lseek succeeded above, then |
414 |
+ * sendfile should have worked here too, so |
415 |
+ * don't bother to fallback for EINVAL here. |
416 |
+ */ |
417 |
+ error = 1; |
418 |
+ break; |
419 |
+ } |
420 |
+ } |
421 |
+ else { |
422 |
+ error = 1; |
423 |
+ break; |
424 |
+ } |
425 |
+ } |
426 |
+ } |
427 |
+ } |
428 |
+ |
429 |
+ /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range, |
430 |
+ * so just use sendfile for in-kernel copy. This will fail for Linux |
431 |
+ * versions from 2.6.0 to 2.6.32, because sendfile does not support |
432 |
+ * writing to regular files. |
433 |
+ */ |
434 |
+ if (sendfile_works && !lseek_works) { |
435 |
+ error = 0; |
436 |
+ |
437 |
+ if (!stat_in_acquired && fstat(fd_in, &stat_in) < 0) { |
438 |
+ error = 1; |
439 |
+ } |
440 |
+ else { |
441 |
+ stat_in_acquired = 1; |
442 |
+ while (offset_out < stat_in.st_size) { |
443 |
+ copyfunc_ret = sendfile(fd_out, |
444 |
+ fd_in, |
445 |
+ &offset_out, |
446 |
+ stat_in.st_size - offset_out); |
447 |
+ |
448 |
+ if (copyfunc_ret < 0) { |
449 |
+ if (errno == EINVAL && !offset_out) { |
450 |
+ sendfile_works = 0; |
451 |
+ } |
452 |
+ error = 1; |
453 |
+ break; |
454 |
+ } |
455 |
+ } |
456 |
+ } |
457 |
+ } |
458 |
+ |
459 |
+ /* This implementation will work on any kernel. */ |
460 |
+ if (!sendfile_works) { |
461 |
+ error = 0; |
462 |
+ |
463 |
+ if (!stat_out_acquired && fstat(fd_in, &stat_out) < 0) { |
464 |
+ error = 1; |
465 |
+ } |
466 |
+ else { |
467 |
+ stat_out_acquired = 1; |
468 |
+ if (buf == NULL) |
469 |
+ buf = malloc(stat_out.st_blksize); |
470 |
+ if (buf == NULL) { |
471 |
+ error = 1; |
472 |
+ } |
473 |
+ else { |
474 |
+ while (1) { |
475 |
+ /* Some bytes may still be buffered from the |
476 |
+ * previous iteration of the outer loop. |
477 |
+ */ |
478 |
+ if (!buf_bytes) { |
479 |
+ buf_offset = 0; |
480 |
+ buf_bytes = read(fd_in, buf, stat_out.st_blksize); |
481 |
+ |
482 |
+ if (!buf_bytes) { |
483 |
+ /* EOF */ |
484 |
+ break; |
485 |
+ } |
486 |
+ else if (buf_bytes < 0) { |
487 |
+ error = 1; |
488 |
+ break; |
489 |
+ } |
490 |
+ } |
491 |
+ |
492 |
+ copyfunc_ret = write(fd_out, buf + buf_offset, buf_bytes); |
493 |
+ if (copyfunc_ret < 0) { |
494 |
+ error = 1; |
495 |
+ break; |
496 |
+ } |
497 |
+ |
498 |
+ buf_bytes -= copyfunc_ret; |
499 |
+ buf_offset += copyfunc_ret; |
500 |
+ } |
501 |
+ } |
502 |
+ } |
503 |
+ } |
504 |
+ |
505 |
+ Py_END_ALLOW_THREADS |
506 |
+ |
507 |
+ if (!(error && errno == EINTR && PyErr_CheckSignals() == 0)) |
508 |
+ eintr_retry = 0; |
509 |
+ } |
510 |
+ |
511 |
+ free(buf); |
512 |
+ |
513 |
+ if (error) |
514 |
+ return PyErr_SetFromErrno(PyExc_OSError); |
515 |
+ |
516 |
+ return Py_BuildValue("i", offset_out); |
517 |
+} |
518 |
-- |
519 |
2.10.2 |