>From f85e444133987003166529f4ca83d7bb72212b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Thu, 2 Oct 2014 14:07:42 +0100 Subject: [PATCH 1/4] copy: support smaller holes than the copy buffer size Previously cp would not detect runs of NULs that were smaller than the buffer size used for I/O (currently 128KiB). * src/copy.c (copy_reg): Use an independent hole_size, set to st_blksize, to increase the chances of detecting a representable hole, in a run of NULs read from the input. (create_hole): A new function refactored from sparse_copy() and extent_copy() so we have a single place to handle holes. (sparse_copy): Adjust to loop over the larger input buffer in chunks of the passed hole size. Also adjust to only call lseek once per hole, rather than up to once per input buffer. * tests/cp/sparse.sh: Add test cases for various sparse chunk sizes. * NEWS: Mention the improvement. --- NEWS | 3 + src/copy.c | 151 ++++++++++++++++++++++++++++++++++++---------------- tests/cp/sparse.sh | 33 +++++++++++ 3 files changed, 140 insertions(+), 47 deletions(-) diff --git a/NEWS b/NEWS index a323b0c..7007070 100644 --- a/NEWS +++ b/NEWS @@ -33,6 +33,9 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + cp,install,mv will convert smaller runs of NULs in the input to holes, + to reduce allocation in the copy. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/src/copy.c b/src/copy.c index b7baee4..24b8af3 100644 --- a/src/copy.c +++ b/src/copy.c @@ -145,6 +145,20 @@ utimens_symlink (char const *file, struct timespec const *timespec) return err; } +/* Create a hole at the end of a file. 
*/ + +static bool +create_hole (int fd, char const *name, off_t size) +{ + if (lseek (fd, size, SEEK_CUR) < 0) + { + error (0, errno, _("cannot lseek %s"), quote (name)); + return false; + } + + return true; +} + /* Copy the regular file open on SRC_FD/SRC_NAME to DST_FD/DST_NAME, honoring the MAKE_HOLES setting and using the BUF_SIZE-byte buffer BUF for temporary storage. Copy no more than MAX_N_READ bytes. @@ -158,18 +172,18 @@ utimens_symlink (char const *file, struct timespec const *timespec) bytes read. */ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - bool make_holes, + size_t hole_size, bool make_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) { *last_write_made_hole = false; *total_n_read = 0; + bool make_hole = false; + off_t psize = 0; while (max_n_read) { - bool make_hole = false; - ssize_t n_read = read (src_fd, buf, MIN (max_n_read, buf_size)); if (n_read < 0) { @@ -183,50 +197,94 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, max_n_read -= n_read; *total_n_read += n_read; - if (make_holes) + /* Loop over the input buffer in chunks of hole_size. */ + size_t csize = make_holes ? hole_size : buf_size; + char *cbuf = buf; + char *pbuf = buf; + + while (n_read) { - /* Sentinel required by is_nul(). */ - buf[n_read] = '\1'; -#ifdef lint - typedef uintptr_t word; - /* Usually, buf[n_read] is not the byte just before a "word" - (aka uintptr_t) boundary. In that case, the word-oriented - test below (*wp++ == 0) would read some uninitialized bytes - after the sentinel. To avoid false-positive reports about - this condition (e.g., from a tool like valgrind), set the - remaining bytes -- to any value. 
*/ - memset (buf + n_read + 1, 0, sizeof (word) - 1); -#endif + bool prev_hole = make_hole; + csize = MIN (csize, n_read); - if ((make_hole = is_nul (buf, n_read))) + if (make_holes && csize) { - if (lseek (dest_fd, n_read, SEEK_CUR) < 0) + /* Setup sentinel required by is_nul(). */ + typedef uintptr_t word; + word isnul_tmp; + memcpy (&isnul_tmp, cbuf + csize, sizeof (word)); + memset (cbuf + csize, 1, sizeof (word)); + + make_hole = is_nul (cbuf, csize); + + memcpy (cbuf + csize, &isnul_tmp, sizeof (word)); + } + + bool transition = (make_hole != prev_hole) && psize; + bool last_chunk = (n_read == csize && ! make_hole) || ! csize; + + if (transition || last_chunk) + { + if (! transition) + psize += csize; + + if (! prev_hole) { - error (0, errno, _("cannot lseek %s"), quote (dst_name)); - return false; + if (full_write (dest_fd, pbuf, psize) != psize) + { + error (0, errno, _("error writing %s"), quote (dst_name)); + return false; + } + } + else + { + if (! create_hole (dest_fd, dst_name, psize)) + return false; } - } - } - if (!make_hole) - { - size_t n = n_read; - if (full_write (dest_fd, buf, n) != n) + pbuf = cbuf; + psize = csize; + + if (last_chunk) + { + if (! csize) + n_read = 0; /* Finished processing buffer. */ + + if (transition) + csize = 0; /* Loop again to deal with last chunk. */ + else + psize = 0; /* Reset for next read loop. */ + } + } + else /* Coalesce writes/seeks. */ { - error (0, errno, _("error writing %s"), quote (dst_name)); - return false; + if (psize <= OFF_T_MAX - csize) + psize += csize; + else + { + error (0, 0, _("overflow reading %s"), quote (src_name)); + return false; + } } - /* It is tempting to return early here upon a short read from a - regular file. That would save the final read syscall for each - file. Unfortunately that doesn't work for certain files in - /proc with linux kernels from at least 2.6.9 .. 2.6.29. 
*/ + n_read -= csize; + cbuf += csize; } *last_write_made_hole = make_hole; + + /* It's tempting to break early here upon a short read from + a regular file. That would save the final read syscall + for each file. Unfortunately that doesn't work for + certain files in /proc or /sys with linux kernels. */ } - return true; + /* Ensure a trailing hole is created, so that subsequent + calls of sparse_copy() start at the correct offset. */ + if (make_hole && ! create_hole (dest_fd, dst_name, psize)) + return false; + else + return true; } /* Perform the O(1) btrfs clone operation, if possible. @@ -290,7 +348,8 @@ write_zeros (int fd, off_t n_bytes) return false. */ static bool extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - off_t src_total_size, enum Sparse_type sparse_mode, + size_t hole_size, off_t src_total_size, + enum Sparse_type sparse_mode, char const *src_name, char const *dst_name, bool *require_normal_copy) { @@ -331,7 +390,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, { off_t ext_start; off_t ext_len; - off_t hole_size; + off_t ext_hole_size; if (i < scan.ei_count) { @@ -345,11 +404,11 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, ext_len = 0; } - hole_size = ext_start - last_ext_start - last_ext_len; + ext_hole_size = ext_start - last_ext_start - last_ext_len; wrote_hole_at_eof = false; - if (hole_size) + if (ext_hole_size) { if (lseek (src_fd, ext_start, SEEK_SET) < 0) { @@ -362,11 +421,8 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, if ((empty_extent && sparse_mode == SPARSE_ALWAYS) || (!empty_extent && sparse_mode != SPARSE_NEVER)) { - if (lseek (dest_fd, ext_start, SEEK_SET) < 0) - { - error (0, errno, _("cannot lseek %s"), quote (dst_name)); - goto fail; - } + if (! 
create_hole (dest_fd, dst_name, ext_hole_size)) + goto fail; wrote_hole_at_eof = true; } else @@ -374,9 +430,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, /* When not inducing holes and when there is a hole between the end of the previous extent and the beginning of the current one, write zeros to the destination file. */ - off_t nzeros = hole_size; + off_t nzeros = ext_hole_size; if (empty_extent) - nzeros = MIN (src_total_size - dest_pos, hole_size); + nzeros = MIN (src_total_size - dest_pos, ext_hole_size); if (! write_zeros (dest_fd, nzeros)) { @@ -409,7 +465,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, sparse_mode == SPARSE_ALWAYS, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) @@ -1105,6 +1161,7 @@ copy_reg (char const *src_name, char const *dst_name, size_t buf_alignment = lcm (getpagesize (), sizeof (word)); size_t buf_alignment_slop = sizeof (word) + buf_alignment - 1; size_t buf_size = io_blksize (sb); + size_t hole_size = ST_BLKSIZE (sb); fdadvise (source_desc, 0, 0, FADVISE_SEQUENTIAL); @@ -1164,7 +1221,7 @@ copy_reg (char const *src_name, char const *dst_name, standard copy only if the initial extent scan fails. If the '--sparse=never' option is specified, write all data but use any extents to read more efficiently. */ - if (extent_copy (source_desc, dest_desc, buf, buf_size, + if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) @@ -1179,7 +1236,7 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + if ( ! 
sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, make_holes, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) diff --git a/tests/cp/sparse.sh b/tests/cp/sparse.sh index d6cc4c4..3e4a67c 100755 --- a/tests/cp/sparse.sh +++ b/tests/cp/sparse.sh @@ -37,4 +37,37 @@ test $(stat --printf %b copy) -le $(stat --printf %b sparse) || fail=1 cp --sparse=always --reflink sparse copy && fail=1 cp --sparse=never --reflink sparse copy && fail=1 + +# Ensure we handle sparse/non-sparse transitions correctly +maxn=128 # how many $hole_size chunks per file +hole_size=$(stat -c %o copy) +dd if=/dev/zero bs=$hole_size count=$maxn of=zeros || framework_failure_ +tr '\0' 'U' < zeros > nonzero || framework_failure_ + +for pattern in 1 0; do + test "$pattern" = 1 && pattern="$(printf '%s\n%s' nonzero zeros)" + test "$pattern" = 0 && pattern="$(printf '%s\n%s' zeros nonzero)" + + for n in 1 2 4 11 32 $maxn; do + parts=$(expr $maxn / $n) + + rm -f sparse.in + + # Generate non sparse file for copying with alternating + # hole/data patterns of size n * $hole_size + for i in $(yes "$pattern" | head -n$parts); do + dd iflag=fullblock if=$i of=sparse.in conv=notrunc oflag=append \ + bs=$hole_size count=$n status=none || framework_failure_ + done + + cp --sparse=always sparse.in sparse.out || fail=1 # non sparse input + cp --sparse=always sparse.out sparse.out2 || fail=1 # sparse input + + cmp sparse.in sparse.out || fail=1 + cmp sparse.in sparse.out2 || fail=1 + + ls -lsh sparse.* + done +done + Exit $fail -- 1.7.7.6 >From 3f048b220c2cd1aaee10d32760df2c13705d845e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 10:19:58 +0100 Subject: [PATCH 2/4] cp: avoid speculative preallocation with --sparse=always With --sparse=always use fallocate(...PUNCH_HOLE...) to avoid any permanent allocation due to speculative preallocation employed by file systems such as XFS. * m4/jm-macros.m4: Check for <linux/falloc.h> and fallocate().
* src/copy.c (punch_hole): A new function to try and punch a hole at the specified offset if supported. (create_hole): Call punch_hole() after requesting a hole. (extent_copy): Likewise. * NEWS: Mention the improvement. --- NEWS | 2 +- m4/jm-macros.m4 | 2 + src/copy.c | 76 ++++++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/NEWS b/NEWS index 7007070..e7aef77 100644 --- a/NEWS +++ b/NEWS @@ -34,7 +34,7 @@ GNU coreutils NEWS -*- outline -*- ** Improvements cp,install,mv will convert smaller runs of NULs in the input to holes, - to reduce allocation in the copy. + and cp --sparse=always avoids speculative preallocation on XFS for example. mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4 index a96ecab..07b9085 100644 --- a/m4/jm-macros.m4 +++ b/m4/jm-macros.m4 @@ -78,6 +78,7 @@ AC_DEFUN([coreutils_MACROS], AC_CHECK_FUNCS_ONCE([ endgrent endpwent + fallocate fchown fchmod ftruncate @@ -189,6 +190,7 @@ AC_DEFUN([gl_CHECK_ALL_HEADERS], [ AC_CHECK_HEADERS_ONCE([ hurd.h + linux/falloc.h paths.h priv.h stropts.h diff --git a/src/copy.c b/src/copy.c index 24b8af3..85a4c59 100644 --- a/src/copy.c +++ b/src/copy.c @@ -70,6 +70,10 @@ # include "verror.h" #endif +#if HAVE_LINUX_FALLOC_H +# include <linux/falloc.h> +#endif + #ifndef HAVE_FCHOWN # define HAVE_FCHOWN false # define fchown(fd, uid, gid) (-1) @@ -145,20 +149,54 @@ utimens_symlink (char const *file, struct timespec const *timespec) return err; } -/* Create a hole at the end of a file. */ +/* Attempt to punch a hole to avoid any permanent + speculative preallocation on file systems such as XFS. + Return values as per fallocate(2) except ENOSYS etc. are ignored.
*/ + +static int +punch_hole (int fd, off_t offset, off_t length) +{ + int ret = 0; +#if HAVE_FALLOCATE +# if defined FALLOC_FL_PUNCH_HOLE && defined FALLOC_FL_KEEP_SIZE + ret = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, length); + if (ret < 0 + && (errno == EOPNOTSUPP || errno == ENOTSUP || errno == ENOSYS)) + ret = 0; +# endif +#endif + return ret; +} + +/* Create a hole at the end of a file, + avoiding preallocation if requested. */ static bool -create_hole (int fd, char const *name, off_t size) +create_hole (int fd, char const *name, bool punch_holes, off_t size) { - if (lseek (fd, size, SEEK_CUR) < 0) + off_t file_end = lseek (fd, size, SEEK_CUR); + + if (file_end < 0) { error (0, errno, _("cannot lseek %s"), quote (name)); return false; } + /* Some file systems (like XFS) preallocate when write extending a file. + I.E. a previous write() may have preallocated extra space + that the seek above will not discard. A subsequent write() could + then make this allocation permanent. */ + if (punch_holes && punch_hole (fd, file_end - size, size) < 0) + { + error (0, errno, _("error deallocating %s"), quote (name)); + return false; + } + return true; } + /* Copy the regular file open on SRC_FD/SRC_NAME to DST_FD/DST_NAME, honoring the MAKE_HOLES setting and using the BUF_SIZE-byte buffer BUF for temporary storage. Copy no more than MAX_N_READ bytes. @@ -172,7 +210,7 @@ create_hole (int fd, char const *name, off_t size) bytes read. */ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - size_t hole_size, bool make_holes, + size_t hole_size, bool punch_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) @@ -198,7 +236,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, *total_n_read += n_read; /* Loop over the input buffer in chunks of hole_size. */ - size_t csize = make_holes ? hole_size : buf_size; + size_t csize = hole_size ? 
hole_size : buf_size; char *cbuf = buf; char *pbuf = buf; @@ -207,7 +245,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, bool prev_hole = make_hole; csize = MIN (csize, n_read); - if (make_holes && csize) + if (hole_size && csize) { /* Setup sentinel required by is_nul(). */ typedef uintptr_t word; @@ -238,7 +276,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, } else { - if (! create_hole (dest_fd, dst_name, psize)) + if (! create_hole (dest_fd, dst_name, punch_holes, psize)) return false; } @@ -281,7 +319,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, /* Ensure a trailing hole is created, so that subsequent calls of sparse_copy() start at the correct offset. */ - if (make_hole && ! create_hole (dest_fd, dst_name, psize)) + if (make_hole && ! create_hole (dest_fd, dst_name, punch_holes, psize)) return false; else return true; @@ -421,7 +459,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, if ((empty_extent && sparse_mode == SPARSE_ALWAYS) || (!empty_extent && sparse_mode != SPARSE_NEVER)) { - if (! create_hole (dest_fd, dst_name, ext_hole_size)) + if (! create_hole (dest_fd, dst_name, + sparse_mode == SPARSE_ALWAYS, + ext_hole_size)) goto fail; wrote_hole_at_eof = true; } @@ -465,9 +505,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, - sparse_mode == SPARSE_ALWAYS, - src_name, dst_name, ext_len, &n_read, + if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + sparse_mode == SPARSE_ALWAYS ? 
hole_size: 0, + true, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) goto fail; @@ -509,6 +549,13 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, return false; } + if (sparse_mode == SPARSE_ALWAYS && dest_pos < src_total_size + && punch_hole (dest_fd, dest_pos, src_total_size - dest_pos) < 0) + { + error (0, errno, _("error deallocating %s"), quote (dst_name)); + return false; + } + return true; } @@ -1236,8 +1283,9 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, - make_holes, src_name, dst_name, + if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + make_holes ? hole_size : 0, + x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) || (wrote_hole_at_eof -- 1.7.7.6 >From 2812e602fc481bbad7d11c1597a0fd4689905d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 11:02:34 +0100 Subject: [PATCH 3/4] cp: read sparse files more efficiently with non regular destination * src/copy.c (copy_reg): Use fiemap to read sparse files, even if the output is not to a regular file. * NEWS: Mention the improvement. --- NEWS | 3 +++ src/copy.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index e7aef77..52332bd 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,9 @@ GNU coreutils NEWS -*- outline -*- cp,install,mv will convert smaller runs of NULs in the input to holes, and cp --sparse=always avoids speculative preallocation on XFS for example. + cp will read sparse files more efficiently when the destination is a + non regular file. For example when copying a disk image to a device node. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries.
diff --git a/src/copy.c b/src/copy.c index 85a4c59..b8e12c2 100644 --- a/src/copy.c +++ b/src/copy.c @@ -1214,7 +1214,7 @@ copy_reg (char const *src_name, char const *dst_name, /* Deal with sparse files. */ bool make_holes = false; - bool sparse_src = false; + bool sparse_src = is_probably_sparse (&src_open_sb); if (S_ISREG (sb.st_mode)) { @@ -1227,7 +1227,6 @@ copy_reg (char const *src_name, char const *dst_name, blocks. If the file has fewer blocks than would normally be needed for a file of its size, then at least one of the blocks in the file is a hole. */ - sparse_src = is_probably_sparse (&src_open_sb); if (x->sparse_mode == SPARSE_AUTO && sparse_src) make_holes = true; } @@ -1270,7 +1269,7 @@ copy_reg (char const *src_name, char const *dst_name, any extents to read more efficiently. */ if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, - S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, + make_holes ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) goto preserve_metadata; -- 1.7.7.6 >From f88224422669edf37653a8372e9a4a86d540972f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Tue, 7 Oct 2014 19:48:53 +0100 Subject: [PATCH 4/4] copy: avoid an extraneous error when reporting errors * src/copy.c (copy_reg): If sparse_copy() failed, then an erroneous error about failing to extend the file would be reported. --- src/copy.c | 17 ++++++++++------- 1 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/copy.c b/src/copy.c index b8e12c2..446e72d 100644 --- a/src/copy.c +++ b/src/copy.c @@ -1282,13 +1282,16 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, - make_holes ? hole_size : 0, - x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, - UINTMAX_MAX, &n_read, - &wrote_hole_at_eof) - || (wrote_hole_at_eof - && ftruncate (dest_desc, n_read) < 0)) + if (! 
sparse_copy (source_desc, dest_desc, buf, buf_size, + make_holes ? hole_size : 0, + x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, + UINTMAX_MAX, &n_read, + &wrote_hole_at_eof)) + { + return_val = false; + goto close_src_and_dst_desc; + } + else if (wrote_hole_at_eof && ftruncate (dest_desc, n_read) < 0) { error (0, errno, _("failed to extend %s"), quote (dst_name)); return_val = false; -- 1.7.7.6