>From 364e43d3502e7317b84a0ad549a4bef61f2fd8b7 Mon Sep 17 00:00:00 2001 From: Pavel Raiskup Date: Sun, 23 Feb 2014 13:12:54 +0100 Subject: [PATCH] tar: use SEEK_HOLE for hole detection Reuse the SEEK_HOLE/SEEK_DATA feature of lseek when possible. This makes archiving of sparse files much faster on capable file-systems as tar does not need to perform an additional and expensive read of the whole sparse file to detect the "map" of data chunks. Also implement --hole-detection option to allow users to explicitly choose the old behavior. * src/common.h (HOLE_DETECTION_RAW, HOLE_DETECTION_SEEK) (HOLE_DETECTION_ALL, hole_detection): New constants and variable. * src/sparse.c (sparse_scan_file_wholesparse): New function as a method for detecting sparse files without any data. (sparse_scan_file_raw): Renamed from sparse_scan_file, removed the completely-sparse detection if-branch. (sparse_scan_file_seek): Implements method for hole detection using lseek. (sparse_scan_file): Reimplemented function as a wrapper for all methods. * src/tar.c (HOLE_DETECTION_OPTION): New option. (parse_opt): Handle new --hole-detection option. * tests/sparse02.at: Use --hole-detection=raw as the seek method creates little bit bigger archives causing test to fail. * tests/checkseekhole.c: SEEK_HOLE detection helper. * tests/sparsemv.at: Use --hole-detection=raw. * tests/sparsemvp.at: Likewise. * tests/sparse06.at: New test-case. * tests/testsuite.at: Cover new testcase. * tests/Makefile.am: Likewise. * tests/.gitignore: Mention two test binaries. * doc/tar.1: Document. * doc/tar.texi: Likewise. 
--- doc/tar.1 | 6 ++ doc/tar.texi | 70 +++++++++++++++----- src/common.h | 6 ++ src/sparse.c | 173 ++++++++++++++++++++++++++++++++++++++++---------- src/tar.c | 16 +++++ tests/.gitignore | 2 + tests/Makefile.am | 4 +- tests/checkseekhole.c | 90 ++++++++++++++++++++++++++ tests/sparse02.at | 2 +- tests/sparse06.at | 56 ++++++++++++++++ tests/sparsemv.at | 1 + tests/sparsemvp.at | 1 + tests/testsuite.at | 16 +++++ 13 files changed, 393 insertions(+), 50 deletions(-) create mode 100644 tests/checkseekhole.c create mode 100644 tests/sparse06.at diff --git a/doc/tar.1 b/doc/tar.1 index 6e2aa74..ebea138 100644 --- a/doc/tar.1 +++ b/doc/tar.1 @@ -259,6 +259,12 @@ When listing or extracting, the actual contents of \fIFILE\fR is not inspected, it is needed only due to syntactical requirements. It is therefore common practice to use \fB/dev/null\fR in its place. .TP +\fB\-\-hole\-detection\fR=\fIMETHOD\fR +Use method to detect holes in sparse files. This option implies +\fB\-\-sparse\fR. Currently there are \fIseek\fR and \fIraw\fR methods +implemented. Default is \fIseek\fR with fallback to \fIraw\fR when not +applicable. +.TP \fB\-G\fR, \fB\-\-incremental\fR Handle old GNU-format incremental backups. .TP diff --git a/doc/tar.texi b/doc/tar.texi index 9713cb0..8a59461 100644 --- a/doc/tar.texi +++ b/doc/tar.texi @@ -2782,6 +2782,13 @@ they refer to, instead of creating usual hard link members. @command{tar} will print out a short message summarizing the operations and options to @command{tar} and exit. @xref{help}. address@hidden address@hidden address@hidden +Use method to detect holes in sparse files. This option implies address@hidden Currently there are @var{seek} and @var{raw} methods +implemented. Default is @var{seek} with fallback to @var{raw} when not +applicable. @xref{sparse}. + @opsummary{ignore-case} @item --ignore-case Ignore case when matching member or file names with @@ -9536,13 +9543,14 @@ could create an archive longer than the original. 
To have @command{tar} attempt to recognize the holes in a file, use @option{--sparse} (@option{-S}). When you use this option, then, for any file using less disk space than would be expected from its length, @command{tar} -searches the file for consecutive stretches of zeros. It then records -in the archive for the file where the consecutive stretches of zeros -are, and only archives the ``real contents'' of the file. On -extraction (using @option{--sparse} is not needed on extraction) any -such files have holes created wherever the continuous stretches of zeros -were found. Thus, if you use @option{--sparse}, @command{tar} archives -won't take more space than the original. +searches the file for holes. It then records in the archive for the file where +the holes (consecutive stretches of zeros) are, and only archives the +``real contents'' of the file. On extraction (using @option{--sparse} is not +needed on extraction) any such files also have holes created wherever the holes +were found. Thus, if you use @option{--sparse}, @command{tar} archives won't +take more space than the original. +GNU tar uses different methods for detecting holes; for more information, see +@option{--hole-detection}. @table @option @opindex sparse @@ -9568,13 +9576,15 @@ will never take more space on the media than the files take on disk (otherwise, archiving a disk filled with sparse files might take hundreds of tapes). @xref{Incremental Dumps}. -However, be aware that @option{--sparse} option presents a serious -drawback. Namely, in order to determine if the file is sparse address@hidden has to read it before trying to archive it, so in total -the file is read @strong{twice}. So, always bear in mind that the -time needed to process all files with this option is roughly twice -the time needed to archive them without it. address@hidden technical note: +However, be aware that @option{--sparse} option may present a serious +drawback. 
Namely, in order to determine the positions of holes in a file address@hidden may have to read it before trying to archive it, so in total +the file may be read @strong{twice}. This may happen when your OS or your FS +does not support the @dfn{SEEK_HOLE/SEEK_DATA} feature in @dfn{lseek} (See address@hidden). address@hidden technical (and, as of 2014, already somewhat historical, as +today's file systems usually support SEEK_HOLE/SEEK_DATA or other techniques for +hole detection) note: Programs like @command{dump} do not have to read the entire file; by examining the file system directly, they can determine in advance @@ -9612,7 +9622,6 @@ use an earlier format, you can select it using @table @option @opindex sparse-version @item address@hidden - Select the format to store sparse files in. Valid @var{version} values are: @samp{0.0}, @samp{0.1} and @samp{1.0}. @xref{Sparse Formats}, for a detailed description of each format. @@ -9620,6 +9629,37 @@ for a detailed description of each format. Using @option{--sparse-format} option implies @option{--sparse}. address@hidden @option address@hidden hole-detection address@hidden hole detection address@hidden address@hidden +Enforce a concrete hole detection method. Tar, before the real contents of a sparse +file are stored, needs to have good knowledge about file sparseness. This is +because it needs to have the file's map of holes stored into the tar header before +it starts archiving the file contents. Currently, two +methods for detection of holes are implemented: address@hidden @bullet address@hidden @option{--hole-detection=seek} +Seeking the file for data and holes. It uses an enhancement of the @dfn{lseek} +syscall (@dfn{SEEK_HOLE/SEEK_DATA}) which is able to reuse file system knowledge +about sparse file contents - so the detection is usually very fast. To use this +feature, your file system and operating system must support it. 
Even though +this lseek feature is not yet commited (just proposed) to be POSIX, it is fairly +widely supported on today's machines (year 2014). address@hidden @option{--hole-detection=raw} +Reading "roughly" byte-by-byte the whole sparse file before the archiving. This +method detects holes like consecutive stretches of zeroes. Comparing to +previous method, it is usually much slower though this method is the most +portable one. address@hidden itemize +When no @option{--hole-detection} method is chosen, tar uses the @var{seek} +method and if not successful it falls-back to (old and everywhere working) address@hidden method. This guarantees best-effort and still usually quick detection +of holes. By specifying concrete method, no other method will be used. + address@hidden table +Using @option{--hole-detection} option implies @option{--sparse}. + @node Attributes @section Handling File Attributes @cindex attributes, files diff --git a/src/common.h b/src/common.h index 72ad4c1..2988795 100644 --- a/src/common.h +++ b/src/common.h @@ -280,6 +280,12 @@ GLOBAL bool sparse_option; GLOBAL unsigned tar_sparse_major; GLOBAL unsigned tar_sparse_minor; +#define HOLE_DETECTION_RAW 0x01 +#define HOLE_DETECTION_SEEK 0x02 +#define HOLE_DETECTION_ALL 0xFF + +GLOBAL int hole_detection; + GLOBAL bool starting_file_option; /* Specified maximum byte length of each tape volume (multiple of 1024). */ diff --git a/src/sparse.c b/src/sparse.c index 6a97676..46370d1 100644 --- a/src/sparse.c +++ b/src/sparse.c @@ -208,9 +208,9 @@ sparse_add_map (struct tar_stat_info *st, struct sp_array const *sp) st->sparse_map_avail = avail + 1; } -/* Scan the sparse file and create its map */ +/* Scan the sparse file byte-by-byte and create its map. 
*/ static bool -sparse_scan_file (struct tar_sparse_file *file) +sparse_scan_file_raw (struct tar_sparse_file *file) { struct tar_stat_info *st = file->stat_info; int fd = file->fd; @@ -221,41 +221,38 @@ sparse_scan_file (struct tar_sparse_file *file) st->archive_file_size = 0; - if (ST_NBLOCKS (st->stat) == 0) - offset = st->stat.st_size; - else + if (!tar_sparse_scan (file, scan_begin, NULL)) + return false; + + while ((count = blocking_read (fd, buffer, sizeof buffer)) != 0 + && count != SAFE_READ_ERROR) { - if (!tar_sparse_scan (file, scan_begin, NULL)) - return false; + /* Analyze the block. */ + if (zero_block_p (buffer, count)) + { + if (sp.numbytes) + { + sparse_add_map (st, &sp); + sp.numbytes = 0; + if (!tar_sparse_scan (file, scan_block, NULL)) + return false; + } + } + else + { + if (sp.numbytes == 0) + sp.offset = offset; + sp.numbytes += count; + st->archive_file_size += count; + if (!tar_sparse_scan (file, scan_block, buffer)) + return false; + } - while ((count = blocking_read (fd, buffer, sizeof buffer)) != 0 - && count != SAFE_READ_ERROR) - { - /* Analyze the block. 
*/ - if (zero_block_p (buffer, count)) - { - if (sp.numbytes) - { - sparse_add_map (st, &sp); - sp.numbytes = 0; - if (!tar_sparse_scan (file, scan_block, NULL)) - return false; - } - } - else - { - if (sp.numbytes == 0) - sp.offset = offset; - sp.numbytes += count; - st->archive_file_size += count; - if (!tar_sparse_scan (file, scan_block, buffer)) - return false; - } - - offset += count; - } + offset += count; } + /* we wan't save one more sparse map of length 0 to signalize that + the file contents end with hole */ if (sp.numbytes == 0) sp.offset = offset; @@ -264,6 +261,116 @@ sparse_scan_file (struct tar_sparse_file *file) return tar_sparse_scan (file, scan_end, NULL); } +static bool +sparse_scan_file_wholesparse (struct tar_sparse_file *file) +{ + struct tar_stat_info *st = file->stat_info; + struct sp_array sp = {0, 0}; + + /* Note that this function is called only for truly sparse files of size >= 1 + block size (checked via ST_IS_SPARSE before). See the thread + http://www.mail-archive.com/address@hidden/msg04209.html for more info */ + if (ST_NBLOCKS (st->stat) == 0) + { + st->archive_file_size = 0; + sp.offset = st->stat.st_size; + sparse_add_map (st, &sp); + return true; + } + + return false; +} + +#ifdef SEEK_HOLE +/* Try to engage SEEK_HOLE/SEEK_DATA feature. 
*/ +static bool +sparse_scan_file_seek (struct tar_sparse_file *file) +{ + struct tar_stat_info *st = file->stat_info; + int fd = file->fd; + struct sp_array sp = {0, 0}; + off_t offset = 0; + off_t data_offset; + off_t hole_offset; + + st->archive_file_size = 0; + + for (;;) + { + /* locate first chunk of data */ + data_offset = lseek (fd, offset, SEEK_DATA); + + if (data_offset == (off_t)-1) + /* ENXIO == EOF; error otherwise */ + { + if (errno == ENXIO) + { + /* file ends with hole, add one more empty chunk of data */ + sp.numbytes = 0; + sp.offset = st->stat.st_size; + sparse_add_map (st, &sp); + return true; + } + return false; + } + + hole_offset = lseek (fd, data_offset, SEEK_HOLE); + + /* according to specs, if FS does not fully support SEEK_DATA/SEEK_HOLE it + may just implement kind of "wrapper" around classic lseek() call. We + must detect it here and try to use other hole-detection methods. */ + if (offset == 0 /* first loop */ + && data_offset == 0 + && hole_offset == st->stat.st_size) + { + lseek (fd, 0, SEEK_SET); + return false; + } + + sp.offset = data_offset; + sp.numbytes = hole_offset - data_offset; + sparse_add_map (st, &sp); + + st->archive_file_size += sp.numbytes; + offset = hole_offset; + } + + return true; +} +#endif + +static bool +sparse_scan_file (struct tar_sparse_file *file) +{ + /* always check for completely sparse files */ + if (sparse_scan_file_wholesparse (file)) + return true; + + if (hole_detection & HOLE_DETECTION_SEEK) +#ifdef SEEK_HOLE + { + if (sparse_scan_file_seek (file)) + return true; + } +#else + { + if (hole_detection != HOLE_DETECTION_ALL) + /* be verbose only if user explicitly request "seek", should we rather + fail? 
*/ + WARN((0, 0, + _("\"seek\" hole detection is not supported, using \"raw\"."))); + /* use "raw" also for other files */ + hole_detection = HOLE_DETECTION_RAW; + } +#endif + + if (hole_detection & HOLE_DETECTION_RAW + && sparse_scan_file_raw (file)) + return true; + + return false; +} + static struct tar_sparse_optab const oldgnu_optab; static struct tar_sparse_optab const star_optab; static struct tar_sparse_optab const pax_optab; diff --git a/src/tar.c b/src/tar.c index 21b0a3b..e7e0766 100644 --- a/src/tar.c +++ b/src/tar.c @@ -362,6 +362,7 @@ enum SHOW_TRANSFORMED_NAMES_OPTION, SKIP_OLD_FILES_OPTION, SORT_OPTION, + HOLE_DETECTION_OPTION, SPARSE_VERSION_OPTION, STRIP_COMPONENTS_OPTION, SUFFIX_OPTION, @@ -451,6 +452,8 @@ static struct argp_option options[] = { {"sparse", 'S', 0, 0, N_("handle sparse files efficiently"), GRID+1 }, + {"hole-detection", HOLE_DETECTION_OPTION, N_("TYPE"), 0, + N_("technique to detect holes"), GRID+1 }, {"sparse-version", SPARSE_VERSION_OPTION, N_("MAJOR[.MINOR]"), 0, N_("set version of the sparse format to use (implies --sparse)"), GRID+1}, {"incremental", 'G', 0, 0, @@ -1753,6 +1756,16 @@ parse_opt (int key, char *arg, struct argp_state *state) set_old_files_option (SKIP_OLD_FILES, args->loc); break; + case HOLE_DETECTION_OPTION: + sparse_option = 1; + if (!strcmp (arg, "raw")) + hole_detection = HOLE_DETECTION_RAW; + else if (!strcmp (arg, "seek")) + hole_detection = HOLE_DETECTION_SEEK; + else + USAGE_ERROR ((0, 0, _("'%s' is not a valid hole detection method"), arg)); + break; + case SPARSE_VERSION_OPTION: sparse_option = true; { @@ -2658,6 +2671,9 @@ decode_options (int argc, char **argv) | FORMAT_MASK (GNU_FORMAT) | FORMAT_MASK (POSIX_FORMAT)); + if (sparse_option && !hole_detection) + hole_detection = HOLE_DETECTION_ALL; + if (occurrence_option) { if (!args.input_files) diff --git a/tests/.gitignore b/tests/.gitignore index 9d836a8..2a721ba 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -9,3 +9,5 @@ argcv.h 
genfile.c genfile download +ttyemu +checkseekhole diff --git a/tests/Makefile.am b/tests/Makefile.am index f145929..0c6d60a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -204,6 +204,7 @@ TESTSUITE_AT = \ sparse03.at\ sparse04.at\ sparse05.at\ + sparse06.at\ sparsemv.at\ sparsemvp.at\ spmvp00.at\ @@ -272,13 +273,14 @@ installcheck-local: $(check_PROGRAMS) ## genfile ## ## ------------ ## -check_PROGRAMS = genfile +check_PROGRAMS = genfile checkseekhole if TAR_COND_GRANTPT check_PROGRAMS += ttyemu endif genfile_SOURCES = genfile.c argcv.c argcv.h +checkseekhole_SOURCES = checkseekhole.c ttyemu_SOURCES = ttyemu.c diff --git a/tests/checkseekhole.c b/tests/checkseekhole.c new file mode 100644 index 0000000..58d5030 --- /dev/null +++ b/tests/checkseekhole.c @@ -0,0 +1,90 @@ +/* Test suite for GNU tar - SEEK_HOLE detector. + + Copyright 2014 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program. If not, see . + + Description: detect whether it is possible to work with SEEK_HOLE on + particular operating system and file system. 
*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +enum { + EOK = 0, /* SEEK_HOLE support */ + EFAIL, /* test failed - no SEEK_HOLE support */ + EBAD, /* test is not relevant */ +}; + +int main() +{ +#ifdef SEEK_HOLE + char template[] = "testseekhole-XXXXXX"; + struct stat stat; + int fd = mkstemp (template); + int rv = EBAD; + off_t offset; + + if (fd == -1) + return EBAD; + + /* hole of 100MB */ + if ((off_t)-1 == lseek (fd, 100*1024*1024, SEEK_END)) + goto cleanup; + + /* piece of data */ + if (5 != write (fd, "data\n", 5)) + goto cleanup; + + /* another hole */ + if ((off_t)-1 == lseek (fd, 100*1024*1024, SEEK_END)) + goto cleanup; + + /* piece of data */ + if (5 != write (fd, "data\n", 5)) + goto cleanup; + + if (fstat (fd, &stat)) + goto cleanup; + + offset = lseek (fd, 0, SEEK_DATA); + if (offset == (off_t)-1) + { + rv = EFAIL; + goto cleanup; + } + + offset = lseek (fd, offset, SEEK_HOLE); + if (offset == (off_t)-1 || offset == stat.st_size) + { + rv = EFAIL; + goto cleanup; + } + + rv = EOK; + +cleanup: + close (fd); + unlink (template); + + return rv; +#else + return 1; +#endif +} diff --git a/tests/sparse02.at b/tests/sparse02.at index 1f04491..16bb31b 100644 --- a/tests/sparse02.at +++ b/tests/sparse02.at @@ -27,7 +27,7 @@ AT_KEYWORDS([sparse sparse02]) AT_TAR_CHECK([ genfile --sparse --file sparsefile --block-size 512 0 ABCD 1M EFGH 2000K IJKL || AT_SKIP_TEST -tar -c -f archive --sparse sparsefile || exit 1 +tar --hole-detection=raw -c -f archive --sparse sparsefile || exit 1 echo separator tar xfO archive | cat - > sparsecopy || exit 1 diff --git a/tests/sparse06.at b/tests/sparse06.at new file mode 100644 index 0000000..c0bcfc6 --- /dev/null +++ b/tests/sparse06.at @@ -0,0 +1,56 @@ +# Process this file with autom4te to create testsuite. -*- Autotest -*- +# +# Test suite for GNU tar. +# Copyright 2014 Free Software Foundation, Inc. + +# This file is part of GNU tar. 
+ +# GNU tar is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. + +# GNU tar is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +AT_SETUP([storing sparse file using seek method]) +AT_KEYWORDS([sparse sparse06]) + +m4_define([check_pattern],[ +rm -rf out archive.tar smallsparse && mkdir out +genfile --sparse --file smallsparse $1 +tar -cSf archive.tar smallsparse +tar -xf archive.tar -C out +diff smallsparse out/smallsparse +]) + +AT_TAR_CHECK([ +AT_SEEKHOLE_PREREQ +AT_TIMEOUT_PREREQ + +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=seek" +genfile --sparse --file bigsparse 0 ABC 8G DEF +timeout 2 tar -cSf a bigsparse +test $? 
-eq 0 || exit 1 + +check_pattern([0 ABC]) +check_pattern([0 ABC 10M]) +check_pattern([0 ABC 10M DEF]) + +check_pattern([10M]) +check_pattern([10M ABC]) +check_pattern([10M ABC 20M]) + +check_pattern([10M DEF 20M GHI 30M JKL 40M]) + +], +[0],, +[genfile: created file is not sparse +],,,[posix]) + +AT_CLEANUP diff --git a/tests/sparsemv.at b/tests/sparsemv.at index 958f04e..47e71e8 100644 --- a/tests/sparsemv.at +++ b/tests/sparsemv.at @@ -30,6 +30,7 @@ AT_KEYWORDS([sparse multiv sparsemv]) AT_TAR_CHECK([ exec <&- +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=raw" genfile --sparse --file sparsefile 0 ABCDEFGHIJK 1M ABCDEFGHI || AT_SKIP_TEST echo "Pass 1: Split between data blocks" echo "Create archive" diff --git a/tests/sparsemvp.at b/tests/sparsemvp.at index 29ee224..0fd6e33 100644 --- a/tests/sparsemvp.at +++ b/tests/sparsemvp.at @@ -26,6 +26,7 @@ dnl TAR_MVP_TEST version map1 map2 m4_define([TAR_MVP_TEST],[ AT_TAR_CHECK([ exec <&- +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=raw" genfile --sparse --file sparsefile $2 || AT_SKIP_TEST echo "Pass 1: Split between data blocks" echo "Create archive" diff --git a/tests/testsuite.at b/tests/testsuite.at index 4b1c805..d498a83 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -112,6 +112,21 @@ rm -f $[]$ test $result -eq 0 || AT_SKIP_TEST ]) +dnl AT_SEEKHOLE_PREREQ +m4_define([AT_SEEKHOLE_PREREQ],[ +checkseekhole +rv=$? +test $rv -ne 0 && exit $rv +]) + +m4_define([AT_TIMEOUT_PREREQ],[ +timeout 100 true +if test $? -ne 0; then + echo >&2 "the 'timeout' utility not found" + AT_SKIP_TEST +fi +]) + m4_define([AT_TAR_MKHIER],[ install-sh -d $1 >/dev/null dnl m4_if([$2],,,&& genfile --file [$1]/[$2]) || AT_SKIP_TEST]) @@ -357,6 +372,7 @@ m4_include([sparse02.at]) m4_include([sparse03.at]) m4_include([sparse04.at]) m4_include([sparse05.at]) +m4_include([sparse06.at]) m4_include([sparsemv.at]) m4_include([spmvp00.at]) m4_include([spmvp01.at]) -- 2.5.0