>From 0563cfd261f6c47f25924ae6fef542230fdf2794 Mon Sep 17 00:00:00 2001 From: Pavel Raiskup Date: Sun, 23 Feb 2014 13:12:54 +0100 Subject: [PATCH] tar: use SEEK_HOLE for hole detection Reuse the SEEK_HOLE/SEEK_DATA feature of lseek when possible. This makes the sparse file archivation to be quite faster as tar does not need to perform additional read of whole file in order to detect file sparse map. This lseek feature is not yet fully POSIX but it is fairly widely implemented these days. Also implement --hole-detection option for proper method selection. * src/common.h (HOLE_DETECTION_RAW, HOLE_DETECTION_SEEK) (HOLE_DETECTION_ALL, hole_detection): New constants and variable. * src/sparse.c (sparse_scan_file_wholesparse): New function as a method for detecting sparse files without any data. (sparse_scan_file_raw): Renamed from sparse_scan_file, removed the completely-sparse detection if-branch. (sparse_scan_file_seek): Implements method for hole detection using lseek. (sparse_scan_file): Reimplemented function as a wrapper for all methods. * src/tar.c (HOLE_DETECTION_OPTION): New option. (parse_opt): Handle new --hole-detection option. * tests/sparse02.at: Use --hole-detection=raw as the seek method creates little bit bigger archives causing test to fail. * tests/checkseekhole.c: SEEK_HOLE detection helper. * tests/sparsemv.at: Likewise. * tests/sparsemvp.at: Likewise. * tests/sparse05.at: New test-case. * tests/testsuite.at: Cover new testcase. * tests/Makefile.am: Likewise. * doc/tar.1: Document. * doc/tar.texi: Likewise. 
--- doc/tar.1 | 6 ++ doc/tar.texi | 70 +++++++++++++++----- src/common.h | 6 ++ src/sparse.c | 173 ++++++++++++++++++++++++++++++++++++++++---------- src/tar.c | 16 +++++ tests/Makefile.am | 5 +- tests/checkseekhole.c | 90 ++++++++++++++++++++++++++ tests/sparse02.at | 2 +- tests/sparse05.at | 56 ++++++++++++++++ tests/sparsemv.at | 1 + tests/sparsemvp.at | 1 + tests/testsuite.at | 16 +++++ 12 files changed, 392 insertions(+), 50 deletions(-) create mode 100644 tests/checkseekhole.c create mode 100644 tests/sparse05.at diff --git a/doc/tar.1 b/doc/tar.1 index b33f55b..d2898b6 100644 --- a/doc/tar.1 +++ b/doc/tar.1 @@ -259,6 +259,12 @@ When listing or extracting, the actual contents of \fIFILE\fR is not inspected, it is needed only due to syntactical requirements. It is therefore common practice to use \fB/dev/null\fR in its place. .TP +\fB\-\-hole\-detection\fR=\fIMETHOD\fR +Use method to detect holes in sparse files. This option implies +\fB\-\-sparse\fR. Currently there are \fIseek\fR and \fIraw\fR methods +implemented. Default is \fIseek\fR with fallback to \fIraw\fR when not +applicable. +.TP \fB\-G\fR, \fB\-\-incremental\fR Handle old GNU-format incremental backups. .TP diff --git a/doc/tar.texi b/doc/tar.texi index 9bb5a83..0e894f9 100644 --- a/doc/tar.texi +++ b/doc/tar.texi @@ -2748,6 +2748,13 @@ they refer to, instead of creating usual hard link members. @command{tar} will print out a short message summarizing the operations and options to @command{tar} and exit. @xref{help}. address@hidden address@hidden address@hidden +Use method to detect holes in sparse files. This option implies address@hidden Currently there are @var{seek} and @var{raw} methods +implemented. Default is @var{seek} with fallback to @var{raw} when not +applicable. @xref{sparse}. + @opsummary{ignore-case} @item --ignore-case Ignore case when matching member or file names with @@ -9295,13 +9302,14 @@ could create an archive longer than the original. 
To have @command{tar} attempt to recognize the holes in a file, use @option{--sparse} (@option{-S}). When you use this option, then, for any file using less disk space than would be expected from its length, @command{tar} -searches the file for consecutive stretches of zeros. It then records -in the archive for the file where the consecutive stretches of zeros -are, and only archives the ``real contents'' of the file. On -extraction (using @option{--sparse} is not needed on extraction) any -such files have holes created wherever the continuous stretches of zeros -were found. Thus, if you use @option{--sparse}, @command{tar} archives -won't take more space than the original. +searches the file for holes. It then records in the archive for the file where +the holes (consecutive stretches of zeros) are, and only archives the +``real contents'' of the file. On extraction (using @option{--sparse} is not +needed on extraction) any such files also have holes created wherever the holes +were found. Thus, if you use @option{--sparse}, @command{tar} archives won't +take more space than the original. +GNU tar uses different methods for detecting holes; for more information look +at @option{--hole-detection}. @table @option @opindex sparse @@ -9327,13 +9335,15 @@ will never take more space on the media than the files take on disk (otherwise, archiving a disk filled with sparse files might take hundreds of tapes). @xref{Incremental Dumps}. -However, be aware that @option{--sparse} option presents a serious -drawback. Namely, in order to determine if the file is sparse address@hidden has to read it before trying to archive it, so in total -the file is read @strong{twice}. So, always bear in mind that the -time needed to process all files with this option is roughly twice -the time needed to archive them without it. address@hidden technical note: +However, be aware that @option{--sparse} option may present a serious +drawback. 
Namely, in order to determine the positions of holes in a file address@hidden may have to read it before trying to archive it, so in total +the file may be read @strong{twice}. This may happen when your OS or your FS +does not support the @dfn{SEEK_HOLE/SEEK_DATA} feature in @dfn{lseek} (See address@hidden). address@hidden technical (and, as of 2014, already somewhat historical, as +today's file systems usually support SEEK_HOLE/SEEK_DATA or other techniques for +hole detection) note: Programs like @command{dump} do not have to read the entire file; by examining the file system directly, they can determine in advance @@ -9371,7 +9381,6 @@ use an earlier format, you can select it using @table @option @opindex sparse-version @item address@hidden - Select the format to store sparse files in. Valid @var{version} values are: @samp{0.0}, @samp{0.1} and @samp{1.0}. @xref{Sparse Formats}, for a detailed description of each format. @@ -9379,6 +9388,37 @@ for a detailed description of each format. Using @option{--sparse-format} option implies @option{--sparse}. address@hidden @option address@hidden hole-detection address@hidden hole detection address@hidden address@hidden +Enforce a concrete hole detection method. Tar, before the real contents of a sparse +file are stored, needs to have good knowledge about file sparseness. This is +because it needs to have the file's map of holes stored in the tar header before +it starts archiving the file contents. Currently, two methods for +detection of holes are implemented: address@hidden @bullet address@hidden @option{--hole-detection=seek} +Seeking the file for data and holes. It uses an enhancement of the @dfn{lseek} +syscall (@dfn{SEEK_HOLE/SEEK_DATA}) which is able to reuse file system knowledge +about sparse file contents - so the detection is usually very fast. To use this +feature, your file system and operating system must support it. 
Even though +this lseek feature is not yet committed (just proposed) to POSIX, it is fairly +widely supported on today's machines (year 2014). address@hidden @option{--hole-detection=raw} +Reading "roughly" byte-by-byte the whole sparse file before archiving. This +method detects holes as consecutive stretches of zeroes. Compared to the +previous method, it is usually much slower, though this method is the most +portable one. address@hidden itemize +When no @option{--hole-detection} method is chosen, tar uses the @var{seek} +method and, if not successful, it falls back to the (old and everywhere working) address@hidden method. This guarantees best-effort and still usually quick detection +of holes. When a concrete method is specified, no other method will be used. + address@hidden table +Using the @option{--hole-detection} option implies @option{--sparse}. + @node Attributes @section Handling File Attributes @cindex atrributes, files diff --git a/src/common.h b/src/common.h index 4d2c399..459cf55 100644 --- a/src/common.h +++ b/src/common.h @@ -280,6 +280,12 @@ GLOBAL bool sparse_option; GLOBAL unsigned tar_sparse_major; GLOBAL unsigned tar_sparse_minor; +#define HOLE_DETECTION_RAW 0x01 +#define HOLE_DETECTION_SEEK 0x02 +#define HOLE_DETECTION_ALL 0xFF + +GLOBAL int hole_detection; + GLOBAL bool starting_file_option; /* Specified maximum byte length of each tape volume (multiple of 1024). */ diff --git a/src/sparse.c b/src/sparse.c index 6a97676..46370d1 100644 --- a/src/sparse.c +++ b/src/sparse.c @@ -208,9 +208,9 @@ sparse_add_map (struct tar_stat_info *st, struct sp_array const *sp) st->sparse_map_avail = avail + 1; } -/* Scan the sparse file and create its map */ +/* Scan the sparse file byte-by-byte and create its map. 
*/ static bool -sparse_scan_file (struct tar_sparse_file *file) +sparse_scan_file_raw (struct tar_sparse_file *file) { struct tar_stat_info *st = file->stat_info; int fd = file->fd; @@ -221,41 +221,38 @@ sparse_scan_file (struct tar_sparse_file *file) st->archive_file_size = 0; - if (ST_NBLOCKS (st->stat) == 0) - offset = st->stat.st_size; - else + if (!tar_sparse_scan (file, scan_begin, NULL)) + return false; + + while ((count = blocking_read (fd, buffer, sizeof buffer)) != 0 + && count != SAFE_READ_ERROR) { - if (!tar_sparse_scan (file, scan_begin, NULL)) - return false; + /* Analyze the block. */ + if (zero_block_p (buffer, count)) + { + if (sp.numbytes) + { + sparse_add_map (st, &sp); + sp.numbytes = 0; + if (!tar_sparse_scan (file, scan_block, NULL)) + return false; + } + } + else + { + if (sp.numbytes == 0) + sp.offset = offset; + sp.numbytes += count; + st->archive_file_size += count; + if (!tar_sparse_scan (file, scan_block, buffer)) + return false; + } - while ((count = blocking_read (fd, buffer, sizeof buffer)) != 0 - && count != SAFE_READ_ERROR) - { - /* Analyze the block. 
*/ - if (zero_block_p (buffer, count)) - { - if (sp.numbytes) - { - sparse_add_map (st, &sp); - sp.numbytes = 0; - if (!tar_sparse_scan (file, scan_block, NULL)) - return false; - } - } - else - { - if (sp.numbytes == 0) - sp.offset = offset; - sp.numbytes += count; - st->archive_file_size += count; - if (!tar_sparse_scan (file, scan_block, buffer)) - return false; - } - - offset += count; - } + offset += count; } + /* we wan't save one more sparse map of length 0 to signalize that + the file contents end with hole */ if (sp.numbytes == 0) sp.offset = offset; @@ -264,6 +261,116 @@ sparse_scan_file (struct tar_sparse_file *file) return tar_sparse_scan (file, scan_end, NULL); } +static bool +sparse_scan_file_wholesparse (struct tar_sparse_file *file) +{ + struct tar_stat_info *st = file->stat_info; + struct sp_array sp = {0, 0}; + + /* Note that this function is called only for truly sparse files of size >= 1 + block size (checked via ST_IS_SPARSE before). See the thread + http://www.mail-archive.com/address@hidden/msg04209.html for more info */ + if (ST_NBLOCKS (st->stat) == 0) + { + st->archive_file_size = 0; + sp.offset = st->stat.st_size; + sparse_add_map (st, &sp); + return true; + } + + return false; +} + +#ifdef SEEK_HOLE +/* Try to engage SEEK_HOLE/SEEK_DATA feature. 
*/ +static bool +sparse_scan_file_seek (struct tar_sparse_file *file) +{ + struct tar_stat_info *st = file->stat_info; + int fd = file->fd; + struct sp_array sp = {0, 0}; + off_t offset = 0; + off_t data_offset; + off_t hole_offset; + + st->archive_file_size = 0; + + for (;;) + { + /* locate first chunk of data */ + data_offset = lseek (fd, offset, SEEK_DATA); + + if (data_offset == (off_t)-1) + /* ENXIO == EOF; error otherwise */ + { + if (errno == ENXIO) + { + /* file ends with hole, add one more empty chunk of data */ + sp.numbytes = 0; + sp.offset = st->stat.st_size; + sparse_add_map (st, &sp); + return true; + } + return false; + } + + hole_offset = lseek (fd, data_offset, SEEK_HOLE); + + /* according to specs, if FS does not fully support SEEK_DATA/SEEK_HOLE it + may just implement kind of "wrapper" around classic lseek() call. We + must detect it here and try to use other hole-detection methods. */ + if (offset == 0 /* first loop */ + && data_offset == 0 + && hole_offset == st->stat.st_size) + { + lseek (fd, 0, SEEK_SET); + return false; + } + + sp.offset = data_offset; + sp.numbytes = hole_offset - data_offset; + sparse_add_map (st, &sp); + + st->archive_file_size += sp.numbytes; + offset = hole_offset; + } + + return true; +} +#endif + +static bool +sparse_scan_file (struct tar_sparse_file *file) +{ + /* always check for completely sparse files */ + if (sparse_scan_file_wholesparse (file)) + return true; + + if (hole_detection & HOLE_DETECTION_SEEK) +#ifdef SEEK_HOLE + { + if (sparse_scan_file_seek (file)) + return true; + } +#else + { + if (hole_detection != HOLE_DETECTION_ALL) + /* be verbose only if user explicitly request "seek", should we rather + fail? 
*/ + WARN((0, 0, + _("\"seek\" hole detection is not supported, using \"raw\"."))); + /* use "raw" also for other files */ + hole_detection = HOLE_DETECTION_RAW; + } +#endif + + if (hole_detection & HOLE_DETECTION_RAW + && sparse_scan_file_raw (file)) + return true; + + return false; +} + static struct tar_sparse_optab const oldgnu_optab; static struct tar_sparse_optab const star_optab; static struct tar_sparse_optab const pax_optab; diff --git a/src/tar.c b/src/tar.c index 08f334f..42a0dee 100644 --- a/src/tar.c +++ b/src/tar.c @@ -352,6 +352,7 @@ enum SHOW_TRANSFORMED_NAMES_OPTION, SKIP_OLD_FILES_OPTION, SORT_OPTION, + HOLE_DETECTION_OPTION, SPARSE_VERSION_OPTION, STRIP_COMPONENTS_OPTION, SUFFIX_OPTION, @@ -442,6 +443,8 @@ static struct argp_option options[] = { {"sparse", 'S', 0, 0, N_("handle sparse files efficiently"), GRID+1 }, + {"hole-detection", HOLE_DETECTION_OPTION, N_("TYPE"), 0, + N_("technique to detect holes"), GRID+1 }, {"sparse-version", SPARSE_VERSION_OPTION, N_("MAJOR[.MINOR]"), 0, N_("set version of the sparse format to use (implies --sparse)"), GRID+1}, {"incremental", 'G', 0, 0, @@ -1604,6 +1607,16 @@ parse_opt (int key, char *arg, struct argp_state *state) old_files_option = SKIP_OLD_FILES; break; + case HOLE_DETECTION_OPTION: + sparse_option = 1; + if (!strcmp (arg, "raw")) + hole_detection = HOLE_DETECTION_RAW; + else if (!strcmp (arg, "seek")) + hole_detection = HOLE_DETECTION_SEEK; + else + USAGE_ERROR ((0, 0, _("'%s' is not a valid hole detection method"), arg)); + break; + case SPARSE_VERSION_OPTION: sparse_option = true; { @@ -2438,6 +2451,9 @@ decode_options (int argc, char **argv) | FORMAT_MASK (GNU_FORMAT) | FORMAT_MASK (POSIX_FORMAT)); + if (sparse_option && !hole_detection) + hole_detection = HOLE_DETECTION_ALL; + if (occurrence_option) { if (!args.input_files) diff --git a/tests/Makefile.am b/tests/Makefile.am index 76954eb..8b1c4ae 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -195,6 +195,7 @@ TESTSUITE_AT = \ 
sparse02.at\ sparse03.at\ sparse04.at\ + sparse05.at\ sparsemv.at\ sparsemvp.at\ spmvp00.at\ @@ -260,10 +261,12 @@ installcheck-local: ## genfile ## ## ------------ ## -check_PROGRAMS = genfile +check_PROGRAMS = genfile checkseekhole genfile_SOURCES = genfile.c argcv.c argcv.h +checkseekhole_SOURCES = checkseekhole.c + localedir = $(datadir)/locale AM_CPPFLAGS = \ -I$(top_srcdir)/gnu\ diff --git a/tests/checkseekhole.c b/tests/checkseekhole.c new file mode 100644 index 0000000..58d5030 --- /dev/null +++ b/tests/checkseekhole.c @@ -0,0 +1,90 @@ +/* Test suite for GNU tar - SEEK_HOLE detector. + + Copyright 2014 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program. If not, see . + + Description: detect whether it is possible to work with SEEK_HOLE on + particular operating system and file system. 
*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +enum { + EOK = 0, /* SEEK_HOLE support */ + EFAIL, /* test failed - no SEEK_HOLE support */ + EBAD, /* test is not relevant */ +}; + +int main() +{ +#ifdef SEEK_HOLE + char template[] = "testseekhole-XXXXXX"; + struct stat stat; + int fd = mkstemp (template); + int rv = EBAD; + off_t offset; + + if (fd == -1) + return EBAD; + + /* hole of 100MB */ + if ((off_t)-1 == lseek (fd, 100*1024*1024, SEEK_END)) + goto cleanup; + + /* piece of data */ + if (5 != write (fd, "data\n", 5)) + goto cleanup; + + /* another hole */ + if ((off_t)-1 == lseek (fd, 100*1024*1024, SEEK_END)) + goto cleanup; + + /* piece of data */ + if (5 != write (fd, "data\n", 5)) + goto cleanup; + + if (fstat (fd, &stat)) + goto cleanup; + + offset = lseek (fd, 0, SEEK_DATA); + if (offset == (off_t)-1) + { + rv = EFAIL; + goto cleanup; + } + + offset = lseek (fd, offset, SEEK_HOLE); + if (offset == (off_t)-1 || offset == stat.st_size) + { + rv = EFAIL; + goto cleanup; + } + + rv = EOK; + +cleanup: + close (fd); + unlink (template); + + return rv; +#else + return 1; +#endif +} diff --git a/tests/sparse02.at b/tests/sparse02.at index 1f04491..16bb31b 100644 --- a/tests/sparse02.at +++ b/tests/sparse02.at @@ -27,7 +27,7 @@ AT_KEYWORDS([sparse sparse02]) AT_TAR_CHECK([ genfile --sparse --file sparsefile --block-size 512 0 ABCD 1M EFGH 2000K IJKL || AT_SKIP_TEST -tar -c -f archive --sparse sparsefile || exit 1 +tar --hole-detection=raw -c -f archive --sparse sparsefile || exit 1 echo separator tar xfO archive | cat - > sparsecopy || exit 1 diff --git a/tests/sparse05.at b/tests/sparse05.at new file mode 100644 index 0000000..587fdd7 --- /dev/null +++ b/tests/sparse05.at @@ -0,0 +1,56 @@ +# Process this file with autom4te to create testsuite. -*- Autotest -*- +# +# Test suite for GNU tar. +# Copyright 2014 Free Software Foundation, Inc. + +# This file is part of GNU tar. 
+ +# GNU tar is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. + +# GNU tar is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +AT_SETUP([storing sparse file using seek method]) +AT_KEYWORDS([sparse sparse05]) + +m4_define([check_pattern],[ +rm -rf out archive.tar smallsparse && mkdir out +genfile --sparse --file smallsparse $1 +tar -cSf archive.tar smallsparse +tar -xf archive.tar -C out +diff smallsparse out/smallsparse +]) + +AT_TAR_CHECK([ +AT_SEEKHOLE_PREREQ +AT_TIMEOUT_PREREQ + +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=seek" +genfile --sparse --file bigsparse 0 ABC 8G DEF +timeout 2 tar -cSf a bigsparse +test $? 
-eq 0 || exit 1 + +check_pattern([0 ABC]) +check_pattern([0 ABC 10M]) +check_pattern([0 ABC 10M DEF]) + +check_pattern([10M]) +check_pattern([10M ABC]) +check_pattern([10M ABC 20M]) + +check_pattern([10M DEF 20M GHI 30M JKL 40M]) + +], +[0],, +[genfile: created file is not sparse +],,,[posix]) + +AT_CLEANUP diff --git a/tests/sparsemv.at b/tests/sparsemv.at index 958f04e..47e71e8 100644 --- a/tests/sparsemv.at +++ b/tests/sparsemv.at @@ -30,6 +30,7 @@ AT_KEYWORDS([sparse multiv sparsemv]) AT_TAR_CHECK([ exec <&- +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=raw" genfile --sparse --file sparsefile 0 ABCDEFGHIJK 1M ABCDEFGHI || AT_SKIP_TEST echo "Pass 1: Split between data blocks" echo "Create archive" diff --git a/tests/sparsemvp.at b/tests/sparsemvp.at index 29ee224..0fd6e33 100644 --- a/tests/sparsemvp.at +++ b/tests/sparsemvp.at @@ -26,6 +26,7 @@ dnl TAR_MVP_TEST version map1 map2 m4_define([TAR_MVP_TEST],[ AT_TAR_CHECK([ exec <&- +TAR_OPTIONS="$TAR_OPTIONS --hole-detection=raw" genfile --sparse --file sparsefile $2 || AT_SKIP_TEST echo "Pass 1: Split between data blocks" echo "Create archive" diff --git a/tests/testsuite.at b/tests/testsuite.at index ac0119e..09e69d5 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -112,6 +112,21 @@ rm -f $[]$ test $result -eq 0 || AT_SKIP_TEST ]) +dnl AT_SEEKHOLE_PREREQ +m4_define([AT_SEEKHOLE_PREREQ],[ +checkseekhole +rv=$? +test $rv -ne 0 && exit $rv +]) + +m4_define([AT_TIMEOUT_PREREQ],[ +timeout 100 true +if test $? -ne 0; then + echo >&2 "the 'timeout' utility not found" + AT_SKIP_TEST +fi +]) + m4_define([AT_TAR_MKHIER],[ install-sh -d $1 >/dev/null dnl m4_if([$2],,,&& genfile --file [$1]/[$2]) || AT_SKIP_TEST]) @@ -349,6 +364,7 @@ m4_include([sparse01.at]) m4_include([sparse02.at]) m4_include([sparse03.at]) m4_include([sparse04.at]) +m4_include([sparse05.at]) m4_include([sparsemv.at]) m4_include([spmvp00.at]) m4_include([spmvp01.at]) -- 1.8.5.3