[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] grep: sparse files are now considered binary
From: |
Paul Eggert |
Subject: |
[PATCH] grep: sparse files are now considered binary |
Date: |
Sun, 11 Mar 2012 23:27:08 -0700 |
User-agent: |
Mozilla/5.0 (X11; Linux i686; rv:10.0.2) Gecko/20120216 Thunderbird/10.0.2 |
Here's a proposed patch to greatly speed up 'grep' in some cases
when it's dealing with sparse files that happen to start
with a block of NUL-free data.
>From cadc29e2b8fed7ae807fc451cac821798d0bc4c8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Sun, 11 Mar 2012 22:41:01 -0700
Subject: [PATCH] grep: sparse files are now considered binary
* NEWS: Document this.
* doc/grep.texi (File and Directory Selection): Likewise.
* bootstrap.conf (gnulib_modules): Add stat-size.
* src/main.c: Include stat-size.h.
(file_is_binary): New function, which looks for holes too.
(grep): Use it.
* tests/Makefile.am (TESTS): Add big-hole.
* tests/big-hole: New file.
---
NEWS | 4 +++
bootstrap.conf | 1 +
doc/grep.texi | 7 +++--
src/main.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
tests/Makefile.am | 1 +
tests/big-hole | 25 +++++++++++++++++++
6 files changed, 101 insertions(+), 4 deletions(-)
create mode 100755 tests/big-hole
diff --git a/NEWS b/NEWS
index d0a63d5..8544287 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** New features
+
+ 'grep' without -z now treats a sparse file as binary, if it can
+ easily determine that the file is sparse.
* Noteworthy changes in release 2.11 (2012-03-02) [stable]
diff --git a/bootstrap.conf b/bootstrap.conf
index 45bb33d..b5634cd 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -67,6 +67,7 @@ realloc-gnu
regex
same-inode
ssize_t
+stat-size
stddef
stdlib
stpcpy
diff --git a/doc/grep.texi b/doc/grep.texi
index c014d8f..8189a94 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option.
@item --binary-files=@var{type}
@opindex --binary-files
@cindex binary files
-If the first few bytes of a file indicate that the file contains binary data,
+If a file's allocation metadata or its first few bytes
+indicate that the file contains binary data,
assume that the file is of type @var{type}.
By default, @var{type} is @samp{binary},
and @command{grep} normally outputs either
@@ -703,8 +704,8 @@ better performance.
@cindex binary files, MS-DOS/MS-Windows
Treat the file(s) as binary.
By default, under MS-DOS and MS-Windows,
-@command{grep} guesses the file type
-by looking at the contents of the first 32kB read from the file.
+@command{grep} guesses whether a file is text or binary
+as described for the @option{--binary-files} option.
If @command{grep} decides the file is a text file,
it strips the @code{CR} characters from the original file contents
(to make regular expressions with @code{^} and @code{$} work correctly).
diff --git a/src/main.c b/src/main.c
index 2f6c761..cc6427d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,6 +44,7 @@
#include "propername.h"
#include "quote.h"
#include "savedir.h"
+#include "stat-size.h"
#include "version-etc.h"
#include "xalloc.h"
#include "xstrtol.h"
@@ -426,6 +427,70 @@ clean_up_stdout (void)
close_stdout ();
}
+/* Return 1 if a file is known to be binary for the purpose of 'grep'.
+ BUF, of size BUFSIZE, is the initial buffer read from the file with
+ descriptor FD and status ST. */
+static int
+file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+{
+ #ifndef HAVE_STRUCT_STAT_ST_BLOCKS
+ enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 };
+ #endif
+ #ifndef SEEK_HOLE
+ enum { SEEK_HOLE = SEEK_END };
+ #endif
+
+ /* If -z, test only whether the initial buffer contains '\200';
+ knowing about holes won't help. */
+ if (! eolbyte)
+ return memchr (buf, '\200', bufsize) != 0;
+
+ /* If the initial buffer contains a null byte, guess that the file
+ is binary. */
+ if (memchr (buf, '\0', bufsize))
+ return 1;
+
+ /* If the file has holes, it must contain a null byte somewhere. */
+ if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END)
+ && S_ISREG (st->st_mode))
+ {
+ off_t cur = bufsize;
+ if (O_BINARY || fd == STDIN_FILENO)
+ {
+ cur = lseek (fd, 0, SEEK_CUR);
+ if (cur < 0)
+ return 0;
+ }
+
+ /* If the file has fewer blocks than would be needed to
+ represent its data, then it must have at least one hole. */
+ if (HAVE_STRUCT_STAT_ST_BLOCKS)
+ {
+ off_t nonzeros_needed = st->st_size - cur + bufsize;
+ off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE;
+ int partial_block = nonzeros_needed % ST_NBLOCKSIZE != 0;
+ if (ST_NBLOCKS (*st) < full_blocks + partial_block)
+ return 1;
+ }
+
+ /* Look for a hole after the current location. */
+ if (SEEK_HOLE != SEEK_END)
+ {
+ off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+ if (0 <= hole_start)
+ {
+ if (lseek (fd, cur, SEEK_SET) < 0)
+ suppressible_error (filename, errno);
+ if (hole_start < st->st_size)
+ return 1;
+ }
+ }
+ }
+
+ /* Guess that the file does not contain binary data. */
+ return 0;
+}
+
/* Convert STR to a nonnegative integer, storing the result in *OUT.
STR must be a valid context length argument; report an error if it
isn't. Silently ceiling *OUT at the maximum value, as that is
@@ -1127,7 +1192,7 @@ grep (int fd, char const *file, struct stats *stats)
not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
|| binary_files == WITHOUT_MATCH_BINARY_FILES)
- && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
+ && file_is_binary (bufbeg, buflim - bufbeg, fd, &stats->stat));
if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
return 0;
done_on_match += not_text;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c2cd2f7..0715fda 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -38,6 +38,7 @@ TESTS = \
backref \
backref-multibyte-slow \
backref-word \
+ big-hole \
big-match \
bogus-wctob \
bre \
diff --git a/tests/big-hole b/tests/big-hole
new file mode 100755
index 0000000..ccc6bf5
--- /dev/null
+++ b/tests/big-hole
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Check that grep --binary-files=without-match quickly skips files with holes.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+expensive_
+
+# Create a file that starts with at least a buffer's worth of text,
+# but has a big hole later.
+ten='1 2 3 4 5 6 7 8 9 10'
+x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+(for i in $ten; do
+ for j in $ten; do
+ for k in $ten; do
+ echo $x
+ done
+ done
+ done
+ echo x | dd bs=1024k seek=8000000
+) >8T-or-so || skip_ 'cannot create big sparse file'
+
+grep --binary-file=without-match x 8T-or-so >/dev/null
+test $? -eq 1 || fail=1
+
+Exit $fail
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [PATCH] grep: sparse files are now considered binary,
Paul Eggert <=