--- Begin Message ---
Subject: |
[PATCH] du: tune, and fix some -L bugs with dangling or cyclic symlinks |
Date: |
Thu, 08 Jul 2010 11:25:56 -0700 |
User-agent: |
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.10) Gecko/20100527 Thunderbird/3.0.5 |
I noticed some performance problems in du in some weird cases: when
running commands like "du X X" it descended through X twice, even
though it output the information only once (and obviously needs to
descend only once). While looking into the matter I found some weird
code in du. For example:
if (ent->fts_info == FTS_D || skip)
return ok;
/* If the file is being excluded or if it has already been counted
via a hard link, then don't let it contribute to the sums. */
if (skip
|| (!opt_count_all
&& (hash_all || (! S_ISDIR (sb->st_mode) && 1 < sb->st_nlink))
&& ! hash_ins (sb->st_ino, sb->st_dev)))
...
Obviously the second use of "skip" is entirely redundant, since when
"skip" is true the first "if" always returns. And the "skip" isn't
done quite right, as errors can be reported even for skipped files.
And while fixing _this_, I discovered that "du -L" sometimes screws
up, in that it doesn't report dangling symlinks, or it mistakenly
counts the disk space for a symlink instead of for the pointed-to
file, or it omits a directory entirely. Most of these bugs with "du
-L" have been present for some time, but one (the omission) is
something I introduced with my previous patch.
Fixing all this required rethinking how du's process_file function
works. I can't easily untangle the bug fixes from the performance
improvements, so I'm taking the liberty of shipping out one patch for
both.
I must say that the code in fts.c is surely the worst in coreutils!
It's very hard to figure out under what cases FTS_DC can be returned
(is it always after FTS_D, or can it occur without a FTS_D, for
example), and similarly for FTS_DNR and FTS_ERR. Perhaps at some
point we can find a cheerful contributor who can clean up the stables
in the fts code; or at least document it. Anyway, I eventually gave
up, and wrote du.c so as to not assume properties of fts when I
couldn't be sure of them.
>From 3e68ecc62ea435459f99bd8feff883f88f9a3823 Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Thu, 8 Jul 2010 10:34:40 -0700
Subject: [PATCH] du: tune, and fix some -L bugs with dangling or cyclic symlinks
* src/du.c (process_file): Avoid recalculation of hashes
and of file-exclusion for directories. Do not descend into
the same directory more than once, unless -l is given; this is faster.
Calculate stat buffer lazily, since it
need not be computed at all for excluded files.
Count space if FTS_ERR, since stat buffer is always valid then.
No need for 'print' local variable.
(main): Use FTS_NOSTAT. Use FTS_TIGHT_CYCLE_CHECK only when not
hashing everything, since process_file finds cycles on its own
when hashing everything.
* tests/du/deref: Add test cases for -L bugs.
---
src/du.c | 144 +++++++++++++++++++++++++++++---------------------------
tests/du/deref | 16 ++++++
2 files changed, 91 insertions(+), 69 deletions(-)
diff --git a/src/du.c b/src/du.c
index 739be73..d8c7e00 100644
--- a/src/du.c
+++ b/src/du.c
@@ -392,7 +392,7 @@ print_size (const struct duinfo *pdui, const char *string)
static bool
process_file (FTS *fts, FTSENT *ent)
{
- bool ok;
+ bool ok = true;
struct duinfo dui;
struct duinfo dui_to_print;
size_t level;
@@ -407,79 +407,86 @@ process_file (FTS *fts, FTSENT *ent)
The sum of the sizes of all entries in the hierarchy at or below the
directory at the specified level. */
static struct dulevel *dulvl;
- bool print = true;
const char *file = ent->fts_path;
const struct stat *sb = ent->fts_statp;
- bool skip;
-
- /* If necessary, set FTS_SKIP before returning. */
- skip = excluded_file_name (exclude, file);
- if (skip)
- fts_set (fts, ent, FTS_SKIP);
+ int info = ent->fts_info;
- switch (ent->fts_info)
+ if (info == FTS_DNR)
{
- case FTS_NS:
- error (0, ent->fts_errno, _("cannot access %s"), quote (file));
- return false;
-
- case FTS_ERR:
- /* if (S_ISDIR (ent->fts_statp->st_mode) && FIXME */
- error (0, ent->fts_errno, _("%s"), quote (file));
- return false;
-
- case FTS_DNR:
- /* Don't return just yet, since although the directory is not readable,
- we were able to stat it, so we do have a size. */
+ /* An error occurred, but the size is known, so count it. */
error (0, ent->fts_errno, _("cannot read directory %s"), quote (file));
ok = false;
- break;
+ }
+ else if (info != FTS_DP)
+ {
+ bool excluded = excluded_file_name (exclude, file);
+ if (! excluded)
+ {
+ /* Make the stat buffer *SB valid, or fail noisily. */
+
+ if (info == FTS_NSOK)
+ {
+ fts_set (fts, ent, FTS_AGAIN);
+ FTSENT const *e = fts_read (fts);
+ assert (e == ent);
+ info = ent->fts_info;
+ }
- case FTS_DC: /* directory that causes cycles */
- if (cycle_warning_required (fts, ent))
+ if (info == FTS_NS || info == FTS_SLNONE)
+ {
+ error (0, ent->fts_errno, _("cannot access %s"), quote (file));
+ return false;
+ }
+ }
+
+ if (excluded
+ || (! opt_count_all
+ && (hash_all || (! S_ISDIR (sb->st_mode) && 1 < sb->st_nlink))
+ && ! hash_ins (sb->st_ino, sb->st_dev)))
{
- emit_cycle_warning (file);
- return false;
+ /* If ignoring a directory in preorder, skip its children.
+ Ignore the next fts_read output too, as it's a postorder
+ visit to the same directory. */
+ if (info == FTS_D)
+ {
+ fts_set (fts, ent, FTS_SKIP);
+ FTSENT const *e = fts_read (fts);
+ assert (e == ent);
+ }
+
+ return true;
}
- ok = true;
- break;
- default:
- ok = true;
- break;
- }
+ switch (info)
+ {
+ case FTS_D:
+ return true;
- /* If this is the first (pre-order) encounter with a directory,
- or if it's the second encounter for a skipped directory, then
- return right away. */
- if (ent->fts_info == FTS_D || skip)
- return ok;
-
- /* If the file is being excluded or if it has already been counted
- via a hard link, then don't let it contribute to the sums. */
- if (skip
- || (!opt_count_all
- && (hash_all || (! S_ISDIR (sb->st_mode) && 1 < sb->st_nlink))
- && ! hash_ins (sb->st_ino, sb->st_dev)))
- {
- /* Note that we must not simply return here.
- We still have to update prev_level and maybe propagate
- some sums up the hierarchy. */
- duinfo_init (&dui);
- print = false;
- }
- else
- {
- duinfo_set (&dui,
- (apparent_size
- ? sb->st_size
- : (uintmax_t) ST_NBLOCKS (*sb) * ST_NBLOCKSIZE),
- (time_type == time_mtime ? get_stat_mtime (sb)
- : time_type == time_atime ? get_stat_atime (sb)
- : get_stat_ctime (sb)));
+ case FTS_ERR:
+ /* An error occurred, but the size is known, so count it. */
+ error (0, ent->fts_errno, _("%s"), quote (file));
+ ok = false;
+ break;
+
+ case FTS_DC:
+ if (cycle_warning_required (fts, ent))
+ {
+ emit_cycle_warning (file);
+ return false;
+ }
+ return true;
+ }
}
+ duinfo_set (&dui,
+ (apparent_size
+ ? sb->st_size
+ : (uintmax_t) ST_NBLOCKS (*sb) * ST_NBLOCKSIZE),
+ (time_type == time_mtime ? get_stat_mtime (sb)
+ : time_type == time_atime ? get_stat_atime (sb)
+ : get_stat_ctime (sb)));
+
level = ent->fts_level;
dui_to_print = dui;
@@ -535,20 +542,14 @@ process_file (FTS *fts, FTSENT *ent)
/* Let the size of a directory entry contribute to the total for the
containing directory, unless --separate-dirs (-S) is specified. */
- if ( ! (opt_separate_dirs && IS_DIR_TYPE (ent->fts_info)))
+ if (! (opt_separate_dirs && IS_DIR_TYPE (info)))
duinfo_add (&dulvl[level].ent, &dui);
/* Even if this directory is unreadable or we can't chdir into it,
do let its size contribute to the total. */
duinfo_add (&tot_dui, &dui);
- /* If we're not counting an entry, e.g., because it's a hard link
- to a file we've already counted (and --count-links), then don't
- print a line for it. */
- if (!print)
- return ok;
-
- if ((IS_DIR_TYPE (ent->fts_info) && level <= max_depth)
+ if ((IS_DIR_TYPE (info) && level <= max_depth)
|| ((opt_all && level <= max_depth) || level == 0))
print_size (&dui_to_print, file);
@@ -608,7 +609,7 @@ main (int argc, char **argv)
char *files_from = NULL;
/* Bit flags that control how fts works. */
- int bit_flags = FTS_TIGHT_CYCLE_CHECK | FTS_DEFER_STAT;
+ int bit_flags = FTS_NOSTAT;
/* Select one of the three FTS_ options that control if/when
to follow a symlink. */
@@ -902,6 +903,11 @@ main (int argc, char **argv)
if (!di_set)
xalloc_die ();
+ /* If not hashing everything, process_file won't find cycles on its
+ own, so ask fts_read to check for them accurately. */
+ if (opt_count_all || ! hash_all)
+ bit_flags |= FTS_TIGHT_CYCLE_CHECK;
+
bit_flags |= symlink_deref_bits;
static char *temp_argv[] = { NULL, NULL };
diff --git a/tests/du/deref b/tests/du/deref
index 79737a6..8e4feac 100755
--- a/tests/du/deref
+++ b/tests/du/deref
@@ -1,6 +1,8 @@
#!/bin/sh
# prior to coreutils-4.5.3, du -D didn't work in some cases
# Based on an example from Andreas Schwab and/or Michal Svec.
+# Also, up to coreutils-8.5, du -L sometimes incorrectly
+# counted the space of the followed symlinks.
# Copyright (C) 2002, 2006-2010 Free Software Foundation, Inc.
@@ -27,10 +29,24 @@ fi
mkdir -p a/sub || framework_failure
ln -s a/sub slink || framework_failure
touch b || framework_failure
+ln -s .. a/sub/dotdot || framework_failure
+ln -s nowhere dangle || framework_failure
# This used to fail with the following diagnostic:
# du: `b': No such file or directory
du -sD slink b > /dev/null 2>&1 || fail=1
+# This used to fail to report the dangling symlink.
+du -L dangle > /dev/null 2>&1 && fail=1
+
+# du -L used to mess up, either by counting the symlink's disk space itself
+# (-L should follow symlinks, not count their space)
+# or (briefly in July 2010) by omitting the entry for "a".
+du_L_output=`du -L a` || fail=1
+du_lL_output=`du -lL a` || fail=1
+du_x_output=`du --exclude=dotdot a` || fail=1
+test "X$du_L_output" = "X$du_x_output" || fail=1
+test "X$du_lL_output" = "X$du_x_output" || fail=1
+
Exit $fail
--
1.7.0.4
--- End Message ---