[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: csplit: memory exhausted
From: |
lukekendall |
Subject: |
Re: csplit: memory exhausted |
Date: |
Mon, 16 Aug 2004 01:07:31 +1000 (EST) |
On 11 Aug, Jim Meyering wrote:
> address@hidden wrote:
> ...
> > While the 5.1.2 test release fixed my problem, I've been having a
> > disturbing problem that until today I could ignore. Specifically, if
> > csplit is fed a lot of input (800kb) from stdin, it would sometimes
> > corrupt a tiny amount of the output.
> >
> > E.g. one line was output as:
> > address@hidden@ch patterns particularly archaic?'<P>
> > instead of:
> > her speech patterns particularly archaic?'<P>
> > (showing the odd characters as nvi displays them).
> ...
>
> Thanks for the report.
> [I've redirected to the address@hidden mailing list.]
>
> Would you please see if you can reproduce the problem using
> the latest stable release: coreutils-5.2.1?
>
> ftp://ftp.gnu.org/gnu/coreutils/coreutils-5.2.1.tar.gz
> ftp://ftp.gnu.org/gnu/coreutils/coreutils-5.2.1.tar.bz2
>
> If you can still reproduce it, would you point me to a copy
> of the input file that provokes the bug?
Sorry, Jim, the bug is still there.
The input file leeth-34.gz can be found at
http://www.members.optushome.com.au/lukekendall/leeth-34.gz
This represents the first 13 chapters of a book I'm writing, and trying
to get published, so please don't distribute it any wider than you need
to for testing, and please delete it when you've finished testing with
it.
Also, could you let me know as soon as you've downloaded it, so I can
delete it from my web site? Thanks.
Here's an example of the bug produced via a script I've written that
relies on csplit (Note: you'll need to replace "whiches csplit" by
"which csplit" - or just delete that line). Also, alter the PATH
setting. Maybe also do this:
mkdir -p html2/OLD
: /home/luke/books/leeth; cat /tmp/leeth-34 | fixhtml -start 1 -min 12 -o
html2/leeth%02d.html - && mv /tmp/xxx.html html2/leeth00.html && ls -l
html2/leeth00.html
/home/luke/linux/coreutils-5.2.1/src/csplit
/usr/local/bin/csplit
/usr/bin/csplit
/usr/local/bin/csplit
rm: cannot remove `FH24700-001--expanded': No such file or directory
-rw-rw---- 1 luke kendall 13923 Aug 16 00:53 FH24700-001-000
Chapter 000 contained no Chapter 0<P>: moving to /tmp/xxx.html ...
-rw-rw---- 1 luke kendall 16148 Aug 16 00:53 html2/leeth01.html
-rw-rw---- 1 luke kendall 15147 Aug 16 00:53 html2/leeth02.html
-rw-rw---- 1 luke kendall 26319 Aug 16 00:53 html2/leeth03.html
-rw-rw---- 1 luke kendall 13208 Aug 16 00:53 html2/leeth04.html
-rw-rw---- 1 luke kendall 19557 Aug 16 00:53 html2/leeth05.html
-rw-rw---- 1 luke kendall 4572 Aug 16 00:53 html2/leeth06.html
-rw-rw---- 1 luke kendall 44624 Aug 16 00:53 html2/leeth07.html
-rw-rw---- 1 luke kendall 34258 Aug 16 00:53 html2/leeth08.html
-rw-rw---- 1 luke kendall 42299 Aug 16 00:53 html2/leeth09.html
-rw-rw---- 1 luke kendall 29696 Aug 16 00:53 html2/leeth10.html
FH24700-001-011 is corrupted (3 weird characters)
FH24700-001-011 -> html2/leeth11.html via sed & is corrupted (3 weird
characters)
-rw-rw---- 1 luke kendall 50340 Aug 16 00:53 html2/leeth11.html
FH24700-001-012 is corrupted (7 weird characters)
FH24700-001-012 -> html2/leeth12.html via sed & is corrupted (7 weird
characters)
-rw-rw---- 1 luke kendall 59645 Aug 16 00:53 html2/leeth12.html
-rw-rw---- 1 luke kendall 56334 Aug 16 00:53 html2/leeth13.html
-rw-rw---- 1 luke kendall 28468 Aug 16 00:53 html2/leeth14.html
-rw-rw---- 1 luke kendall 34364 Aug 16 00:53 html2/leeth15.html
-rw-rw---- 1 luke kendall 41876 Aug 16 00:53 html2/leeth16.html
-rw-rw---- 1 luke kendall 13923 Aug 16 00:53 html2/leeth00.html
: /home/luke/books/leeth; diff html2/leeth11.html html2/OLD | cat -tvu
106c106
< hugging herself in a futile attempt to relieve the pain.<P>
---
> address@hidden in a futile attempt to relieve the pain.<P>
725c725
< address@hidden@address@hidden@address@hidden@"address@hidden<P>
---
> halfway to them.<P>
I've included the script fixhtml here, also the utility chcnt.c which
the script uses to detect csplit corrupting the output files.
You may have to run the script a couple of times to get it to produce
corrupted files.
: /home/luke/books/leeth; uname -a
Linux posh 2.4.23 #3 Sun Nov 30 19:53:18 EST 2003 i686 unknown unknown GNU/Linux
: /home/luke/books/leeth; cat /proc/meminfo
total: used: free: shared: buffers: cached:
Mem: 262897664 258416640 4481024 0 137314304 35610624
Swap: 271392768 148692992 122699776
MemTotal: 256736 kB
MemFree: 4376 kB
MemShared: 0 kB
Buffers: 134096 kB
Cached: 15932 kB
SwapCached: 18844 kB
Active: 140184 kB
Inactive: 79596 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 256736 kB
LowFree: 4376 kB
SwapTotal: 265032 kB
SwapFree: 119824 kB
: /home/luke/books/leeth; cat /etc/redhat-release
Red Hat Linux release 7.2 (Enigma)
luke
----------------------------- fixhtml -----------------------------
#!/bin/sh
#
# Fix up the html generated by troff2html
#
# Errors include: chapter headings doubled, silly placement of
# end-bold, failure to number chapters, use of BR instead of P
#
# <P>
# Chapter 0<P>
# <B>Chapter 0<P>
# </B>Blah blah blah
#
#
# Configuration.
#
# CSPLIT_DIR is searched first for csplit, so a specific build can be
# tested.  It defaults to the location used when this script was written;
# set CSPLIT_DIR in the environment to point at a different build.
#
CSPLIT_DIR=${CSPLIT_DIR:-/home/luke/linux/coreutils-5.2.1/src}
PATH="$CSPLIT_DIR:$PATH"
# Script name without its directory part, for messages.
MYNAME=${0##*/}
# Show which csplit will actually be run.  "command -v" is the standard
# way to ask; warn (but carry on) when nothing is found.
command -v csplit || echo "$MYNAME: warning: no csplit found on PATH" >&2
# How many times to run csplit on one input before giving up.
MAX_TRIES=3
# Minimum number of pieces csplit must produce (overridden by -min).
MIN_OUTPUTS=0
# Text that starts the line marking the beginning of each chapter.
PATTERN="Chapter 0<P>"
USAGE="Usage: $MYNAME [-start N] [-o outfile] [-min M ] infile
-min M Means it should produce at least M output files.
-start N Means chapter number will start from N. Default is 1.
-o outfile Output filename. Default is -, i.e. stdout. A printf
format string will be expanded; e.g. %d formats will
be replaced by the current value of N.
Example: $MYNAME -start 3 -o 'fred-%02d.html' -
Will convert standard input to files named fred-03.html, fred-04.html, etc."
#
# usage: print the usage summary ($USAGE) on stderr and exit with status 1.
#
usage()
{
    # printf rather than echo: echo's handling of backslashes and of a
    # leading "-n" differs between /bin/sh implementations.
    printf '%s\n' "$USAGE" >&2
    exit 1
}
#
# Option defaults, then command-line parsing.
#   OUTPAT   printf-style pattern for output file names (-o)
#   OUTFILE  name of the output file currently being written
#   N        number given to the next chapter (-start)
#
OUTPAT="-"
OUTFILE=
N=1
# Consume leading options; the first word that is not a known option
# ends option parsing and is treated as the first input file.
while [ "$#" -gt 0 ]
do
    case "$1" in
    -h)
        usage
        shift
        ;;
    -m | -min)
        # Each valued option needs its argument to be present.
        if [ "$#" -lt 2 ]; then usage; fi
        MIN_OUTPUTS="$2"
        shift 2
        ;;
    -o)
        if [ "$#" -lt 2 ]; then usage; fi
        OUTPAT="$2"
        shift 2
        ;;
    -start)
        if [ "$#" -lt 2 ]; then usage; fi
        N="$2"
        shift 2
        ;;
    *)
        break
        ;;
    esac
done
# At least one input file (or "-") must remain.
if [ "$#" -eq 0 ]; then usage; fi
#
# For each input file...
#
BATCH=1
for f
do
    #
    # Split the file into chapters based on the pattern "^$PATTERN",
    # creating files named FH$$-BBB-CCC in the current directory, where
    # BBB is the batch number and CCC the chapter number (both are
    # zero-padded to three digits).
    #
    TRY=0
    TMP=FH$$-$(printf "%03d" "$BATCH")-
    #
    # csplit prior to version 2.0.21 could routinely fail:
    #    "csplit: memory exhausted"
    # yet succeed next time. Bug in its memory handling on large inputs.
    # Worse, it can sometimes truncate input, producing fewer files
    # than it should, without returning an error. Hence $MIN_OUTPUTS
    # and the retry loop below.
    # That latter seems fixed; the former isn't, so we have to avoid
    # piping into csplit.
    #
    while :
    do
        # Remove pieces left by an earlier, short attempt so they cannot
        # be counted (or processed) as output of this attempt.
        rm -f -- "$TMP"*
        ### Major bad bug in csplit: piping the input straight through
        ### corrupts the output stream, as detected by chcnt below.
        ### By writing it to a temporary file we don't trigger the bug.
        ### The workaround is left commented out so that this script
        ### still demonstrates the bug:
        #sed 's/<BR>/<P>/' "$f" > $TMP-expanded
        #if csplit --prefix=$TMP --elide-empty-files -n 3 \
        #    --silent $TMP-expanded "/^$PATTERN/" "{*}"
        ### NOTE: if re-enabled, $TMP-expanded also matches "$TMP"* and
        ### would be included in the piece count below.
        if sed 's/<BR>/<P>/' "$f" |
            csplit --prefix="$TMP" --elide-empty-files -n 3 \
                --silent - "/^$PATTERN/" "{*}"
        then
            # Count the pieces with a glob; no scratch file in /tmp and
            # no parsing of ls output is needed.
            N_OUTPUTS=0
            for piece in "$TMP"*
            do
                if [ -e "$piece" ]
                then
                    N_OUTPUTS=$((N_OUTPUTS + 1))
                fi
            done
            if [ "$N_OUTPUTS" -ge "$MIN_OUTPUTS" ]
            then
                break
            else
                echo "csplit produced only $N_OUTPUTS files" >&2
            fi
        else
            echo "csplit failure" >&2
        fi
        TRY=$((TRY + 1))
        if [ "$TRY" -ge "$MAX_TRIES" ]
        then
            echo "$MYNAME: exceeded max no. of retries, giving up." >&2
            rm -f -- "$TMP"*
            exit 1
        fi
    done
    # The expanded copy only exists when the temporary-file workaround
    # above is enabled, so its absence is not an error.
    rm -f -- "$TMP-expanded"
    #
    # For each of the chapter files just produced...
    #
    for f2 in "$TMP"*
    do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$f2" ] || continue
        # chcnt exits non-zero when it finds non-printable bytes; its
        # exit status is the count reported in the message.
        chcnt -q < "$f2" ||
            echo "$f2 is corrupted ($? weird characters)"
        # A piece without the chapter marker is the text that precedes
        # the first chapter: set it aside and go on to the next piece.
        if ! grep "^$PATTERN" "$f2" > /dev/null
        then
            ls -l "$f2"
            echo "$f2 contained no $PATTERN: moving to /tmp/xxx.html ..." \
                | sed "s|$TMP|Chapter |" >&2
            mv "$f2" /tmp/xxx.html
            continue
        fi
        #
        # Do the fixups (sed runs with -n, so only "p" prints).
        # :B
        #    Delete the repeated heading line and any BODY/HTML tag
        #    line, starting the next cycle.
        #    Renumber the bold "<B>$PATTERN" heading as chapter $N and
        #    print it; if that substitution happened, branch to N.
        #    Else print the line, read the next one, loop back to B.
        #
        # :N
        #    Read the next line. Replace </B> by <P>.
        #    Fall through to E.
        #
        # :E
        #    Print line. Get next line. Loop back to E.
        #
        # NOTE(review): the usage text says an outfile of "-" means
        # stdout, but the redirection below writes to a file that is
        # literally named "-".
        #
        OUTFILE=$(printf "$OUTPAT" "$N")
        sed -n \
            -e ":B;/^$PATTERN/d;/<\/*BODY>/d;/<\/*HTML>/d" \
            -e "s|<B>$PATTERN|<B>Chapter $N</B>|p;tN;p;n;bB;:N;n;s|/B|P|;" \
            -e ":E;p;n;bE" \
            "$f2" > "$OUTFILE"
        chcnt -q < "$OUTFILE" ||
            echo "$f2 -> $OUTFILE via sed & is corrupted ($? weird characters)"
        # Book-specific tweak applied to chapter 21 only.
        case "$OUTFILE" in
        *leeth21?html*)
            echo "Shrinking 'I forgot.'"
            sed "s|^'I forgot.'<P>|<font size=-1>'I forgot.'</font><P>|" \
                "$OUTFILE" > "$OUTFILE.tmp"
            mv "$OUTFILE.tmp" "$OUTFILE"
            ;;
        esac
        # expr rather than $(( )): a user-supplied start such as "08"
        # must be read as decimal, not as an invalid octal number.
        N=$(expr "$N" + 1)
        ls -l "$OUTFILE"
        rm "$f2"
    done
    BATCH=$((BATCH + 1))
done
------------------------------ chcnt.c ------------------------------
/*
* chcnt.c -- count characters in a file, and display summary count
* -- (for finding existence of non-ascii characters in files).
*
* Copyright (C) 1990 Luke Kendall
*
* Permission granted to Canon Information Systems Research Australia P/L
* to use and modify as much as they like.
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
/* Occurrence count for each byte value, indexed by the value getchar() returns. */
int ch[256] = { 0 };
/* Program name (argv[0]) shown in the usage message; assigned at the start of main(). */
char *myname;
/*
 * usage -- print the command-line summary on stderr and exit with
 * status 1.  Does not return.
 */
void
usage(void)
{
    fprintf(stderr, "usage: %s [-q] [-l]\n", myname);
    fprintf(stderr, "\twhere\t-q means quiet\n");
    fprintf(stderr, "\t\t-l means 1 frequency count per line\n");
    exit(1);
}
main(argc, argv)
int argc;
char *argv[];
{
register int c;
register int n;
int quiet = 0;
int single = 0;
int nons = 0;
myname = argv[0];
if (argc > 1)
{
if (argv[1][0] == '-')
{
if (argv[1][1] == 'q')
quiet = 1;
else if (argv[1][1] == 'l')
single = 1;
}
else
usage();
}
while ((c = getchar()) != EOF)
ch[c]++;
if (!quiet)
{
for (n = c = 0; c < sizeof ch / sizeof ch[0]; c++)
{
if (ch[c] == 0)
continue;
if (isprint(c))
printf(" %c:%5d", c, ch[c]);
else
printf("%02x:%5d", c, ch[c]);
if (single)
printf("\n");
else if ((n % 8) == 7)
printf("\n");
else
printf(" ");
n++;
}
if (!single && (n % 8) != 0)
printf("\n");
}
for (c = 0; c < sizeof ch / sizeof ch[0]; c++)
if (!isprint(c) && c != '\n' && c != '\t' && ch[c])
++nons;
exit(nons);
}