--- findutils/import-gnulib.config 2010-04-05 13:18:59.000000000 +0200
+++ findutils-rreale/import-gnulib.config 2010-04-08 10:49:22.000000000 +0200
@@ -73,6 +73,7 @@
 realloc
 regex
 rpmatch
+safe-read
 savedir
 selinux-at
 stat-macros
--- findutils/find/defs.h 2010-04-05 13:18:59.000000000 +0200
+++ findutils-rreale/find/defs.h 2010-04-08 13:31:49.000000000 +0200
@@ -168,6 +168,15 @@
   int fd;
 };
 
+struct samecontent_args
+{
+  struct stat st;
+  char *first_block;
+  size_t read;
+  char *ref_pathname;
+  int fd;
+};
+
 struct size_val
 {
   enum comparison_type kind;
@@ -319,6 +328,7 @@
     struct time_val reftime; /* newer newerXY anewer cnewer mtime atime ctime mmin amin cmin */
     struct perm_val perm; /* perm */
     struct samefile_file_id samefileid; /* samefile */
+    struct samecontent_args samecontentargs; /* samecontent */
     mode_t type; /* type */
     struct format_val printf_vec; /* printf fprintf fprint ls fls print0 fprint0 print */
     security_context_t scontext; /* security context */
@@ -467,6 +477,7 @@
 PREDICATEFUNCTION pred_writable;
 PREDICATEFUNCTION pred_xtype;
 PREDICATEFUNCTION pred_context;
+PREDICATEFUNCTION pred_samecontent;
 
 
 
@@ -528,6 +539,9 @@
 boolean looks_like_expression PARAMS((const char *arg, boolean leading));
 
 
+/* cmp.c */
+boolean cmp (int fd0, int fd1, const char *ref_pathname, const char *pathname, struct predicate *pred_ptr);
+
 enum DebugOption
   {
     DebugNone = 0,
--- findutils/find/Makefile.am 2010-04-05 13:18:59.000000000 +0200
+++ findutils-rreale/find/Makefile.am 2010-04-05 14:48:52.000000000 +0200
@@ -5,7 +5,7 @@
 # regexprops_SOURCES = regexprops.c
 
 noinst_LIBRARIES = libfindtools.a
-libfindtools_a_SOURCES = finddata.c fstype.c parser.c pred.c tree.c util.c sharefile.c
+libfindtools_a_SOURCES = finddata.c fstype.c parser.c pred.c tree.c util.c sharefile.c cmp.c
 
 # We always build two versions of find, one with fts, one without.
 
--- findutils/find/parser.c 2010-04-05 13:18:59.000000000 +0200
+++ findutils-rreale/find/parser.c 2010-04-08 17:21:49.000000000 +0200
@@ -159,6 +159,7 @@
 static boolean parse_xtype PARAMS((const struct parser_table*, char *argv[], int *arg_ptr));
 static boolean parse_quit PARAMS((const struct parser_table*, char *argv[], int *arg_ptr));
 static boolean parse_context PARAMS((const struct parser_table*, char *argv[], int *arg_ptr));
+static boolean parse_samecontent PARAMS((const struct parser_table*, char *argv[], int *arg_ptr));
 
 boolean parse_print PARAMS((const struct parser_table*, char *argv[], int *arg_ptr));
 
@@ -326,6 +327,7 @@
   {ARG_TEST, "writable", parse_accesscheck, pred_writable}, /* GNU, 4.3.0+ */
   PARSE_OPTION ("xdev", xdev), /* POSIX */
   PARSE_TEST ("xtype", xtype), /* GNU */
+  PARSE_TEST ("samecontent", samecontent),
 #ifdef UNIMPLEMENTED_UNIX
   /* It's pretty ugly for find to know about archive formats.
      Plus what it could do with cpio archives is very limited.
@@ -2348,20 +2350,22 @@
   return true;
 }
 
-
-static boolean
-parse_samefile (const struct parser_table* entry, char **argv, int *arg_ptr)
+/* Shared front end of -samefile and -samecontent.  Take the file name
+   argument via collect_arg_stat_info, leaving its stat data in *ST and its
+   name in *FILENAME, then try to open it.  *FD receives the descriptor, or
+   a negative value when the file was not opened or the open was undone.
+   Return false if collect_arg_stat_info rejects the argument.  */
+static boolean
+same_file_or_content_helper (char **argv, int *arg_ptr, struct stat *st, const char **filename, int *fd)
 {
   /* General idea: stat the file, remember device and inode numbers.
    * If a candidate file matches those, it's the same file.
    */
-  struct predicate *our_pred;
-  struct stat st, fst;
-  int fd, openflags;
-  const char *filename;
+  struct stat fst;
+  int openflags;
 
-  set_stat_placeholders (&st);
-  if (!collect_arg_stat_info (argv, arg_ptr, &st, &filename))
+  set_stat_placeholders (st);
+  if (!collect_arg_stat_info (argv, arg_ptr, st, filename))
     return false;
 
   set_stat_placeholders (&fst);
@@ -2370,7 +2374,7 @@
    * the file open if we can. This would prevent the system reusing
    * the file.
    */
-  fd = -3; /* means, uninitialised */
+  *fd = -3; /* means, uninitialised */
   openflags = O_RDONLY;
 
   if (options.symlink_handling == SYMLINK_NEVER_DEREF)
@@ -2379,11 +2383,11 @@
         {
           assert (O_NOFOLLOW != 0);
           openflags |= O_NOFOLLOW;
-          fd = -1; /* safe to open it. */
+          *fd = -1; /* safe to open it. */
         }
       else
         {
-          if (S_ISLNK(st.st_mode))
+          if (S_ISLNK(st->st_mode))
             {
               /* no way to ensure that a symlink will not be followed
                * by open(2), so fall back on using lstat(). Accept
@@ -2392,11 +2396,11 @@
                *
                * Avoid opening the file.
                */
-              fd = -2; /* Do not open it */
+              *fd = -2; /* Do not open it */
             }
           else
             {
-              fd = -1;
+              *fd = -1;
               /* Race condition here: the file might become a symlink here. */
             }
         }
@@ -2404,26 +2408,26 @@
   else
     {
       /* We want to dereference the symlink anyway */
-      fd = -1; /* safe to open it without O_NOFOLLOW */
+      *fd = -1; /* safe to open it without O_NOFOLLOW */
     }
 
-  assert (fd != -3); /* check we made a decision */
-  if (fd == -1)
+  assert (*fd != -3); /* check we made a decision */
+  if (*fd == -1)
     {
       /* Race condition here. The file might become a
        * symbolic link in between out call to stat and
        * the call to open.
        */
-      fd = open (argv[*arg_ptr], openflags);
+      *fd = open (*filename, openflags);
 
-      if (fd >= 0)
+      if (*fd >= 0)
         {
           /* We stat the file again here to prevent a race condition
            * between the first stat and the call to open(2).
            */
-          if (0 != fstat (fd, &fst))
+          if (0 != fstat (*fd, &fst))
             {
-              fatal_file_error (argv[*arg_ptr]);
+              fatal_file_error (*filename);
             }
           else
             {
@@ -2432,19 +2436,19 @@
                * open, fst may contain the stat information for the
                * destination of the link, not the link itself.
                */
-              if ((*options.xstat) (argv[*arg_ptr], &st))
-                fatal_file_error (argv[*arg_ptr]);
+              if ((*options.xstat) (*filename, st))
+                fatal_file_error (*filename);
 
               if ((options.symlink_handling == SYMLINK_NEVER_DEREF)
                   && (!options.open_nofollow_available))
                 {
-                  if (S_ISLNK(st.st_mode))
+                  if (S_ISLNK(st->st_mode))
                     {
                       /* We lost the race. Leave the data in st. The
                        * file descriptor points to the wrong thing.
                        */
-                      close (fd);
-                      fd = -1;
+                      close (*fd);
+                      *fd = -1;
                     }
                   else
                     {
@@ -2460,8 +2464,8 @@
                        * so the open() call may have followed a symlink
                        * even if the -P option is in effect.
                        */
-                      if ((st.st_dev == fst.st_dev)
-                          && (st.st_ino == fst.st_ino))
+                      if ((st->st_dev == fst.st_dev)
+                          && (st->st_ino == fst.st_ino))
                         {
                           /* No race. No need to copy fst to st,
                            * since they should be identical (modulo
@@ -2473,19 +2477,33 @@
                           /* We lost the race. Leave the data in st. The
                            * file descriptor points to the wrong thing.
                            */
-                          close (fd);
-                          fd = -1;
+                          close (*fd);
+                          *fd = -1;
                         }
                     }
                 }
               else
                 {
-                  st = fst;
+                  *st = fst;
                 }
             }
         }
     }
 
+  return true;
+}
+
+static boolean
+parse_samefile (const struct parser_table* entry, char **argv, int *arg_ptr)
+{
+  struct predicate *our_pred;
+  struct stat st;
+  int fd;
+  const char *filename;
+
+  if (!same_file_or_content_helper (argv, arg_ptr, &st, &filename, &fd))
+    return false;
+
   our_pred = insert_primary (entry, filename);
   our_pred->args.samefileid.ino = st.st_ino;
   our_pred->args.samefileid.dev = st.st_dev;
@@ -2820,6 +2838,43 @@
 {
   return insert_type (argv, arg_ptr, entry, pred_xtype);
 }
+
+/* Parse "-samecontent FILE": remember the stat data, the name and an open
+   descriptor of regular file FILE for pred_samecontent.  */
+static boolean
+parse_samecontent (const struct parser_table* entry, char **argv, int *arg_ptr)
+{
+  struct predicate *our_pred;
+  struct stat st;
+  int fd;
+  const char *filename;
+
+  if (!same_file_or_content_helper (argv, arg_ptr, &st, &filename, &fd))
+    return false;
+
+  /* The contents can only be compared if FILE is a regular file that could
+     be opened.  Do not leak the descriptor when rejecting the argument.  */
+  if (!S_ISREG (st.st_mode) || fd < 0)
+    {
+      if (fd >= 0)
+        close (fd);
+      return false;
+    }
+
+  our_pred = insert_primary (entry, filename);
+
+  our_pred->args.samecontentargs.st = st;
+  our_pred->args.samecontentargs.first_block = NULL;
+  our_pred->args.samecontentargs.read = 0;
+  our_pred->args.samecontentargs.ref_pathname = xstrdup (filename);
+  our_pred->args.samecontentargs.fd = fd;
+
+  our_pred->need_type = false;
+  our_pred->need_stat = true;
+  our_pred->est_success_rate = 0.0001f;
+
+  return true;
+}
 
 static boolean
 insert_type (char **argv, int *arg_ptr,
--- findutils/find/pred.c 2010-04-05 13:18:59.000000000 +0200
+++ findutils-rreale/find/pred.c 2010-04-08 17:22:51.000000000 +0200
@@ -234,6 +234,7 @@
   {pred_writable, "writable "},
   {pred_xtype, "xtype "},
   {pred_context, "context"},
+  {pred_samecontent, "samecontent"},
   {0, "none "}
 };
 #endif
@@ -1905,6 +1906,37 @@
   return (pred_type (pathname, &sbuf, pred_ptr));
 }
 
+/* True if PATHNAME is a regular file with the same size and contents as
+   the reference file recorded by parse_samecontent.  */
+boolean
+pred_samecontent (const char *pathname, struct stat *stat_buf, struct predicate *pred_ptr)
+{
+  struct samecontent_args *ref = &pred_ptr->args.samecontentargs;
+  boolean same;
+  int fd;
+
+  assert (S_ISREG (ref->st.st_mode));
+
+  /* Cheap rejection: only a regular file of equal size can match.  */
+  if (!S_ISREG (stat_buf->st_mode) || stat_buf->st_size != ref->st.st_size)
+    return false;
+
+  /* NOTE(review): confirm that PATHNAME can be resolved from the directory
+     find is working in when this predicate runs.  */
+  fd = open (pathname, O_RDONLY | O_BINARY);
+  if (fd < 0)
+    {
+      /* Report the failure and carry on with the next file.  */
+      error (0, errno, "%s", safely_quote_err_filename (0, pathname));
+      state.exit_status = 1;
+      return false;
+    }
+
+  same = cmp (ref->fd, fd, ref->ref_pathname, pathname, pred_ptr);
+  close (fd);
+
+  return same;
+}
 
 boolean
 pred_context (const char *pathname, struct stat *stat_buf,
--- findutils/find/cmp.c 1970-01-01 01:00:00.000000000 +0100
+++ findutils-rreale/find/cmp.c 2010-04-08 18:23:56.000000000 +0200
@@ -0,0 +1,114 @@
+/* Buffer primitives for comparison operations.  Adapted from the
+   following files:
+
+   - src/cmp.c in GNU diffutils 2.9
+
+     Copyright (C) 1990-1996, 1998, 2001-2002, 2004, 2006-2007, 2009-2010 Free
+     Software Foundation, Inc.
+
+   - lib/cmpbuf.c in GNU diffutils 2.9
+
+     Copyright (C) 1993, 1995, 1998, 2001-2002, 2006, 2009-2010 Free Software
+     Foundation, Inc.  */
+
+#include <config.h>
+
+#include "defs.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "xalloc.h"
+#include "error.h"
+#include "safe-read.h"
+
+/* Number of bytes read from each file and compared per iteration.  */
+#define LARGE_BLOCK_SIZE 4096
+
+/* Read up to NBYTES bytes at BUF from descriptor FD, retrying if interrupted
+   and continuing after short reads.  PATHNAME is used only in diagnostics.
+   Return the number of bytes read; it is less than NBYTES only when end of
+   file was reached.  A read error is fatal.  */
+
+static size_t
+safe_block_read (int fd, const char *pathname, char *buf, size_t nbytes)
+{
+  size_t total = 0;
+
+  while (total < nbytes)
+    {
+      size_t nread = safe_read (fd, buf + total, nbytes - total);
+
+      if (nread == SAFE_READ_ERROR)
+        error (EXIT_FAILURE, errno, "%s", safely_quote_err_filename (0, pathname));
+      if (nread == 0)
+        break;
+
+      total += nread;
+    }
+
+  return total;
+}
+
+/* Position descriptor FD, open on PATHNAME, OFFSET bytes from the start of
+   the file.  A failure is fatal.  */
+
+static void
+seek_to (int fd, const char *pathname, off_t offset)
+{
+  if (lseek (fd, offset, SEEK_SET) < 0)
+    error (EXIT_FAILURE, errno, "%s", safely_quote_err_filename (0, pathname));
+}
+
+/* Compare the reference file (open on FD0, named REF_PATHNAME) with the
+   candidate file (open on FD1, named PATHNAME).  The first block of the
+   reference file is read once and cached in PRED_PTR's samecontent
+   arguments, so later calls only re-read the reference file past it.
+   Return true if the files have identical contents, otherwise false.  */
+
+boolean
+cmp (int fd0, int fd1, const char *ref_pathname, const char *pathname,
+     struct predicate *pred_ptr)
+{
+  struct samecontent_args *refer = &pred_ptr->args.samecontentargs;
+  char ref_buf[LARGE_BLOCK_SIZE];
+  char buf[LARGE_BLOCK_SIZE];
+  const char *ref_block;
+  size_t read0, read1;
+
+  seek_to (fd1, pathname, 0);
+
+  if (refer->first_block == NULL)
+    {
+      seek_to (fd0, ref_pathname, 0);
+      refer->first_block = xmalloc (LARGE_BLOCK_SIZE);
+      refer->read = safe_block_read (fd0, ref_pathname, refer->first_block,
+                                     LARGE_BLOCK_SIZE);
+    }
+  else
+    {
+      /* Resume reading the reference file right after the cached block.  */
+      seek_to (fd0, ref_pathname, (off_t) refer->read);
+    }
+
+  ref_block = refer->first_block;
+  read0 = refer->read;
+
+  for (;;)
+    {
+      read1 = safe_block_read (fd1, pathname, buf, LARGE_BLOCK_SIZE);
+
+      /* Unequal lengths mean a file changed size after it was stat'ed;
+         that is a difference, not a reason to abort.  memcmp needs no
+         sentinel bytes, so nothing is accessed beyond the data read.  */
+      if (read0 != read1 || memcmp (ref_block, buf, read0) != 0)
+        return false;
+
+      /* A short block means both files ended without a difference.  */
+      if (read0 < LARGE_BLOCK_SIZE)
+        return true;
+
+      read0 = safe_block_read (fd0, ref_pathname, ref_buf, LARGE_BLOCK_SIZE);
+      ref_block = ref_buf;
+    }
+}