[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
sdiff Enhancement
From: |
Grant Stevens |
Subject: |
sdiff Enhancement |
Date: |
Mon, 26 Nov 2007 17:45:31 -0800 (PST) |
Hello!
A project I worked recently required me to build an enhancement to sdiff. I'd
like to contribute the enhancement to GNU diffutils. Paul Eggert assisted me
with the legal paperwork and suggested I submit the enhancement to this address.
An abstract and diff -u output are attached. Can I add anything else to help
the process along?
Thanks and best wishes ... Grant
---------------------------------
Never miss a thing. Make Yahoo your homepage.
Enhancement to sdiff
A project I worked recently required that I "sdiff" two files, but I needed a
line-matchup that would align similar-but-not-identical lines. This note
describes the requirement.
Suppose we sdiff these two files:
$ cat file1
line1
close but not exact
another not exact
last line
$ cat file2
line1
line2
close but inexact
line3
line4
another inexact
last line
$ diff -W 43 -y file1 file2
line1 line1
close but not exact | line2
another not exact | close but inexact
> line3
> line4
> another inexact
last line last line
Notice that identical lines are paired up nicely, but lines with any
differences are paired up without regard to their similarities.
I needed a tighter line-matching that would detect
similar-but-not-identical lines and associate them. I built the enhancement
to diffutils-2.8.1, using the previously unused -Y option for this feature.
The resulting output looks like this:
$ diff -W 43 -Y file1 file2 # (-Y = -y option, enhanced for tighter matching)
line1 line1
> line2
close but not exact | close but inexact
> line3
> line4
another not exact | another inexact
last line last line
Notice that the similar lines are shown together.
diff -ur original/src/diff.c work/src/diff.c
--- original/src/diff.c 2002-03-24 02:35:28.000000000 -0500
+++ work/src/diff.c 2007-05-30 21:06:27.000000000 -0400
@@ -139,7 +139,7 @@
}
static char const shortopts[] =
-"0123456789abBcC:dD:eEfF:hHiI:lL:nNpPqrsS:tTuU:vwW:x:X:y";
+"0123456789abBcC:dD:eEfF:hHiI:lL:nNpPqrsS:tTuU:vwW:x:X:yY";
/* Values for long options that do not have single-letter equivalents. */
enum
@@ -229,6 +229,7 @@
{"show-c-function", 0, 0, 'p'},
{"show-function-line", 1, 0, 'F'},
{"side-by-side", 0, 0, 'y'},
+ {"side-by-side-tight-match", 0, 0, 'Y'},
{"speed-large-files", 0, 0, 'H'},
{"starting-file", 1, 0, 'S'},
{"strip-trailing-cr", 0, 0, STRIP_TRAILING_CR_OPTION},
@@ -492,6 +493,11 @@
specify_style (OUTPUT_SDIFF);
break;
+ case 'Y':
+ specify_style (OUTPUT_SDIFF);
+ sdiff_tighter_match = 1;
+ break;
+
case 'W':
numval = strtoumax (optarg, &numend, 10);
if (! (0 < numval && numval <= INT_MAX) || *numend)
@@ -854,7 +860,8 @@
N_("-e --ed Output an ed script."),
N_("--normal Output a normal diff."),
N_("-n --rcs Output an RCS format diff."),
- N_("-y --side-by-side Output in two columns.\n\
+ N_("-y --side-by-side Output in two columns."),
+ N_("-Y --side-by-side-tight-match Output in two columns.\n\
-W NUM --width=NUM Output at most NUM (default 130) print columns.\n\
--left-column Output only the left column of common lines.\n\
--suppress-common-lines Do not output common lines."),
diff -ur original/src/diff.h work/src/diff.h
--- original/src/diff.h 2002-03-11 16:24:42.000000000 -0500
+++ work/src/diff.h 2007-05-30 21:06:20.000000000 -0400
@@ -179,6 +179,9 @@
XTERN unsigned int sdiff_half_width;
XTERN unsigned int sdiff_column2_offset;
+/* Tell OUTPUT_SDIFF to use a tighter line-matching algorithm. */
+XTERN bool sdiff_tighter_match;
+
/* String containing all the command options diff received,
with spaces between and at the beginning but none at the end.
If there were no options given, this string is empty. */
diff -ur original/src/side.c work/src/side.c
--- original/src/side.c 2002-02-07 13:17:04.000000000 -0500
+++ work/src/side.c 2007-05-31 16:17:05.000000000 -0400
@@ -22,8 +22,33 @@
#include "diff.h"
+struct line_matchup
+{
+ struct line_matchup * next;
+ lin left;
+ lin right;
+};
+
static void print_sdiff_common_lines (lin, lin);
static void print_sdiff_hunk (struct change *);
+static struct line_matchup * find_best_match(
+ char const* const*, lin, lin,
+ char const* const*, lin, lin);
+static struct line_matchup * find_best_match1(
+ char const* const*, lin, lin,
+ char const* const*, lin, lin);
+static void free_line_matchup (struct line_matchup *);
+static void insert_matchup (
+ struct line_matchup * *,
+ struct line_matchup * *,
+ lin, lin, lin);
+static lin figure_score (
+ char const* const* leftlines,
+ char const* const* rightlines,
+ struct line_matchup * line_matchup);
+static lin matching_chars (
+ char const*, char const*,
+ char const*, char const*);
/* Next line number to be printed in the two input files. */
static lin next0, next1;
@@ -238,6 +263,7 @@
{
lin first0, last0, first1, last1;
register lin i, j;
+ struct line_matchup * line_matchup, * current_line_pair;
/* Determine range of line numbers involved in each file. */
enum changes changes =
@@ -255,29 +281,373 @@
fprintf (outfile, "c%ld,%ld\n", len0, len1);
}
- /* Print ``xxx | xxx '' lines */
- if (changes == CHANGED)
+ line_matchup =
+ find_best_match (
+ files[0].linbuf, first0, last0,
+ files[1].linbuf, first1, last1);
+
+ for ( current_line_pair = line_matchup ;
+ current_line_pair ;
+ current_line_pair = current_line_pair->next )
+ {
+ if ( current_line_pair->left == -1 ) /* no line from left file */
+ {
+ print_1sdiff_line (
+ 0, '>',
+ &files[1].linbuf[current_line_pair->right]);
+ next1 = current_line_pair->right + 1;
+ }
+ else if ( current_line_pair->right == -1 ) /* no line from right file */
+ {
+ print_1sdiff_line (
+ &files[0].linbuf[current_line_pair->left],
+ '<', 0);
+ next0 = current_line_pair->left + 1;
+ }
+ else /* lines from both files */
+ {
+ print_1sdiff_line (
+ &files[0].linbuf[current_line_pair->left],
+ '|',
+ &files[1].linbuf[current_line_pair->right]);
+ next0 = first0 = current_line_pair->left + 1;
+ next1 = first1 = current_line_pair->right + 1;
+ }
+ }
+
+ /* free the memory malloc'd by find_best_match() */
+ free_line_matchup (line_matchup);
+}
+
+/* We have a group of lines from the left file and a matching group from the
+ right file. Find lines with the most similarities, and pair them up.
+*/
+static struct line_matchup *
+find_best_match (
+ char const* const* leftlines, const lin l_start, const lin l_end,
+ char const* const* rightlines, const lin r_start, const lin r_end)
+{
+ int incr, l_must_match, r_must_match, prior_left, prior_right;
+ struct line_matchup * best_match, * current_match;
+
+ best_match =
+ find_best_match1 (
+ leftlines, l_start, l_end,
+ rightlines, r_start, r_end);
+
+ if ( sdiff_tighter_match )
+ {
+ /* At this point best_match has a record for every matched pair of lines.
*/
+ /* Here we insert records for the unmatched lines. */
+ for ( current_match = best_match,
+ l_must_match = prior_left = l_start,
+ r_must_match = prior_right = r_start ;
+ l_must_match <= l_end
+ || r_must_match <= r_end ;
+ /* see increments throughout loop */ )
+ {
+ if ( current_match )
+ {
+ prior_left = MAX (prior_left, current_match->left);
+ prior_right = MAX (prior_right, current_match->right);
+ }
+ if ( l_must_match < prior_left )
+ {
+ insert_matchup (&best_match, ¤t_match,
+ l_must_match, -1,
+ 0 /* 0 = insert before current */);
+ ++ l_must_match;
+ }
+ else if ( r_must_match < prior_right )
+ {
+ insert_matchup (&best_match, ¤t_match,
+ -1, r_must_match,
+ 0 /* 0 = insert before current */);
+ ++ r_must_match;
+ }
+ else if ( prior_left == l_must_match
+ && prior_right == r_must_match
+ && l_start <= l_end
+ && r_start <= r_end )
+ {
+ ++ l_must_match;
+ ++ r_must_match;
+ }
+ else if ( l_must_match >= prior_left
+ && l_must_match <= l_end
+ && l_start <= l_end )
+ {
+ insert_matchup (&best_match, ¤t_match,
+ l_must_match, -1,
+ 1 /* 1 = insert after current */);
+ ++ l_must_match;
+ }
+ else if ( r_must_match >= prior_right
+ && r_must_match <= r_end
+ && r_start <= r_end )
+ {
+ insert_matchup (&best_match, ¤t_match,
+ -1, r_must_match,
+ 1 /* 1 = insert after current */);
+ ++ r_must_match;
+ }
+ else
+ break;
+ if ( current_match && current_match->next )
+ current_match = current_match->next;
+ }
+ }
+
+ return (best_match);
+}
+
+static void
+insert_matchup (
+ struct line_matchup * * first_matchup,
+ struct line_matchup * * current_matchup,
+ lin left, lin right,
+ lin insert_after /* 1 = insert new after current, 0 = insert before */)
+{
+ struct line_matchup * new_record =
+ (struct line_matchup *) xmalloc (sizeof (*new_record));
+ if ( ! * first_matchup ) /* insert first record in empty list */
+ {
+ * first_matchup = * current_matchup = new_record;
+ (*current_matchup)->left = left;
+ (*current_matchup)->right = right;
+ (*current_matchup)->next = (struct line_matchup *) 0;
+ }
+ else if ( insert_after ) /* insert new record after current_matchup */
+ {
+ new_record->left = left;
+ new_record->right = right;
+ new_record->next = (*current_matchup)->next;
+ (*current_matchup)->next = new_record;
+ }
+ else /* insert new record before current_matchup */
+ {
+ /* We can't actually insert the new record before the current, since we */
+ /* don't want to walk through the linked list to find the previous */
+ /* record to point its ->next to the new record. So to achieve the same */
+ /* result, we 1) insert the new record after the current, 2) populate */
+ /* the new record with the current record's values, and 3) populate the */
+ /* current record with the new values. */
+ new_record->left = (*current_matchup)->left;
+ new_record->right = (*current_matchup)->right;
+ new_record->next = (*current_matchup)->next;
+ (*current_matchup)->left = left;
+ (*current_matchup)->right = right;
+ (*current_matchup)->next = new_record;
+ }
+}
+
+static struct line_matchup *
+find_best_match1 (
+ char const* const* leftlines, const lin l_start, const lin l_end,
+ char const* const* rightlines, const lin r_start, const lin r_end)
+{
+ int incr, max;
+ struct line_matchup * best_match, * current_match;
+ long best_score, current_score;
+
+ if ( sdiff_tighter_match )
{
- for (i = first0, j = first1; i <= last0 && j <= last1; i++, j++)
- print_1sdiff_line (&files[0].linbuf[i], '|', &files[1].linbuf[j]);
- changes = (i <= last0 ? OLD : 0) + (j <= last1 ? NEW : 0);
- next0 = first0 = i;
- next1 = first1 = j;
+ /* If we have searched past the end of either group of lines, no further */
+ /* search is possible. */
+ if ( l_start > l_end
+ || r_start > r_end )
+ return ((struct line_matchup *) 0);
+
+ best_match = (struct line_matchup *) xmalloc (sizeof (*best_match));
+ best_match->left = l_start;
+ best_match->right = r_start;
+ best_match->next =
+ find_best_match1 (
+ leftlines, l_start + 1, l_end,
+ rightlines, r_start + 1, r_end);
+ best_score = figure_score (leftlines, rightlines, best_match);
+
+ current_match = (struct line_matchup *) xmalloc (sizeof (*current_match));
+
+ max = MAX (l_end - l_start, r_end - r_start);
+ for ( incr = 1 ; incr <= max ; ++ incr )
+ {
+ int i = l_start + incr;
+ if ( i <= l_end )
+ {
+ current_match->left = i;
+ current_match->right = r_start;
+ current_match->next =
+ find_best_match1 (
+ leftlines, i + 1, l_end,
+ rightlines, r_start + 1, r_end);
+ current_score = figure_score (leftlines, rightlines, current_match);
+ if ( current_score > best_score )
+ {
+ free_line_matchup (best_match);
+ best_match = current_match;
+ best_score = current_score;
+ current_match = (struct line_matchup *) xmalloc (sizeof
(*current_match));
+ }
+ }
+ i = r_start + incr;
+ if ( i <= r_end )
+ {
+ current_match->left = l_start;
+ current_match->right = i;
+ current_match->next =
+ find_best_match1 (
+ leftlines, l_start + 1, l_end,
+ rightlines, i + 1, r_end);
+ current_score = figure_score (leftlines, rightlines, current_match);
+ if ( current_score > best_score )
+ {
+ free_line_matchup (best_match);
+ best_match = current_match;
+ best_score = current_score;
+ current_match = (struct line_matchup *) xmalloc (sizeof
(*current_match));
+ }
+ }
+ }
+ free_line_matchup (current_match);
}
+ else /* sdiff_tighter_match was not selected */
+ {
+ /* If we have searched past the end of both groups of lines, no further */
+ /* search is possible. */
+ if ( l_start > l_end
+ && r_start > r_end )
+ return ((struct line_matchup *) 0);
+
+ /* This implementation just matches up lines in order, with no scoring. */
+ /* It is essentially the same as the legacy sdiff algorithm. */
+ best_match = (struct line_matchup *) xmalloc (sizeof (*best_match));
+ best_match->left = l_start <= l_end ? l_start : -1;
+ best_match->right = r_start <= r_end ? r_start : -1;
+ best_match->next =
+ find_best_match1 (
+ leftlines, l_start <= l_end ? l_start + 1 : l_start, l_end,
+ rightlines, r_start <= r_end ? r_start + 1 : r_start, r_end);
+ }
+
+ return (best_match);
+}
- /* Print `` > xxx '' lines */
- if (changes & NEW)
+static void
+free_line_matchup (struct line_matchup * line_matchup)
+{
+ struct line_matchup * current_line_pair;
+ /* free the memory malloc'd by find_best_match() */
+ for ( current_line_pair = line_matchup ;
+ current_line_pair ;
+ /* increment routine in loop */ )
{
- for (j = first1; j <= last1; ++j)
- print_1sdiff_line (0, '>', &files[1].linbuf[j]);
- next1 = j;
+ struct line_matchup * previous_line_pair = current_line_pair;
+ current_line_pair = current_line_pair->next; /* loop increment */
+ free ((char *) previous_line_pair);
}
+}
- /* Print ``xxx < '' lines */
- if (changes & OLD)
+static lin
+figure_score (
+ char const* const* leftlines,
+ char const* const* rightlines,
+ struct line_matchup * line_matchup)
+{
+ int score = 0;
+ struct line_matchup * current_line_pair;
+ for ( current_line_pair = line_matchup ;
+ current_line_pair ;
+ current_line_pair = current_line_pair->next )
+ {
+ if ( current_line_pair->left >= 0
+ && current_line_pair->right >= 0 )
+ score += matching_chars (
+ leftlines[current_line_pair->left], (char const*) 0,
+ rightlines[current_line_pair->right], (char const*) 0);
+ }
+ return (score);
+}
+
+static lin
+matching_chars (
+ char const* str1, char const* end1,
+ char const* str2, char const* end2)
+{
+ char const * s1, * s2, * e1, * e2, * c1, * c2;
+ lin best_middle_score, current_middle_score;
+
+ /* Find matching chars at the beginning of the strings. */
+ for ( s1 = str1, s2 = str2 ;
+ *s1 == *s2 && *s1 != '\0' && *s1 != '\n' ;
+ ++ s1, ++ s2 )
+ { }
+ /* If the caller didn't supply end of string values, we find them. */
+ if ( end1 == (char const*) 0
+ || end2 == (char const*) 0 )
+ {
+ for ( end1 = s1 ; * end1 && * end1 != '\n' ; ++ end1 )
+ { } /* find end of str1 */
+ for ( end2 = s2 ; * end2 && * end2 != '\n' ; ++ end2 )
+ { } /* find end of str2 */
+ }
+ /* Find matching chars at the end of the strings */
+ e1 = end1;
+ e2 = end2;
+ while ( e1 > s1
+ && e2 > s2
+ && * (e1 - 1) == * (e2 - 1) )
+ {
+ -- e1; /* walk backward thru strings until */
+ -- e2; /* we find a mismatch char */
+ }
+ /* If the middle section has zero characters, we return the matching chars */
+ /* from the beginning and end sections. */
+ if ( e1 == s1
+ || e2 == s2 )
+ return (
+ (s1 - str1) /* matching chars at beginning of strings */
+ + (end1 - e1)); /* matching chars at end of strings */
+
+ best_middle_score = 0;
+ for ( c1 = s1 ; c1 < e1 ; ++ c1 )
+ {
+ /* If we've already found *c1 earlier in s1, this later occurrence can't */
+ /* result in a better match, so we ignore it. */
+ char const* c = strchr (s1, *c1);
+ if ( c1 && c1 < s1 )
+ continue;
+ /* If *c1 does not occur in s2, it's no match; skip it. */
+ c = strchr (s2, *c1);
+ if ( ! c || c >= e2 )
+ continue;
+ /* We found a match; score it. */
+ current_middle_score =
+ 1 /* for the char at c1 that matched. */
+ + matching_chars (c1 + 1, e1, c + 1, e2);
+ if ( current_middle_score > best_middle_score )
+ best_middle_score = current_middle_score;
+ }
+ for ( c2 = s2 + 1 ; c2 < e2 ; ++ c2 )
{
- for (i = first0; i <= last0; ++i)
- print_1sdiff_line (&files[0].linbuf[i], '<', 0);
- next0 = i;
+ /* If we've already found *c2 earlier in s2, this later occurrence can't */
+ /* result in a better match, so we ignore it. */
+ char const* c = strchr (s2, *c2);
+ if ( c2 && c2 < s2 )
+ continue;
+ /* If *c2 does not occur in s1, it's no match; skip it. */
+ c = strchr (s1, *c2);
+ if ( ! c || c >= e1 )
+ continue;
+ /* We found a match; score it. */
+ current_middle_score =
+ 1 /* for the char at c2 that matched. */
+ + matching_chars (c + 1, e1, c2 + 1, e2);
+ if ( current_middle_score > best_middle_score )
+ best_middle_score = current_middle_score;
}
+ return (
+ (s1 - str1) /* matching chars at beginning of strings */
+ + best_middle_score /* best match in middle of strings */
+ + (end1 - e1)); /* matching chars at end of strings */
}
- sdiff Enhancement,
Grant Stevens <=