[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated
From: |
Ito Kazumitsu |
Subject: |
[cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated |
Date: |
Thu, 19 Jan 2006 23:22:52 +0900 (JST) |
From: Ito Kazumitsu <address@hidden>
Subject: Re: [cp-patches] RFC: gnu.regexp fix to avoid unwanted
PatternSyntaxException
Date: Thu, 19 Jan 2006 01:39:52 +0900 (JST)
> From: Ito Kazumitsu <address@hidden>
> Date: Thu, 05 Jan 2006 23:47:03 +0900 (JST)
>
> > + // doables.index == lastIndex means an empty string
> > + // was the longest that matched this token.
> > + // We break here, otherwise we will fall into an endless loop.
>
> Studying various cases, I have found that this comment is not
> always true.
And this is my fix.
ChangeLog
2006-01-19 Ito Kazumitsu <address@hidden>
* gnu/regexp/REMatch.java(empty): New boolean indicating
an empty string matched.
* gnu/regexp/RE.java(match): Sets empty flag when an empty
string matched.
* gnu/regexp/RETokenRepeated.java(match): Sets empty flag
when an empty string matched. Fixed a bug of the case where
an empty string matched.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.11
diff -u -r1.11 RE.java
--- classpath/gnu/regexp/RE.java 19 Jan 2006 13:45:51 -0000 1.11
+++ classpath/gnu/regexp/RE.java 19 Jan 2006 14:20:20 -0000
@@ -1012,7 +1012,7 @@
* "a" : 'a' itself.
* "\0123" : Octal char 0123
* "\x1b" : Hex char 0x1b
- * "\u1234" : Unicode char \u1234
+ * "\u1234 : Unicode char \u1234
*/
private static class CharExpression {
/** character represented by this expression */
@@ -1246,12 +1246,20 @@
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- if (firstToken == null) return next(input, mymatch);
+ int origin = mymatch.index;
+ boolean b;
+ if (firstToken == null) {
+ b = next(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
+ }
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
- return firstToken.match(input, mymatch);
+ b = firstToken.match(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
}
/**
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.2
diff -u -r1.2 REMatch.java
--- classpath/gnu/regexp/REMatch.java 2 Jul 2005 20:32:15 -0000 1.2
+++ classpath/gnu/regexp/REMatch.java 19 Jan 2006 14:20:20 -0000
@@ -67,6 +67,7 @@
int[] start; // start positions (relative to offset) for each (sub)exp.
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
+ boolean empty; // empty string matched
public Object clone() {
try {
@@ -88,6 +89,7 @@
index = other.index;
// need to deep clone?
next = other.next;
+ empty = other.empty;
}
REMatch(int subs, int anchor, int eflags) {
@@ -124,6 +126,7 @@
start[i] = end[i] = -1;
}
next = null; // cut off alternates
+ empty = false;
}
/**
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 8 Jan 2006 23:06:43 -0000
1.5
+++ classpath/gnu/regexp/RETokenRepeated.java 19 Jan 2006 14:20:20 -0000
@@ -91,6 +91,7 @@
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
+ int origin = mymatch.index;
// number of times we've matched so far
int numRepeats = 0;
@@ -116,6 +117,7 @@
REMatch result = matchRest(input, newMatch);
if (result != null) {
mymatch.assignFrom(result);
+ mymatch.empty = (mymatch.index == origin);
return true;
}
}
@@ -153,12 +155,43 @@
positions.addElement(newMatch);
- // doables.index == lastIndex means an empty string
- // was the longest that matched this token.
- // We break here, otherwise we will fall into an endless loop.
+ // doables.index == lastIndex occurs either
+ // (1) when an empty string was the longest
+ // that matched this token.
+ // And this case occurs either
+ // (1-1) when this token is always empty,
+ // for example "()" or "(())".
+ // (1-2) when this token is not always empty
+ // but can match an empty string, for example,
+ // "a*", "(a|)".
+ // or
+ // (2) when the same string matches this token many times.
+ // For example, "acbab" itself matches "a.*b" and
+ // its substrings "acb" and "ab" also match.
if (doables.index == lastIndex) {
- if (numRepeats < min) numRepeats = min;
- break;
+ if (doables.empty) {
+ // Case (1): We break here, otherwise we will fall
+ // into an endless loop.
+ if (numRepeats < min) numRepeats = min;
+ break;
+ }
+ else {
+ // Case (2): We cannot break here because, for example,
+ // "acbacb" matches "a.*b" up to 2 times but
+ // not 3 times. So we have to check numRepeats >= min.
+ // But we do not have to go further until numRepeats == max
+ // because the more numRepeats grows, the shorter the
+ // substring matching this token becomes.
+ if (numRepeats > min) {
+ // This means the previous match was successful,
+ // and that must be the best match. This match
+ // resulted in shortening the matched substring.
+ numRepeats--;
+ positions.remove(positions.size() - 1);
+ break;
+ }
+ if (numRepeats == min) break;
+ }
}
lastIndex = doables.index;
} while (numRepeats < max);
@@ -207,6 +240,7 @@
}
if (allResults != null) {
mymatch.assignFrom(allResults); // does this get all?
+ mymatch.empty = (mymatch.index == origin);
return true;
}
// If we fall out, no matches.