classpath-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated


From: Ito Kazumitsu
Subject: [cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated
Date: Thu, 19 Jan 2006 23:22:52 +0900 (JST)

From: Ito Kazumitsu <address@hidden>
Subject: Re: [cp-patches] RFC: gnu.regexp fix to avoid unwanted 
PatternSyntaxException
Date: Thu, 19 Jan 2006 01:39:52 +0900 (JST)

> From: Ito Kazumitsu <address@hidden>
> Date: Thu, 05 Jan 2006 23:47:03 +0900 (JST)
> 
> > +       // doables.index == lastIndex means an empty string
> > +       // was the longest that matched this token.
> > +       // We break here, otherwise we will fall into an endless loop.
> 
> Studying various cases, I have found that this comment is not
> always true.

And this is my fix.

ChangeLog
2006-01-19  Ito Kazumitsu  <address@hidden>

        * gnu/regexp/REMatch.java(empty): New boolean indicating
        an empty string matched.
        * gnu/regexp/RE.java(match): Sets empty flag when an empty
        string matched.
        * gnu/regexp/RETokenRepeated.java(match): Sets empty flag
        when an empty string matched. Fixed a bug of the case where
        an empty string matched.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.11
diff -u -r1.11 RE.java
--- classpath/gnu/regexp/RE.java        19 Jan 2006 13:45:51 -0000      1.11
+++ classpath/gnu/regexp/RE.java        19 Jan 2006 14:20:20 -0000
@@ -1012,7 +1012,7 @@
    * "a"      : 'a' itself.
    * "\0123"  : Octal char 0123
    * "\x1b"   : Hex char 0x1b
-   * "\u1234" : Unicode char \u1234
+   * "\u1234  : Unicode char \u1234
    */
   private static class CharExpression {
     /** character represented by this expression */
@@ -1246,12 +1246,20 @@
   
     /* Implements abstract method REToken.match() */
     boolean match(CharIndexed input, REMatch mymatch) { 
-       if (firstToken == null) return next(input, mymatch);
+       int origin = mymatch.index;
+       boolean b;
+       if (firstToken == null) {
+           b = next(input, mymatch);
+           if (b) mymatch.empty = (mymatch.index == origin);
+           return b;
+       }
 
        // Note the start of this subexpression
        mymatch.start[subIndex] = mymatch.index;
 
-       return firstToken.match(input, mymatch);
+       b = firstToken.match(input, mymatch);
+       if (b) mymatch.empty = (mymatch.index == origin);
+       return b;
     }
   
   /**
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.2
diff -u -r1.2 REMatch.java
--- classpath/gnu/regexp/REMatch.java   2 Jul 2005 20:32:15 -0000       1.2
+++ classpath/gnu/regexp/REMatch.java   19 Jan 2006 14:20:20 -0000
@@ -67,6 +67,7 @@
     int[] start; // start positions (relative to offset) for each (sub)exp.
     int[] end;   // end positions for the same
     REMatch next; // other possibility (to avoid having to use arrays)
+    boolean empty; // empty string matched
 
     public Object clone() {
        try {
@@ -88,6 +89,7 @@
        index = other.index;
        // need to deep clone?
        next = other.next;
+       empty = other.empty;
     }
 
     REMatch(int subs, int anchor, int eflags) {
@@ -124,6 +126,7 @@
            start[i] = end[i] = -1;
        }
        next = null; // cut off alternates
+       empty = false;
     }
     
     /**
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   8 Jan 2006 23:06:43 -0000       
1.5
+++ classpath/gnu/regexp/RETokenRepeated.java   19 Jan 2006 14:20:20 -0000
@@ -91,6 +91,7 @@
     // the subexpression back-reference operator allow that?
 
     boolean match(CharIndexed input, REMatch mymatch) {
+       int origin = mymatch.index;
        // number of times we've matched so far
        int numRepeats = 0; 
        
@@ -116,6 +117,7 @@
                REMatch result = matchRest(input, newMatch);
                if (result != null) {
                    mymatch.assignFrom(result);
+                   mymatch.empty = (mymatch.index == origin);
                    return true;
                }
            }
@@ -153,12 +155,43 @@
            
            positions.addElement(newMatch);
 
-           // doables.index == lastIndex means an empty string
-           // was the longest that matched this token.
-           // We break here, otherwise we will fall into an endless loop.
+           // doables.index == lastIndex occurs either
+           //   (1) when an empty string was the longest
+           //       that matched this token.
+           //       And this case occurs either
+           //         (1-1) when this token is always empty,
+           //               for example "()" or "(())".
+           //         (1-2) when this token is not always empty
+           //               but can match an empty string, for example,
+           //               "a*", "(a|)".
+           // or
+           //   (2) when the same string matches this token many times.
+           //       For example, "acbab" itself matches "a.*b" and
+           //       its substrings "acb" and "ab" also match.
            if (doables.index == lastIndex) {
-               if (numRepeats < min) numRepeats = min;
-               break;
+               if (doables.empty) {
+                   // Case (1): We break here, otherwise we will fall
+                   //          into an endless loop.
+                   if (numRepeats < min) numRepeats = min;
+                   break;
+               }
+               else {
+                   // Case (2): We cannot break here because, for example,
+                   // "acbacb" matches "a.*b" up to 2 times but
+                   // not 3 times.  So we have to check numRepeats >= min.
+                   // But we do not have to go further until numRepeats == max
+                   // because the more numRepeats grows, the shorter the
+                   // substring matching this token becomes. 
+                   if (numRepeats > min) {
+                       // This means the previous match was successful,
+                       // and that must be the best match.  This match
+                       // resulted in shortening the matched substring.
+                       numRepeats--;
+                       positions.remove(positions.size() - 1);
+                       break;
+                   }
+                   if (numRepeats == min) break;
+               }
            }           
            lastIndex = doables.index;
        } while (numRepeats < max);
@@ -207,6 +240,7 @@
        }
        if (allResults != null) {
            mymatch.assignFrom(allResults); // does this get all?
+           mymatch.empty = (mymatch.index == origin);
            return true;
        }
        // If we fall out, no matches.

reply via email to

[Prev in Thread] Current Thread [Next in Thread]