help-smalltalk
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Help-smalltalk] [rfc] regex rewrite


From: Mike Anderson
Subject: Re: [Help-smalltalk] [rfc] regex rewrite
Date: Thu, 02 Jun 2005 21:01:35 +0000
User-agent: Mozilla Thunderbird 0.7.3 (X11/20040803)

Paolo Bonzini wrote:

Both Perl and Ruby return values that can be interpreted as true/false from =~, but both of them also capture the matched expression and sub-expressions into variables. We don't have that possibility, so I think =~ should work harder.

That's fine with me, if we add #matched, #ifMatched:, etc. methods to RegexResults as in your previous suggestion, that avoid the unintuitive #isNil. To have a bit more efficiency, the RegexResults object for not-matched is a singleton; this means that RegexResults is now at the head of a small class hierarchy comprising MatchingRegexResults, and FailedMatchRegexResults.

I think this works very nicely, and is elegant too.

My final concern is about modifiers. The library allows for them, and this patch enables the case sensitivity modifier. I can't get any joy out of /m or /s, however (I haven't tried /x).

The last patch I sent was wrong, as you must have noticed. I don't *think* I made any foolish mistakes with this one.

Mike
diff -ur orig/examples/re.c mod/examples/re.c
--- orig/examples/re.c  2005-05-27 19:28:20.000000000 +0000
+++ mod/examples/re.c   2005-06-02 20:15:40.399785264 +0000
@@ -61,7 +61,7 @@
 }
 RegexCacheEntry;
 
-static RegexCaching lookupRegex (OOP patternOOP,
+static RegexCaching lookupRegex (OOP patternOOP, long options,
                                 struct pre_pattern_buffer **pRegex);
 static const char *compileRegex (OOP patternOOP,
                                 struct pre_pattern_buffer *regex);
@@ -69,11 +69,11 @@
 static void markRegexAsMRU (int i);
 
 /* Functions exported to Smalltalk */
-static OOP reh_make_cacheable (OOP patternOOP);
+static OOP reh_make_cacheable (OOP patternOOP, char *options_string);
 
 static struct pre_registers *reh_search (OOP srcOOP, OOP patternOOP,
-                                        int from, int to);
-static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to);
+                                        int from, int to, char *options_string);
+static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to, char *options_string);
 static void reh_free_registers(struct pre_registers *regs);
 
 static RegexCacheEntry cache[REGEX_CACHE_SIZE];
@@ -81,6 +81,58 @@
 /* Smalltalk globals */
 static OOP regexClass;
 
+/* From Ruby's re.c */
+#if 'a' == 97   /* it's ascii */
+static const char casetable[] = {
+       '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+       '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+       '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+       '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+       /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
+       '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+       /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
+       '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+       /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
+       '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+       /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
+       '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+       /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
+       '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+       /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
+       '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+       /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
+       '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+       /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
+       '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+       /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
+       '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+       /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
+       '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+       /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
+       '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+       /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
+       '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+       '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+       '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+       '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+       '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+       '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+       '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+       '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+       '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+       '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
+       '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
+       '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
+       '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
+       '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+       '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+       '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+       '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+};
+#else
+       # error >>> "You lose. You will need a translation table for your character set." <<<
+#endif
+
 /* Allocate a buffer to be passed to the regular expression matcher */
 struct pre_pattern_buffer *
 allocateNewRegex (void)
@@ -136,7 +188,7 @@
  * caller will also have to free the buffer pointed to by pRegex.
  */
 RegexCaching
-lookupRegex (OOP patternOOP, struct pre_pattern_buffer **pRegex)
+lookupRegex (OOP patternOOP, long options, struct pre_pattern_buffer **pRegex)
 {
   int i;
   RegexCaching result;
@@ -149,7 +201,7 @@
 
   /* Search for the Regex object in the cache */
   for (i = 0; i < REGEX_CACHE_SIZE; i++)
-    if (cache[i].patternOOP == patternOOP)
+    if ((cache[i].patternOOP == patternOOP) && (cache[i].regex->options == options))
       break;
 
   if (i < REGEX_CACHE_SIZE)
@@ -170,14 +222,39 @@
     }
 
   /* Mark the object as most recently used */
-  if (!cache[i].regex)
-    cache[i].regex = allocateNewRegex ();
+  if (!cache[i].regex) 
+    {
+      cache[i].regex = allocateNewRegex ();
+         cache[i].regex -> options = options;
+    }
 
   markRegexAsMRU (i);
   *pRegex = cache[0].regex;
   return result;
 }
 
+long   /* Translate a modifier string such as "im" into an OR of PRE_OPTION_* flags. */
+translate_options_string(char *options_string)
+{      /* NULL or "" yields 0; letters other than i, x, m, p, s are ignored. */
+       const char *p = options_string;
+       char c;
+       long result = 0;
+       if (!options_string)
+               return result;
+       while((c = *p++) != 0)
+       {
+               switch (c)
+               {       /* every case must break: falling through would also set all flags listed below it */
+                       case 'i' : result |= PRE_OPTION_IGNORECASE; break;
+                       case 'x' : result |= PRE_OPTION_EXTENDED; break;
+                       case 'm' : result |= PRE_OPTION_MULTILINE; break;
+                       case 'p' : result |= PRE_OPTION_POSIXLINE; break;
+                       case 's' : result |= PRE_OPTION_SINGLELINE; break;
+               }
+       }
+       return result;
+}
+
 /* Create a Regex object.  We look for one that points to the same string
  * in the cache (so that we can optimize a loop that repeatedly calls
  * asRegex; if none is found, we create one ex-novo.
@@ -185,7 +262,7 @@
  * are read-only so that we can support this kind of "interning" them.
  */
 OOP
-reh_make_cacheable (OOP patternOOP)
+reh_make_cacheable (OOP patternOOP, char *options_string)
 {
   OOP regexOOP;
   const char *pattern;
@@ -224,7 +301,7 @@
   /* Put it in the cache (we must compile it to check that it
    * is well-formed).
    */
-  lookupRegex (regexOOP, &compiled);
+  lookupRegex (regexOOP, translate_options_string(options_string), &compiled);
   if (compileRegex (patternOOP, compiled) != NULL)
     return vmProxy->nilOOP;
   else
@@ -233,7 +310,7 @@
 
 /* Search helper function */
 struct pre_registers *
-reh_search (OOP srcOOP, OOP patternOOP, int from, int to)
+reh_search (OOP srcOOP, OOP patternOOP, int from, int to, char *options_string)
 {
   int res = 0;
   const char *src;
@@ -241,9 +318,9 @@
   struct pre_registers *regs;
   RegexCaching caching;
 
-  caching = lookupRegex (patternOOP, &regex);
+  caching = lookupRegex (patternOOP, translate_options_string(options_string), &regex);
   if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
-    return NULL;
+         return NULL;
 
   /* now search */
   src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
@@ -265,14 +342,14 @@
 
 /* Match helper function */
 int
-reh_match (OOP srcOOP, OOP patternOOP, int from, int to)
+reh_match (OOP srcOOP, OOP patternOOP, int from, int to, char *options_string)
 {
   int res = 0;
   const char *src;
   struct pre_pattern_buffer *regex;
   RegexCaching caching;
-
-  caching = lookupRegex (patternOOP, &regex);
+  
+  caching = lookupRegex (patternOOP, translate_options_string(options_string), &regex);
   if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
     return -100;
 
@@ -289,6 +366,7 @@
 void
 gst_initModule (VMProxy * proxy)
 {
+  pre_set_casetable(casetable);
   vmProxy = proxy;
   vmProxy->defineCFunc ("reh_search", reh_search);
   vmProxy->defineCFunc ("reh_match", reh_match);
diff -ur orig/examples/regex.st mod/examples/regex.st
--- orig/examples/regex.st      2005-06-01 08:46:37.000000000 +0000
+++ mod/examples/regex.st       2005-06-02 20:05:36.411605424 +0000
@@ -223,23 +223,23 @@
 
 Regex class
        defineCFunc: 'reh_make_cacheable'
-       withSelectorArgs: 'fromString: aString'
+       withSelectorArgs: 'fromString: aPatternString options: aOptionsString'
        returning: #smalltalk
-       args: #(#smalltalk).
+       args: #(#smalltalk #string).
 !
 
 String 
        defineCFunc: 'reh_match' 
-       withSelectorArgs: 'lengthOfRegexMatch: pattern from: from to: to'
+       withSelectorArgs: 'lengthOfRegexMatch: pattern from: from to: to options: aOptionsString'
        returning: #int
-       args: #(#selfSmalltalk #smalltalk #int #int)
+       args: #(#selfSmalltalk #smalltalk #int #int #string)
 !
 
 String 
        defineCFunc: 'reh_search'
-       withSelectorArgs: 'searchRegexInternal: pattern from: from to: to'
+       withSelectorArgs: 'searchRegexInternal: pattern from: from to: to options: aOptionsString'
        returning: CRegexRegisters type
-       args: #(#selfSmalltalk #smalltalk #int #int)
+       args: #(#selfSmalltalk #smalltalk #int #int #string)
 !
 !
 
@@ -339,12 +339,21 @@
        aStream nextPut: each.
     ].
     aStream nextPut: $/.
-! !
+! 
+!
 
 "--------------------------------------------------------------------------"
 
 !String methodsFor: 'regex'!
 
+searchRegexInternal: pattern from: from to: to
+       ^self searchRegexInternal: pattern from: from to: to options: ''
+!
+
+lengthOfRegexMatch: pattern from: from to: to
+       ^self lengthOfRegexMatch: pattern from: from to: to options: ''
+!
+
 asRegex
     "Answer the receiver, converted to a Regex object."
     ^Regex fromString: self
@@ -381,6 +390,20 @@
        ]
 !
 
+searchRegex: pattern options: options
+    | regs |
+    regs := self searchRegexInternal: pattern from: 1 to: self size options: options.
+    ^regs matchBeg = -1
+       ifTrue: [
+           regs free.
+           FailedMatchRegexResults notMatched
+       ]
+       ifFalse: [
+           [ MatchingRegexResults new initialize: regs subject: self ]
+               ensure: [ regs free ]
+       ]
+!
+
 searchRegex: pattern startingAt: anIndex
     | regs |
     regs := self searchRegexInternal: pattern from: anIndex to: self size.
@@ -480,6 +503,14 @@
     ^(self lengthOfRegexMatch: pattern from: 1 to: self size) = self size
 !
 
+matchRegex: pattern ignoreCase: aBoolean
+    ^(self 
+               lengthOfRegexMatch: pattern 
+               from: 1 
+               to: self size 
+               options: (aBoolean ifTrue: [ 'i' ] ifFalse: [ '' ])) = self size
+!
+
 matchRegex: pattern startingAt: idx
     ^(self lengthOfRegexMatch: pattern from: idx to: self size) > 0
 !

reply via email to

[Prev in Thread] Current Thread [Next in Thread]