--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.h 2007-06-28 21:57:19.000000000 +0300 +++ dfa.h 2007-09-03 06:30:12.000000000 +0300 @@ -364,9 +364,16 @@ on a state that potentially could do so. */ int *success; /* Table of acceptance conditions used in dfaexec and computed in build_state. */ + int *newlines; /* Transitions on newlines. The entry for a + newline in any transition table is always + -1 so we can count lines without wasting + too many cycles. The transition for a + newline is stored separately and handled + as a special case. Newline is also used + as a sentinel at the end of the buffer. */ struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ }; /* Some macros for user access to dfa internals. */ @@ -398,13 +409,18 @@ extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int)); /* Execute the given struct dfa on the buffer of characters. The - last byte of the buffer must equal the end-of-line byte. - The final argument points to a flag that will + first char * points to the beginning, and the second points to the + first character after the end of the buffer, which must be a writable + place so a sentinel end-of-buffer marker can be stored there. The + second-to-last argument is a flag telling whether to allow newlines to + be part of a string matching the regexp. The next-to-last argument, + if non-NULL, points to a place to increment every time we see a + newline. The final argument, if non-NULL, points to a flag that will be set if further examination by a backtracking matcher is needed in order to verify backreferencing; otherwise the flag will be cleared. - Returns (size_t) -1 if no match is found, or the offset of the first + Returns NULL if no match is found, or a pointer to the first character after the first & shortest matching string in the buffer. */ -extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *)); +extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *, int *)); /* Free the storage held by the components of a struct dfa. */ extern void dfafree PARAMS ((struct dfa *)); --- /usr/local/src/Gnu/grep-2.5.3/src/dfa.c 2007-06-28 21:57:19.000000000 +0300 +++ dfa.c 2007-09-03 06:30:12.000000000 +0300 @@ -2335,6 +2382,7 @@ d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2342,7 +2390,9 @@ } } - /* Newline is a sentinel. */ + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + d->newlines[s] = trans[eolbyte]; trans[eolbyte] = -1; if (ACCEPTING(s, *d)) @@ -2360,6 +2410,7 @@ d->trans = d->realtrans + 1; CALLOC(d->fails, int *, d->tralloc); MALLOC(d->success, int, d->tralloc); + MALLOC(d->newlines, int, d->tralloc); build_state(0, d); } @@ -2378,13 +2429,13 @@ { \ while (inputwcs[p - buf_begin] == 0 \ && mblen_buf[p - buf_begin] > 0 \ - && p < buf_end) \ + && (unsigned char const *)p < buf_end) \ ++p; \ - if (p >= end) \ + if ((char *)p >= end) \ { \ free(mblen_buf); \ free(inputwcs); \ - return (size_t) -1; \ + return NULL; \ } \ } @@ -2403,6 +2454,7 @@ d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2791,19 +2843,23 @@ /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, - and the shortest possible version thereof. Return the offset of the first - character after the match, or (size_t) -1 if none is found. BEGIN points to - the beginning of the buffer, and SIZE is the size of the buffer. If SIZE - is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place + and the shortest possible version thereof. Return a pointer to the first + character after the match, or NULL if none is found. Begin points to + the beginning of the buffer, and end points to the first character after + its end. We store a newline in *end to act as a sentinel, so end had + better point somewhere valid. Newline is a flag indicating whether to + allow newlines to be in the matching string. If count is non- + NULL it points to a place we're supposed to increment every time we + see a newline. Finally, if backref is non-NULL it points to a place where we're supposed to store a 1 if backreferencing happened and the match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ -size_t -dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) +char * +dfaexec (struct dfa *d, char const *begin, char *end, + int newline, int *count, int *backref) { - register int s; /* Current state. */ + register int s, s1, tmp; /* Current state. */ register unsigned char const *p; /* Current input character. */ - register unsigned char const *end; /* One past the last input character. */ register int **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ @@ -2823,10 +2879,10 @@ if (! d->tralloc) build_state_zero(d); - s = 0; + s = s1 = 0; p = (unsigned char const *) begin; - end = p + size; trans = d->trans; + *end = eol; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) @@ -2836,18 +2892,18 @@ buf_end = end; /* initialize mblen_buf, and inputwcs. */ - MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); - MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); + MALLOC(mblen_buf, unsigned char, end - begin + 2); + MALLOC(inputwcs, wchar_t, end - begin + 2); memset(&mbs, 0, sizeof(mbstate_t)); remain_bytes = 0; - for (i = 0; i < end - (unsigned char const *)begin + 1; i++) + for (i = 0; i < end - begin + 1; i++) { if (remain_bytes == 0) { remain_bytes - = mbrtowc(inputwcs + i, begin + i, - end - (unsigned char const *)begin - i + 1, &mbs); - if (remain_bytes <= 1) + = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); + if (remain_bytes < 1 + || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) { remain_bytes = 0; inputwcs[i] = (wchar_t)begin[i]; @@ -2877,6 +2933,9 @@ if (MB_CUR_MAX > 1) while ((t = trans[s])) { + if ((char *) p > end) + break; + s1 = s; if (d->states[s].mbps.nelem != 0) { /* Can match with a multibyte character (and multi character @@ -2887,7 +2946,7 @@ nextp = p; s = transit_state(d, s, &nextp); - p = nextp; + p = (unsigned char *)nextp; /* Trans table might be updated. */ trans = d->trans; @@ -2900,25 +2959,16 @@ } else #endif /* MBS_SUPPORT */ - while ((t = trans[s])) - s = t[*p++]; - - if (s < 0) - { - if (p == end) - { -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - free(mblen_buf); - free(inputwcs); - } -#endif /* MBS_SUPPORT */ - return (size_t) -1; - } - s = 0; + while ((t = trans[s]) != 0) { /* hand-optimized loop */ + s1 = t[*p++]; + if ((t = trans[s1]) == 0) { + tmp = s ; s = s1 ; s1 = tmp ; /* swap */ + break; } - else if ((t = d->fails[s])) + s = t[*p++]; + } + + if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) { if (d->success[s] & sbit[*p]) { @@ -2931,37 +2981,58 @@ free(inputwcs); } #endif /* MBS_SUPPORT */ - return (char const *) p - begin; + return (char *) p; } + s1 = s; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { - SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); - if (d->states[s].mbps.nelem != 0) - { - /* Can match with a multibyte character (and multi - character collating element). */ unsigned char const *nextp; nextp = p; s = transit_state(d, s, &nextp); - p = nextp; + p = (unsigned char *)nextp; /* Trans table might be updated. */ trans = d->trans; - } - else - s = t[*p++]; } else #endif /* MBS_SUPPORT */ - s = t[*p++]; + s = d->fails[s][*p++]; + continue; } - else + + /* If the previous character was a newline, count it. */ + if (count && (char *) p <= end && p[-1] == eol) + ++*count; + + /* Check if we've run off the end of the buffer. */ + if ((char *) p > end) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return NULL; + } + + if (s >= 0) { build_state(s, d); trans = d->trans; + continue; } + + if (p[-1] == eol && newline) + { + s = d->newlines[s1]; + continue; + } + + s = 0; } } @@ -2992,13 +3063,14 @@ d->tralloc = 0; d->musts = 0; + d->newlines = 0; } /* Parse and analyze a single string of the given length. */ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { if (case_fold) /* dummy folding in service of dfamust() */ { char *lcopy; int i; @@ -3088,6 +3171,7 @@ free((ptr_t) d->fails[i]); if (d->realtrans) free((ptr_t) d->realtrans); if (d->fails) free((ptr_t) d->fails); + if (d->newlines) free((ptr_t) d->newlines); if (d->success) free((ptr_t) d->success); for (dm = d->musts; dm; dm = ndm) {