/* fsaparse -- Build a structure naming relationships (sequences, alternatives, backreferences, options and precedence) of tokens Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ /* 2014: Repackaged by "untangle" script, written by behoffski. */ /* This function receives a stream of tokens from fsalex, and processes them to impose precedence rules and to describe complex pattern elements that are beyond the capability of the simple lexer. In addition to the cases explicit in the syntax (e.g."(ab|c)", variable-length multibyte encodings (UTF-8; codesets including modifiers and/or shift items) also require these enhanced facilities. */ /* Always import environment-specific configuration items first. */ #include #include #include "charclass.h" #include "fsaparse.h" #include "fsalex.h" #include "fsatoken.h" #include "proto-lexparse.h" #include #include "xalloc.h" /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */ #include "gettext.h" #define _(str) gettext (str) #include #include /* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */ #define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \ do \ { \ if ((n_alloc) <= (n_required)) \ { \ size_t new_n_alloc = (n_required) + !(p); \ (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \ (n_alloc) = new_n_alloc; \ } \ } \ while (0) #if HAVE_LANGINFO_CODESET # include #endif /* ?? Sigh... wanted to keep multibyte code in fsaPARSE(!) to a minimum, but a LOT of code breaks if struct mb_char_classes isn't defined. */ /* A bracket operator. e.g., [a-c], [[:alpha:]], etc. */ struct mb_char_classes { ptrdiff_t cset; bool invert; wchar_t *chars; /* Normal characters. */ size_t nchars; wctype_t *ch_classes; /* Character classes. */ size_t nch_classes; wchar_t *range_sts; /* Range characters (start of the range). */ wchar_t *range_ends; /* Range characters (end of the range). */ size_t nranges; char **equivs; /* Equivalence classes. */ size_t nequivs; char **coll_elems; size_t ncoll_elems; /* Collating elements. */ }; /* fsaparse_ctxt: Gather all the context to do with the parser into a single struct. We do this mainly because it make it easier to contemplate having multiple instances of this module running in parallel, but also because it makes translating from "dfa->" easier. This definition fleshes out the opaque type given in the module header. */ struct fsaparse_ctxt_struct { /* Warning and abot functions provided by client. */ fsalex_warn_callback_fn *warn_client; fsalex_error_callback_fn *abandon_with_error; /* Plug-in functions and context to deal with lexer at arm's length. */ proto_lexparse_lex_fn_t *lexer; proto_lexparse_exchange_fn_t *lex_exchange; void *lex_context; /* Information about locale (needs to sync with lexer...?) */ bool multibyte_locale; bool unibyte_locale; fsatoken_token_t lookahead_token; size_t current_depth; /* Current depth of a hypothetical stack holding deferred productions. This is used to determine the depth that will be required of the real stack later on in dfaanalyze. */ /* Fields filled by the parser. */ fsatoken_token_t *tokens; /* Postfix parse array. */ size_t tindex; /* Index for adding new tokens. */ size_t talloc; /* Number of tokens currently allocated. */ size_t depth; /* Depth required of an evaluation stack used for depth-first traversal of the parse tree. */ size_t nleaves; /* Number of leaves on the parse tree. */ size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */ fsatoken_token_t utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ /* The following are used only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. if tokens[i] < NOTCHAR bit 0 : tokens[i] is the first byte of a character, including single-byte characters. bit 1 : tokens[i] is the last byte of a character, including single-byte characters. if tokens[i] = MBCSET ("the index of mbcsets corresponding to this operator" << 2) + 3 e.g. tokens = 'single_byte_a', 'multi_byte_A', single_byte_b' = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b' multibyte_prop = 3 , 1 , 0 , 2 , 3 */ size_t nmultibyte_prop; int *multibyte_prop; /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; size_t nmbcsets; size_t mbcsets_alloc; }; /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ static int using_utf8 (void) { static int utf8 = -1; if (utf8 < 0) { wchar_t wc; mbstate_t mbs = { 0 }; utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; } return utf8; } /* Recursive descent parser for regular expressions. */ static void addtok_mb (fsaparse_ctxt_t *parser, fsatoken_token_t t, int mbprop) { if (parser->multibyte_locale) { REALLOC_IF_NECESSARY (parser->multibyte_prop, parser->nmultibyte_prop, parser->tindex + 1); parser->multibyte_prop[parser->tindex] = mbprop; } REALLOC_IF_NECESSARY (parser->tokens, parser->talloc, parser->tindex + 1); parser->tokens[parser->tindex++] = t; switch (t) { case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: break; case FSATOKEN_TK_CAT: case FSATOKEN_TK_OR: --parser->current_depth; break; default: ++parser->nleaves; case FSATOKEN_TK_EMPTY: ++parser->current_depth; break; } if (parser->depth < parser->current_depth) parser->depth = parser->current_depth; } static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc); /* Add the given token to the parse tree, maintaining the depth count and updating the maximum depth if necessary. */ static void addtok (fsaparse_ctxt_t *parser, fsatoken_token_t t) { if (parser->multibyte_locale && t == FSATOKEN_TK_MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &parser->mbcsets[parser->nmbcsets - 1]; /* Extract wide characters into alternations for better performance. This does not require UTF-8. */ if (!work_mbc->invert) { size_t i; for (i = 0; i < work_mbc->nchars; i++) { addtok_wc (parser, work_mbc->chars[i]); if (need_or) addtok (parser, FSATOKEN_TK_OR); need_or = true; } work_mbc->nchars = 0; } /* If the FSATOKEN_TK_MBCSET is non-inverted and doesn't include neither character classes including multibyte characters, range expressions, equivalence classes nor collating elements, it can be replaced to a simple FSATOKEN_TK_CSET. */ if (work_mbc->invert || work_mbc->nch_classes != 0 || work_mbc->nranges != 0 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0) { addtok_mb (parser, FSATOKEN_TK_MBCSET, ((parser->nmbcsets - 1) << 2) + 3); if (need_or) addtok (parser, FSATOKEN_TK_OR); } else { /* Characters have been handled above, so it is possible that the mbcset is empty now. Do nothing in that case. */ if (work_mbc->cset != -1) { addtok (parser, FSATOKEN_TK_CSET + work_mbc->cset); if (need_or) addtok (parser, FSATOKEN_TK_OR); } } } else { addtok_mb (parser, t, 3); } } /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. e.g., we construct the following tree from "". */ static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc) { unsigned char buf[MB_LEN_MAX]; mbstate_t s = { 0 }; int i; int cur_mb_len; size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ cur_mb_len = 1; buf[0] = 0; } addtok_mb (parser, buf[0], cur_mb_len == 1 ? 3 : 1); for (i = 1; i < cur_mb_len; i++) { addtok_mb (parser, buf[i], i == cur_mb_len - 1 ? 2 : 0); addtok (parser, FSATOKEN_TK_CAT); } } static void add_utf8_anychar (fsaparse_ctxt_t *parser) { unsigned int i; /* Have we set up the classes for the 1-byte to 4-byte sequence types? */ if (parser->utf8_anychar_classes[0] == 0) { /* No, first time we've been called, so set them up now. */ charclass_t *ccl; const charclass_t *dotclass; /* Index 0: 80-bf -- Non-leading bytes. */ ccl = charclass_alloc (); charclass_setbit_range (0x80, 0xbf, ccl); parser->utf8_anychar_classes[0] = charclass_finalise (ccl); /* Index 1: 00-7f -- 1-byte leading seq, minus dotclass exceptions. */ ccl = charclass_alloc (); charclass_setbit_range (0x00, 0x7f, ccl); fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_DOTCLASS, &dotclass); charclass_intersectset (dotclass, ccl); parser->utf8_anychar_classes[1] = charclass_finalise (ccl); /* Index 2: c2-df -- 2-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xc2, 0xdf, ccl); parser->utf8_anychar_classes[2] = charclass_finalise (ccl); /* Index 2: e0-ef -- 3-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xe0, 0xef, ccl); parser->utf8_anychar_classes[3] = charclass_finalise (ccl); /* Index 2: f0-f7 -- 4-byte sequence. */ ccl = charclass_alloc (); charclass_setbit_range (0xf0, 0xf7, ccl); parser->utf8_anychar_classes[4] = charclass_finalise (ccl); } /* A valid UTF-8 character is ([0x00-0x7f] |[0xc2-0xdf][0x80-0xbf] |[0xe0-0xef[0x80-0xbf][0x80-0xbf] |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f] and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ /* Write out leaf tokens for each of the four possible starting bytes. */ for (i = 1; i < 5; i++) addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[i]); /* Add follow-on classes, plus tokens to build a postfix tree covering all four alternatives of valid UTF-8 sequences. */ for (i = 1; i <= 3; i++) { addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[0]); addtok (parser, FSATOKEN_TK_CAT); addtok (parser, FSATOKEN_TK_OR); } } /* The grammar understood by the parser is as follows. regexp: regexp FSATOKEN_TK_OR branch branch branch: branch closure closure closure: closure FSATOKEN_TK_QMARK closure FSATOKEN_TK_STAR closure FSATOKEN_TK_PLUS closure FSATOKEN_TK_REPMN atom atom: FSATOKEN_TK_ANYCHAR FSATOKEN_TK_MBCSET FSATOKEN_TK_CSET FSATOKEN_TK_BACKREF FSATOKEN_TK_BEGLINE FSATOKEN_TK_ENDLINE FSATOKEN_TK_BEGWORD FSATOKEN_TK_ENDWORD FSATOKEN_TK_LIMWORD FSATOKEN_TK_NOTLIMWORD FSATOKEN_TK_LPAREN regexp FSATOKEN_TK_RPAREN The parser builds a parse tree in postfix form in an array of tokens. */ /* Provide a forward declaration for regexp, as it is at the top of the parse tree, but is referenced by atom, at the bottom of the tree. */ static void regexp (fsaparse_ctxt_t *parser); static void atom (fsaparse_ctxt_t *parser) { fsatoken_token_t tok = parser->lookahead_token; if (tok == FSATOKEN_TK_WCHAR) { wchar_t wctok; int i, n; wchar_t folded[FSALEX_CASE_FOLDED_BUFSIZE]; fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_WIDE_CHAR, &wctok); addtok_wc (parser, wctok); n = fsalex_case_folded_counterparts (parser->lex_context, wctok, folded); for (i = 0; i < n; i++) { addtok_wc (parser, folded[i]); addtok (parser, FSATOKEN_TK_OR); } parser->lookahead_token = parser->lexer (parser->lex_context); } else if (tok == FSATOKEN_TK_ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm pretty sure it would be both profitable and correct to do it for any encoding; however, the optimization must be done manually as it is done above in add_utf8_anychar. So, let's start with UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (parser); parser->lookahead_token = parser->lexer (parser->lex_context); } else if ((tok >= 0 && tok < FSATOKEN_NOTCHAR) || tok >= FSATOKEN_TK_CSET || tok == FSATOKEN_TK_BACKREF || tok == FSATOKEN_TK_BEGLINE || tok == FSATOKEN_TK_ENDLINE || tok == FSATOKEN_TK_BEGWORD || tok == FSATOKEN_TK_ANYCHAR || tok == FSATOKEN_TK_MBCSET || tok == FSATOKEN_TK_ENDWORD || tok == FSATOKEN_TK_LIMWORD || tok == FSATOKEN_TK_NOTLIMWORD) { addtok (parser, tok); parser->lookahead_token = parser->lexer (parser->lex_context); } else if (tok == FSATOKEN_TK_LPAREN) { parser->lookahead_token = parser->lexer (parser->lex_context); regexp (parser); tok = parser->lookahead_token; if (tok != FSATOKEN_TK_RPAREN) parser->abandon_with_error (_("unbalanced (")); parser->lookahead_token = parser->lexer (parser->lex_context); } else addtok (parser, FSATOKEN_TK_EMPTY); } /* Return the number of tokens in the given subexpression. */ static size_t _GL_ATTRIBUTE_PURE nsubtoks (fsaparse_ctxt_t *parser, size_t tindex) { size_t ntoks1; switch (parser->tokens[tindex - 1]) { default: return 1; case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: return 1 + nsubtoks (parser, tindex - 1); case FSATOKEN_TK_CAT: case FSATOKEN_TK_OR: ntoks1 = nsubtoks (parser, tindex - 1); return 1 + ntoks1 + nsubtoks (parser, tindex - 1 - ntoks1); } } /* Copy the given subexpression to the top of the tree. */ static void copytoks (fsaparse_ctxt_t *parser, size_t tindex, size_t ntokens) { size_t i; if (parser->multibyte_locale) for (i = 0; i < ntokens; ++i) addtok_mb (parser, parser->tokens[tindex + i], parser->multibyte_prop[tindex + i]); else for (i = 0; i < ntokens; ++i) addtok_mb (parser, parser->tokens[tindex + i], 3); } /* Rewriting fsaparse:closure () from scratch; original is clever but a little tricky to follow, so I'm trying to break up a while + compound-if loop into a simpler construct (more like a finite-state machine). Also, edits such as replacing "dfa->" with "parser->" are done here, adding "parser" as a parameter in lots of places, as well as the long-winded FSATOKEN_TK_" prefix. I'm not sure if this version is an improvement over the original; the need to use "parser->lookahead_token" instead of "tok" influenced my decision to try this... but the jury is still out. */ static void closure (fsaparse_ctxt_t *parser) { restart_closure: atom (parser); for (;;) { switch (parser->lookahead_token) { case FSATOKEN_TK_QMARK: case FSATOKEN_TK_STAR: case FSATOKEN_TK_PLUS: addtok (parser, parser->lookahead_token); parser->lookahead_token = parser->lexer (parser->lex_context); continue; case FSATOKEN_TK_REPMN: /* REPMN needs extra work; move outside the switch statement. */ break; default: /* Merely let the intial atom call stand as our return result. */ return; } /* Deal with REPMN{min, max} cases in a separate block. */ { int i; size_t prev_sub_index, ntokens; int minrep, maxrep; /* Get the {min, max} pair decoded by the lexer. */ minrep = parser->lex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_REPMN_MIN, NULL); maxrep = parser->lex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_REPMN_MAX, NULL); /* Find out how many tokens are in the peceding token list that are covered by this REPMN directive. This involves carefully working backwards through the linear, postfix token ordering. */ ntokens = nsubtoks (parser, parser->tindex); /* If min and max are both zero, merely remove preceding subexpression, get a new token, and restart the atom/closure processing from the top of the function. Not sure if people will like this goto statement, but we'll give it a whirl. */ if (minrep == 0 && maxrep == 0) { parser->tindex -= ntokens; parser->lookahead_token = parser->lexer (parser->lex_context); goto restart_closure; } /* Non-zero min or max, defined as follows: {n} The preceding item is matched exactly n times. {n,} The preceding item is matched n or more times. {,m} The preceding item is matched at most m times (GNU ext.) {n,m} The preceding item is matched at least n, but not more than m times. For {n,} and {,m} cases, the omitted parameter is reported here as a negative value. */ prev_sub_index = parser->tindex - ntokens; if (maxrep < 0) addtok (parser, FSATOKEN_TK_PLUS); if (minrep == 0) addtok (parser, FSATOKEN_TK_QMARK); for (i = 1; i < minrep; ++i) { copytoks (parser, prev_sub_index, ntokens); addtok (parser, FSATOKEN_TK_CAT); } for (; i < maxrep; ++i) { copytoks (parser, prev_sub_index, ntokens); addtok (parser, FSATOKEN_TK_QMARK); addtok (parser, FSATOKEN_TK_CAT); } /* Prime the parser with the next token after REPMN and loop. */ parser->lookahead_token = parser->lexer (parser->lex_context); } } } static void branch (fsaparse_ctxt_t *parser) { fsatoken_token_t tok; closure (parser); tok = parser->lookahead_token; while (tok != FSATOKEN_TK_RPAREN && tok != FSATOKEN_TK_OR && tok >= 0) { closure (parser); tok = parser->lookahead_token; addtok (parser, FSATOKEN_TK_CAT); } } static void regexp (fsaparse_ctxt_t *parser) { branch (parser); while (parser->lookahead_token == FSATOKEN_TK_OR) { parser->lookahead_token = parser->lexer (parser->lex_context); branch (parser); addtok (parser, FSATOKEN_TK_OR); } } /* Main entry point for the parser. Parser is a pointer to a parser context struct created by fsaparse_new. Before calling this function, the parser instance must be supplied with a lexer (fsaparse_lexer), and also with callback functions to receive warning and error reports (fsaparse_esception_fns). */ void fsaparse_parse (fsaparse_ctxt_t *parser) { /* Obtain an initial token for lookahead, and keep tracking tree depth. */ parser->lookahead_token = parser->lexer (parser->lex_context); parser->current_depth = parser->depth; /* Run regexp to manage the next level of parsing. */ regexp (parser); if (parser->lookahead_token != FSATOKEN_TK_END) parser->abandon_with_error (_("unbalanced )")); /* If multiple expressions are parsed, second and subsequent patters are presented as alternatives to preceding patterns. */ addtok (parser, FSATOKEN_TK_END - parser->nregexps); addtok (parser, FSATOKEN_TK_CAT); if (parser->nregexps) addtok (parser, FSATOKEN_TK_OR); ++parser->nregexps; } /* Receive functions to deal with exceptions detected by the parser: Warnings and errors. Internally, we add the _Noreturn attribute to the error callback, to help the compiler with code flow analysis. */ extern void fsaparse_exception_fns (fsaparse_ctxt_t *parser, fsaparse_warn_callback_fn *warningfn, fsaparse_error_callback_fn *errorfn) { /* Exception handling is done by explicit callbacks. */ parser->warn_client = warningfn; parser->abandon_with_error = errorfn; } /* Add "not provided!" stub function that gets called if the client fails to provide proper resources. This is a hack, merely to get the module started; better treatment needs to be added later. */ static void no_function_provided (void *unused) { assert (!"fsaparse: Plug-in function required, but not provided."); } /* Receiver a lexer function, plus lexer instance context pointer, for use by the parser. Although not needed initially, this plug-in architecture may be useful in the future, and it breaks up some of the intricate connections that made the original dfa.c code so daunting. */ void fsaparse_lexer (fsaparse_ctxt_t *parser, void *lexer_context, proto_lexparse_lex_fn_t *lex_fn, proto_lexparse_exchange_fn_t *lex_exchange_fn) { bool is_multibyte; /* Record supplied lexer function and context for use later. */ parser->lex_context = lexer_context; parser->lexer = lex_fn; parser->lex_exchange = lex_exchange_fn; /* Query lexer to get multibyte nature of this locale. */ is_multibyte = lex_exchange_fn (lexer_context, PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_ENV, NULL); parser->multibyte_locale = is_multibyte; parser->unibyte_locale = ! is_multibyte; } /* Generate a new instance of an FSA parser. */ fsaparse_ctxt_t * fsaparse_new (void) { fsaparse_ctxt_t *new_context; /* Acquire zeroed memory for new parser context. */ new_context = XZALLOC (fsaparse_ctxt_t); /* ?? Point warning, error and lexer functions to a "you need to tell me these first!" function? */ new_context->warn_client = (fsaparse_warn_callback_fn *) no_function_provided; new_context->abandon_with_error = (fsaparse_error_callback_fn *) no_function_provided; new_context->lexer = (fsaparse_lexer_fn_t *) no_function_provided; /* Default to unibyte locale... but we should synchronise with lexer. */ new_context->multibyte_locale = false; new_context->unibyte_locale = true; return new_context; } /* After parsing, report a list of tokens describing the pattern. Complex structures such as alternation, backreferences, and locale-induced complexity such as variable-length utf8 sequences are described here by appending operators that apply to the preceding item(s) (postfix notation). */ void fsaparse_get_token_list (fsaparse_ctxt_t *parser, size_t *nr_tokens, fsatoken_token_t **token_list) { *nr_tokens = parser->tindex; *token_list = parser->tokens; } /* vim:set shiftwidth=2: */