/* fsaparse -- Build a structure naming relationships (sequences, alternatives,
               backreferences, options and precedence) of tokens

   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc.,
   51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */

/* Written June, 1988 by Mike Haertel
   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */

/* 2014: Repackaged by "untangle" script, written by behoffski.  */

/* This function receives a stream of tokens from fsalex, and processes
   them to impose precedence rules and to describe complex pattern elements
   that are beyond the capability of the simple lexer.  In addition to the
   cases explicit in the syntax (e.g."(ab|c)", variable-length multibyte
   encodings (UTF-8; codesets including modifiers and/or shift items) also
   require these enhanced facilities.  */

/* Always import environment-specific configuration items first.  */
#include <config.h>

#include <assert.h>
#include "charclass.h"
#include "fsaparse.h"
#include "fsalex.h"
#include "fsatoken.h"
#include "proto-lexparse.h"
#include <stdbool.h>
#include "xalloc.h"

/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
#include "gettext.h"
#define _(str) gettext (str)

#include <wchar.h>
#include <wctype.h>

/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED.  */
#define REALLOC_IF_NECESSARY(p, n_alloc, n_required)		\
  do								\
    {								\
      if ((n_alloc) <= (n_required))				\
        {							\
          size_t new_n_alloc = (n_required) + !(p);		\
          (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p)));	\
          (n_alloc) = new_n_alloc;				\
        }							\
    }								\
  while (0)

#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif

/* ?? Sigh... wanted to keep multibyte code in fsaPARSE(!) to a minimum, but
   a LOT of code breaks if struct mb_char_classes isn't defined.  */

/* A bracket operator.
   e.g., [a-c], [[:alpha:]], etc.  */
struct mb_char_classes
{
  ptrdiff_t cset;
  bool invert;
  wchar_t *chars;               /* Normal characters.  */
  size_t nchars;
  wctype_t *ch_classes;         /* Character classes.  */
  size_t nch_classes;
  wchar_t *range_sts;           /* Range characters (start of the range).  */
  wchar_t *range_ends;          /* Range characters (end of the range).  */
  size_t nranges;
  char **equivs;                /* Equivalence classes.  */
  size_t nequivs;
  char **coll_elems;
  size_t ncoll_elems;           /* Collating elements.  */
};

/* fsaparse_ctxt: Gather all the context to do with the parser into a single
   struct.  We do this mainly because it make it easier to contemplate
   having multiple instances of this module running in parallel, but also
   because it makes translating from "dfa->" easier.  This definition
   fleshes out the opaque type given in the module header.  */
struct fsaparse_ctxt_struct
{
  /* Warning and abot functions provided by client.  */
  fsalex_warn_callback_fn *warn_client;
  fsalex_error_callback_fn *abandon_with_error;

  /* Plug-in functions and context to deal with lexer at arm's length.  */
  proto_lexparse_lex_fn_t *lexer;
  proto_lexparse_exchange_fn_t *lex_exchange;
  void *lex_context;

  /* Information about locale (needs to sync with lexer...?) */
  bool multibyte_locale;
  bool unibyte_locale;

  fsatoken_token_t lookahead_token;

  size_t current_depth;      /* Current depth of a hypothetical stack
                             holding deferred productions.  This is
                             used to determine the depth that will be
                             required of the real stack later on in
                             dfaanalyze.  */
  /* Fields filled by the parser.  */
  fsatoken_token_t *tokens;                /* Postfix parse array.  */
  size_t tindex;                /* Index for adding new tokens.  */
  size_t talloc;                /* Number of tokens currently allocated.  */
  size_t depth;                 /* Depth required of an evaluation stack
                                   used for depth-first traversal of the
                                   parse tree.  */
  size_t nleaves;               /* Number of leaves on the parse tree.  */
  size_t nregexps;              /* Count of parallel regexps being built
                                   with dfaparse.  */
  unsigned int mb_cur_max;      /* Cached value of MB_CUR_MAX.  */
  fsatoken_token_t utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
  /* The following are used only if MB_CUR_MAX > 1.  */

  /* The value of multibyte_prop[i] is defined by following rule.
     if tokens[i] < NOTCHAR
     bit 0 : tokens[i] is the first byte of a character, including
     single-byte characters.
     bit 1 : tokens[i] is the last byte of a character, including
     single-byte characters.

     if tokens[i] = MBCSET
     ("the index of mbcsets corresponding to this operator" << 2) + 3

     e.g.
     tokens
     = 'single_byte_a', 'multi_byte_A', single_byte_b'
     = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
     multibyte_prop
     = 3     , 1               ,  0              ,  2              , 3
   */
  size_t nmultibyte_prop;
  int *multibyte_prop;
  /* Array of the bracket expression in the DFA.  */
  struct mb_char_classes *mbcsets;
  size_t nmbcsets;
  size_t mbcsets_alloc;
};

/* UTF-8 encoding allows some optimizations that we can't otherwise
   assume in a multibyte encoding.  */
static int
using_utf8 (void)
{
  static int utf8 = -1;
  if (utf8 < 0)
    {
      wchar_t wc;
      mbstate_t mbs = { 0 };
      utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
    }
  return utf8;
}

/* Recursive descent parser for regular expressions.  */

static void
addtok_mb (fsaparse_ctxt_t *parser, fsatoken_token_t t, int mbprop)
{
  if (parser->multibyte_locale)
    {
      REALLOC_IF_NECESSARY (parser->multibyte_prop, parser->nmultibyte_prop,
                            parser->tindex + 1);
      parser->multibyte_prop[parser->tindex] = mbprop;
    }

  REALLOC_IF_NECESSARY (parser->tokens, parser->talloc, parser->tindex + 1);
  parser->tokens[parser->tindex++] = t;

  switch (t)
    {
    case FSATOKEN_TK_QMARK:
    case FSATOKEN_TK_STAR:
    case FSATOKEN_TK_PLUS:
      break;

    case FSATOKEN_TK_CAT:
    case FSATOKEN_TK_OR:
      --parser->current_depth;
      break;

    default:
      ++parser->nleaves;
    case FSATOKEN_TK_EMPTY:
      ++parser->current_depth;
      break;
    }
  if (parser->depth < parser->current_depth)
    parser->depth = parser->current_depth;
}

static void addtok_wc (fsaparse_ctxt_t *parser, wint_t wc);

/* Add the given token to the parse tree, maintaining the depth count and
   updating the maximum depth if necessary.  */
static void
addtok (fsaparse_ctxt_t *parser, fsatoken_token_t t)
{
  if (parser->multibyte_locale && t == FSATOKEN_TK_MBCSET)
    {
      bool need_or = false;
      struct mb_char_classes *work_mbc = &parser->mbcsets[parser->nmbcsets - 1];

      /* Extract wide characters into alternations for better performance.
         This does not require UTF-8.  */
      if (!work_mbc->invert)
        {
          size_t i;
          for (i = 0; i < work_mbc->nchars; i++)
            {
              addtok_wc (parser, work_mbc->chars[i]);
              if (need_or)
                addtok (parser, FSATOKEN_TK_OR);
              need_or = true;
            }
          work_mbc->nchars = 0;
        }

      /* If the FSATOKEN_TK_MBCSET is non-inverted and doesn't include neither
         character classes including multibyte characters, range
         expressions, equivalence classes nor collating elements,
         it can be replaced to a simple FSATOKEN_TK_CSET. */
      if (work_mbc->invert
          || work_mbc->nch_classes != 0
          || work_mbc->nranges != 0
          || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
        {
          addtok_mb (parser, FSATOKEN_TK_MBCSET, ((parser->nmbcsets - 1) << 2) + 3);
          if (need_or)
            addtok (parser, FSATOKEN_TK_OR);
        }
      else
        {
          /* Characters have been handled above, so it is possible
             that the mbcset is empty now.  Do nothing in that case.  */
          if (work_mbc->cset != -1)
            {
              addtok (parser, FSATOKEN_TK_CSET + work_mbc->cset);
              if (need_or)
                addtok (parser, FSATOKEN_TK_OR);
            }
        }
    }
  else
    {
      addtok_mb (parser, t, 3);
    }
}
/* We treat a multibyte character as a single atom, so that DFA
   can treat a multibyte character as a single expression.

   e.g., we construct the following tree from "<mb1><mb2>".
   <mb1(1st-byte)><mb1(2nd-byte)><FSATOKEN_TK_CAT><mb1(3rd-byte)><FSATOKEN_TK_CAT>
   <mb2(1st-byte)><mb2(2nd-byte)><FSATOKEN_TK_CAT><mb2(3rd-byte)><FSATOKEN_TK_CAT><FSATOKEN_TK_CAT> */
static void
addtok_wc (fsaparse_ctxt_t *parser, wint_t wc)
{
  unsigned char buf[MB_LEN_MAX];
  mbstate_t s = { 0 };
  int i;
  int cur_mb_len;
  size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);

  if (stored_bytes != (size_t) -1)
    cur_mb_len = stored_bytes;
  else
    {
      /* This is merely stop-gap.  buf[0] is undefined, yet skipping
         the addtok_mb call altogether can corrupt the heap.  */
      cur_mb_len = 1;
      buf[0] = 0;
    }

  addtok_mb (parser, buf[0], cur_mb_len == 1 ? 3 : 1);
  for (i = 1; i < cur_mb_len; i++)
    {
      addtok_mb (parser, buf[i], i == cur_mb_len - 1 ? 2 : 0);
      addtok (parser, FSATOKEN_TK_CAT);
    }
}
static void
add_utf8_anychar (fsaparse_ctxt_t *parser)
{
  unsigned int i;

  /* Have we set up the classes for the 1-byte to 4-byte sequence types?  */
  if (parser->utf8_anychar_classes[0] == 0)
    {
      /* No, first time we've been called, so set them up now.  */
      charclass_t *ccl;
      const charclass_t *dotclass;

      /* Index 0: 80-bf -- Non-leading bytes.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0x80, 0xbf, ccl);
      parser->utf8_anychar_classes[0] = charclass_finalise (ccl);

      /* Index 1: 00-7f -- 1-byte leading seq, minus dotclass exceptions.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0x00, 0x7f, ccl);
      fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_DOTCLASS,
                       &dotclass);
      charclass_intersectset (dotclass, ccl);
      parser->utf8_anychar_classes[1] = charclass_finalise (ccl);

      /* Index 2: c2-df -- 2-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xc2, 0xdf, ccl);
      parser->utf8_anychar_classes[2] = charclass_finalise (ccl);

      /* Index 2: e0-ef -- 3-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xe0, 0xef, ccl);
      parser->utf8_anychar_classes[3] = charclass_finalise (ccl);

      /* Index 2: f0-f7 -- 4-byte sequence.  */
      ccl = charclass_alloc ();
      charclass_setbit_range (0xf0, 0xf7, ccl);
      parser->utf8_anychar_classes[4] = charclass_finalise (ccl);
    }

  /* A valid UTF-8 character is

     ([0x00-0x7f]
     |[0xc2-0xdf][0x80-0xbf]
     |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
     |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])

     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
  /* Write out leaf tokens for each of the four possible starting bytes.  */
  for (i = 1; i < 5; i++)
    addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[i]);
  /* Add follow-on classes, plus tokens to build a postfix tree covering all
     four alternatives of valid UTF-8 sequences.  */
  for (i = 1; i <= 3; i++)
    {
      addtok (parser, FSATOKEN_TK_CSET + parser->utf8_anychar_classes[0]);
      addtok (parser, FSATOKEN_TK_CAT);
      addtok (parser, FSATOKEN_TK_OR);
    }
}

/* The grammar understood by the parser is as follows.

   regexp:
     regexp FSATOKEN_TK_OR branch
     branch

   branch:
     branch closure
     closure

   closure:
     closure FSATOKEN_TK_QMARK
     closure FSATOKEN_TK_STAR
     closure FSATOKEN_TK_PLUS
     closure FSATOKEN_TK_REPMN
     atom

   atom:
     <normal character>
     <multibyte character>
     FSATOKEN_TK_ANYCHAR
     FSATOKEN_TK_MBCSET
     FSATOKEN_TK_CSET
     FSATOKEN_TK_BACKREF
     FSATOKEN_TK_BEGLINE
     FSATOKEN_TK_ENDLINE
     FSATOKEN_TK_BEGWORD
     FSATOKEN_TK_ENDWORD
     FSATOKEN_TK_LIMWORD
     FSATOKEN_TK_NOTLIMWORD
     FSATOKEN_TK_LPAREN regexp FSATOKEN_TK_RPAREN
     <empty>

   The parser builds a parse tree in postfix form in an array of tokens.  */

/* Provide a forward declaration for regexp, as it is at the top of the
   parse tree, but is referenced by atom, at the bottom of the tree.  */
static void regexp (fsaparse_ctxt_t *parser);

static void
atom (fsaparse_ctxt_t *parser)
{
  fsatoken_token_t tok = parser->lookahead_token;

  if (tok == FSATOKEN_TK_WCHAR)
    {
      wchar_t wctok;
      int i, n;
      wchar_t folded[FSALEX_CASE_FOLDED_BUFSIZE];

      fsalex_exchange (parser->lex_context, PROTO_LEXPARSE_OP_GET_WIDE_CHAR,
                       &wctok);
      addtok_wc (parser, wctok);

      n = fsalex_case_folded_counterparts (parser->lex_context,
                                           wctok, folded);
      for (i = 0; i < n; i++)
        {
          addtok_wc (parser, folded[i]);
          addtok (parser, FSATOKEN_TK_OR);
        }

      parser->lookahead_token = parser->lexer (parser->lex_context);
    }
  else if (tok == FSATOKEN_TK_ANYCHAR && using_utf8 ())
    {
      /* For UTF-8 expand the period to a series of CSETs that define a valid
         UTF-8 character.  This avoids using the slow multibyte path.  I'm
         pretty sure it would be both profitable and correct to do it for
         any encoding; however, the optimization must be done manually as
         it is done above in add_utf8_anychar.  So, let's start with
         UTF-8: it is the most used, and the structure of the encoding
         makes the correctness more obvious.  */
     add_utf8_anychar (parser);
      parser->lookahead_token = parser->lexer (parser->lex_context);
    }
  else if ((tok >= 0 && tok < FSATOKEN_NOTCHAR) || tok >= FSATOKEN_TK_CSET || tok == FSATOKEN_TK_BACKREF
           || tok == FSATOKEN_TK_BEGLINE || tok == FSATOKEN_TK_ENDLINE || tok == FSATOKEN_TK_BEGWORD
           || tok == FSATOKEN_TK_ANYCHAR || tok == FSATOKEN_TK_MBCSET
           || tok == FSATOKEN_TK_ENDWORD || tok == FSATOKEN_TK_LIMWORD || tok == FSATOKEN_TK_NOTLIMWORD)
    {
      addtok (parser, tok);
      parser->lookahead_token = parser->lexer (parser->lex_context);
    }
  else if (tok == FSATOKEN_TK_LPAREN)
    {
      parser->lookahead_token = parser->lexer (parser->lex_context);
      regexp (parser);
      tok = parser->lookahead_token;
      if (tok != FSATOKEN_TK_RPAREN)
        parser->abandon_with_error (_("unbalanced ("));
      parser->lookahead_token = parser->lexer (parser->lex_context);
    }
  else
    addtok (parser, FSATOKEN_TK_EMPTY);
}

/* Return the number of tokens in the given subexpression.  */
static size_t _GL_ATTRIBUTE_PURE
nsubtoks (fsaparse_ctxt_t *parser, size_t tindex)
{
  size_t ntoks1;

  switch (parser->tokens[tindex - 1])
    {
    default:
      return 1;
    case FSATOKEN_TK_QMARK:
    case FSATOKEN_TK_STAR:
    case FSATOKEN_TK_PLUS:
      return 1 + nsubtoks (parser, tindex - 1);
    case FSATOKEN_TK_CAT:
    case FSATOKEN_TK_OR:
      ntoks1 = nsubtoks (parser, tindex - 1);
      return 1 + ntoks1 + nsubtoks (parser, tindex - 1 - ntoks1);
    }
}

/* Copy the given subexpression to the top of the tree.  */
static void
copytoks (fsaparse_ctxt_t *parser, size_t tindex, size_t ntokens)
{
  size_t i;

  if (parser->multibyte_locale)
    for (i = 0; i < ntokens; ++i)
      addtok_mb (parser, parser->tokens[tindex + i], parser->multibyte_prop[tindex + i]);
  else
    for (i = 0; i < ntokens; ++i)
      addtok_mb (parser, parser->tokens[tindex + i], 3);
}

/* Rewriting fsaparse:closure () from scratch; original is clever but a
   little tricky to follow, so I'm trying to break up a while + compound-if
   loop into a simpler construct (more like a finite-state machine).  Also,
   edits such as replacing "dfa->" with "parser->" are done here, adding
   "parser" as a parameter in lots of places, as well as the long-winded
   FSATOKEN_TK_" prefix.

   I'm not sure if this version is an improvement over the original; the
   need to use "parser->lookahead_token" instead of "tok" influenced my
   decision to try this... but the jury is still out.  */
static void
closure (fsaparse_ctxt_t *parser)
{
restart_closure:
  atom (parser);
  for (;;)
    {
      switch (parser->lookahead_token)
        {
          case FSATOKEN_TK_QMARK:
          case FSATOKEN_TK_STAR:
          case FSATOKEN_TK_PLUS:
            addtok (parser, parser->lookahead_token);
            parser->lookahead_token = parser->lexer (parser->lex_context);
            continue;

          case FSATOKEN_TK_REPMN:
            /* REPMN needs extra work; move outside the switch statement.  */
            break;

          default:
            /* Merely let the intial atom call stand as our return result.  */
            return;
        }

      /* Deal with REPMN{min, max} cases in a separate block.  */
      {
        int i;
        size_t prev_sub_index, ntokens;
        int minrep, maxrep;

        /* Get the {min, max} pair decoded by the lexer.  */
        minrep = parser->lex_exchange (parser->lex_context,
                                       PROTO_LEXPARSE_OP_GET_REPMN_MIN,
                                       NULL);
        maxrep = parser->lex_exchange (parser->lex_context,
                                       PROTO_LEXPARSE_OP_GET_REPMN_MAX,
                                       NULL);

        /* Find out how many tokens are in the peceding token list that are
           covered by this REPMN directive.  This involves carefully working
           backwards through the linear, postfix token ordering.  */
        ntokens = nsubtoks (parser, parser->tindex);

        /* If min and max are both zero, merely remove preceding
           subexpression, get a new token, and restart the atom/closure
           processing from the top of the function.  Not sure if people will
           like this goto statement, but we'll give it a whirl.   */
        if (minrep == 0 && maxrep == 0)
          {
            parser->tindex -= ntokens;
            parser->lookahead_token = parser->lexer (parser->lex_context);
            goto restart_closure;
          }

        /* Non-zero min or max, defined as follows:
             {n}   The preceding item is matched exactly n times.
             {n,}  The preceding item is matched n or more times.
             {,m}  The preceding item is matched at most m times (GNU ext.)
             {n,m} The preceding item is matched at least n, but not more
                   than m times.
           For {n,} and {,m} cases, the omitted parameter is reported here
           as a negative value.  */
        prev_sub_index = parser->tindex - ntokens;
        if (maxrep < 0)
          addtok (parser, FSATOKEN_TK_PLUS);
        if (minrep == 0)
          addtok (parser, FSATOKEN_TK_QMARK);
        for (i = 1; i < minrep; ++i)
          {
            copytoks (parser, prev_sub_index, ntokens);
            addtok (parser, FSATOKEN_TK_CAT);
          }
        for (; i < maxrep; ++i)
          {
            copytoks (parser, prev_sub_index, ntokens);
            addtok (parser, FSATOKEN_TK_QMARK);
            addtok (parser, FSATOKEN_TK_CAT);
          }
        /* Prime the parser with the next token after REPMN and loop.  */
        parser->lookahead_token = parser->lexer (parser->lex_context);
      }
    }
}

static void
branch (fsaparse_ctxt_t *parser)
{
  fsatoken_token_t tok;

  closure (parser);
  tok = parser->lookahead_token;
  while (tok != FSATOKEN_TK_RPAREN && tok != FSATOKEN_TK_OR && tok >= 0)
    {
      closure (parser);
      tok = parser->lookahead_token;
      addtok (parser, FSATOKEN_TK_CAT);
    }
}

static void
regexp (fsaparse_ctxt_t *parser)
{
  branch (parser);
  while (parser->lookahead_token == FSATOKEN_TK_OR)
    {
      parser->lookahead_token = parser->lexer (parser->lex_context);
      branch (parser);
      addtok (parser, FSATOKEN_TK_OR);
    }
}

/* Main entry point for the parser.  Parser is a pointer to a parser
   context struct created by fsaparse_new.  Before calling this function,
   the parser instance must be supplied with a lexer (fsaparse_lexer), and
   also with callback functions to receive warning and error reports
   (fsaparse_esception_fns).  */
void
fsaparse_parse (fsaparse_ctxt_t *parser)
{
  /* Obtain an initial token for lookahead, and keep tracking tree depth.  */
  parser->lookahead_token = parser->lexer (parser->lex_context);
  parser->current_depth = parser->depth;

  /* Run regexp to manage the next level of parsing.  */
  regexp (parser);
  if (parser->lookahead_token != FSATOKEN_TK_END)
    parser->abandon_with_error (_("unbalanced )"));

  /* If multiple expressions are parsed, second and subsequent patters are
     presented as alternatives to preceding patterns.  */
  addtok (parser, FSATOKEN_TK_END - parser->nregexps);
  addtok (parser, FSATOKEN_TK_CAT);
  if (parser->nregexps)
    addtok (parser, FSATOKEN_TK_OR);

  ++parser->nregexps;
}

/* Receive functions to deal with exceptions detected by the parser:
   Warnings and errors.  Internally, we add the _Noreturn attribute
   to the error callback, to help the compiler with code flow
   analysis.  */
extern void
fsaparse_exception_fns (fsaparse_ctxt_t *parser,
                      fsaparse_warn_callback_fn *warningfn,
                      fsaparse_error_callback_fn *errorfn)
{
  /* Exception handling is done by explicit callbacks.  */
  parser->warn_client = warningfn;
  parser->abandon_with_error = errorfn;
}

/* Add "not provided!" stub function that gets called if the client
   fails to provide proper resources.  This is a hack, merely to get the
   module started; better treatment needs to be added later.  */
static void
no_function_provided (void *unused)
{
 assert (!"fsaparse: Plug-in function required, but not provided.");
}

/* Receiver a lexer function, plus lexer instance context pointer, for use by
   the parser.  Although not needed initially, this plug-in architecture may
   be useful in the future, and it breaks up some of the intricate
   connections that made the original dfa.c code so daunting.  */
void
fsaparse_lexer (fsaparse_ctxt_t *parser,
                void *lexer_context,
                proto_lexparse_lex_fn_t *lex_fn,
                proto_lexparse_exchange_fn_t *lex_exchange_fn)
{
  bool is_multibyte;

  /* Record supplied lexer function and context for use later.  */
  parser->lex_context  = lexer_context;
  parser->lexer        = lex_fn;
  parser->lex_exchange = lex_exchange_fn;

  /* Query lexer to get multibyte nature of this locale.  */
  is_multibyte = lex_exchange_fn (lexer_context,
                                  PROTO_LEXPARSE_OP_GET_IS_MULTIBYTE_ENV,
                                  NULL);
  parser->multibyte_locale = is_multibyte;
  parser->unibyte_locale = ! is_multibyte;
}
/* Generate a new instance of an FSA parser.  */
fsaparse_ctxt_t *
fsaparse_new (void)
{
  fsaparse_ctxt_t *new_context;

  /* Acquire zeroed memory for new parser context.  */
  new_context = XZALLOC (fsaparse_ctxt_t);

  /* ?? Point warning, error and lexer functions to a "you need to tell me
     these first!" function? */
  new_context->warn_client        = (fsaparse_warn_callback_fn *)
                                    no_function_provided;
  new_context->abandon_with_error = (fsaparse_error_callback_fn *)
                                    no_function_provided;
  new_context->lexer              = (fsaparse_lexer_fn_t  *)
                                    no_function_provided;

  /* Default to unibyte locale... but we should synchronise with lexer. */
  new_context->multibyte_locale = false;
  new_context->unibyte_locale = true;

  return new_context;
}
/* After parsing, report a list of tokens describing the pattern.  Complex
   structures such as alternation, backreferences, and locale-induced
   complexity such as variable-length utf8 sequences are described here by
   appending operators that apply to the preceding item(s) (postfix
   notation).  */
void
fsaparse_get_token_list (fsaparse_ctxt_t *parser,
                         size_t *nr_tokens,
                         fsatoken_token_t **token_list)
{
  *nr_tokens  = parser->tindex;
  *token_list = parser->tokens;
}
/* vim:set shiftwidth=2: */