[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot configure.in dic/Makefile.am dic/dic.h di...
From: |
eliot-dev |
Subject: |
[Eliot-dev] eliot configure.in dic/Makefile.am dic/dic.h di... |
Date: |
Mon, 07 Jul 2008 17:30:03 +0000 |
CVSROOT: /cvsroot/eliot
Module name: eliot
Changes by: Olivier Teulière <ipkiss> 08/07/07 17:30:03
Modified files:
. : configure.in
dic : Makefile.am dic.h dic_search.cpp regexp.cpp
regexp.h regexpmain.cpp
test : regexp.input regexp.ref
utils : eliottxt.cpp
Added files:
dic : grammar.cpp grammar.h
Removed files:
dic : erl.lpp ery.ypp
Log message:
- Added several tests for the regular expressions engine
- New regexp parser using Boost.Spirit. Lex and yacc are now gone.
The main advantage of this new parser, apart from being purely C++,
is that it can handle wide characters.
Currently, the new parser does the same as the previous one, but the
code is not yet ready to use regular expressions with non-ASCII
dictionaries.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/configure.in?cvsroot=eliot&r1=1.24&r2=1.25
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/Makefile.am?cvsroot=eliot&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic.h?cvsroot=eliot&r1=1.15&r2=1.16
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic_search.cpp?cvsroot=eliot&r1=1.4&r2=1.5
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.cpp?cvsroot=eliot&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.h?cvsroot=eliot&r1=1.13&r2=1.14
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexpmain.cpp?cvsroot=eliot&r1=1.3&r2=1.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/grammar.cpp?cvsroot=eliot&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/grammar.h?cvsroot=eliot&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/erl.lpp?cvsroot=eliot&r1=1.2&r2=0
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/ery.ypp?cvsroot=eliot&r1=1.2&r2=0
http://cvs.savannah.gnu.org/viewcvs/eliot/test/regexp.input?cvsroot=eliot&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/eliot/test/regexp.ref?cvsroot=eliot&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/eliot/utils/eliottxt.cpp?cvsroot=eliot&r1=1.21&r2=1.22
Patches:
Index: configure.in
===================================================================
RCS file: /cvsroot/eliot/eliot/configure.in,v
retrieving revision 1.24
retrieving revision 1.25
diff -u -b -r1.24 -r1.25
--- configure.in 20 Jan 2008 18:40:12 -0000 1.24
+++ configure.in 7 Jul 2008 17:29:59 -0000 1.25
@@ -23,17 +23,6 @@
AC_PROG_RANLIB
PKG_PROG_PKG_CONFIG
-AC_PROG_YACC
-if test "$YACC" = yacc ; then
- AC_MSG_ERROR([Could not find the 'bison' program on your system])
-fi
-
-dnl Better than AC_PROG_LEX
-AM_PROG_LEX
-if test "$LEX" != "flex" ; then
- AC_MSG_ERROR([Could not find the 'flex' program on your system])
-fi
-
dnl --------------------------------------------------------------
dnl Checks for compilation flags
dnl --------------------------------------------------------------
Index: dic/Makefile.am
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Makefile.am,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- dic/Makefile.am 13 Jan 2008 20:26:04 -0000 1.18
+++ dic/Makefile.am 7 Jul 2008 17:29:59 -0000 1.19
@@ -26,8 +26,6 @@
libdic_a_YFLAGS=-d
libdic_a_LFLAGS=
libdic_a_SOURCES = \
- erl.lpp \
- ery.ypp \
dic_exception.cpp dic_exception.h \
header.cpp header.h \
dic_internals.h \
@@ -36,44 +34,8 @@
dic_search.cpp \
encoding.cpp encoding.h \
automaton.cpp automaton.h \
- regexp.cpp regexp.h
-
-BUILT_SOURCES= \
- libdic_a-erl.cpp \
- libdic_a-erl.h \
- libdic_a-ery.cpp \
- libdic_a-ery.h
-
-
-nodist_libdic_a_SOURCES= \
- libdic_a-erl.cpp \
- libdic_a-erl.h \
- libdic_a-ery.cpp \
- libdic_a-ery.h
-
-# This hook triggers on 'make dist' (and 'make distcheck')
-# XXX: In fact, the recommended behaviour is:
-# - list only libdic_a-ery.h in BUILT_SOURCES,
-# - do not die with an error in configure.in if flex or bison is not found
-# - do not have any dist-hook trigger
-# The result is that the generated files are kept in the tarball generated
with make dist,
-# with still an error message for developers when the ypp or lpp file has been
modified
-# and bison or flex is not found.
-# The problem is that, even though Automake is aware of the header generated
by bison,
-# it seems to have problems with the one generated by flex...
-dist-hook:
- -for file in $(BUILT_SOURCES) ; do rm -f $(distdir)/$$file ; done
-
-CLEANFILES= \
- libdic_a-erl.cpp \
- libdic_a-erl.h \
- libdic_a-ery.cpp \
- libdic_a-ery.h
-
-
-## automake workaround to generate .h file
-libdic_a-erl.h: erl.lpp
- ${LEX} ${srcdir}/erl.lpp
+ regexp.cpp regexp.h \
+ grammar.cpp grammar.h
#####################################
if BUILD_DICTOOLS
Index: dic/dic.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic.h,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -b -r1.15 -r1.16
--- dic/dic.h 2 Mar 2008 18:45:10 -0000 1.15
+++ dic/dic.h 7 Jul 2008 17:30:00 -0000 1.16
@@ -100,7 +100,7 @@
/**
* Returns the character code associated with an element,
- * codes may range from 0 to 31. 0 is the null character.
+ * codes may range from 0 to 63. 0 is the null character.
* @returns code for the encoded character
*/
const dic_code_t getCode(const dic_elt_t &elt) const;
@@ -114,14 +114,14 @@
/**
* Returns a boolean to show if there is another available
* character in the current depth (a neighbor in the tree)
- * @returns 0 or 1 (true)
+ * @return true if the character is the last one at the current depth
*/
bool isLast(const dic_elt_t &elt) const;
/**
* Returns a boolean to show if we are at the end of a word
- * (see getNext)
- * @returns 0 or 1 (true)
+ * (see getNext())
+ * @return true if this is the end of a word
*/
bool isEndOfWord(const dic_elt_t &elt) const;
@@ -132,7 +132,7 @@
const dic_elt_t getRoot() const;
/**
- * Returns the next available neighbor (see getLast)
+ * Returns the next available neighbor (see isLast())
* @returns next dictionary element at the same depth
*/
const dic_elt_t getNext(const dic_elt_t &elt) const;
@@ -292,21 +292,12 @@
void searchWordByLen(struct params_7plus1_t *params,
int i, const DAWG_EDGE *edgeptr) const;
- /**
- * Internal version of searchRegExp, needed until
- * wide chars are supported by our regexp engine.
- */
- void searchRegExpInner(const string &iRegexp,
- vector<string> &oWordList,
- struct search_RegE_list_t *iList,
- unsigned int iMaxResults) const;
-
/// Helper for searchRegExp()
template <typename DAWG_EDGE>
void searchRegexpRecTempl(struct params_regexp_t *params,
int state,
const DAWG_EDGE *edgeptr,
- vector<string> &oWordList,
+ vector<wstring> &oWordList,
unsigned int iMaxResults) const;
};
Index: dic/dic_search.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic_search.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- dic/dic_search.cpp 2 Mar 2008 18:45:10 -0000 1.4
+++ dic/dic_search.cpp 7 Jul 2008 17:30:00 -0000 1.5
@@ -1,7 +1,8 @@
/*****************************************************************************
* Eliot
- * Copyright (C) 2002-2007 Antoine Fraboulet
+ * Copyright (C) 2002-2008 Antoine Fraboulet & Olivier Teulière
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
+ * Olivier Teulière <ipkiss @@ gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,13 +19,6 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
-/**
- * \file dic_search.c
- * \brief Dictionary lookup functions
- * \author Antoine Fraboulet
- * \date 2002
- */
-
#include <cstdlib>
#include <cstring>
#include <cwchar>
@@ -35,22 +29,13 @@
#include "header.h"
#include "encoding.h"
#include "regexp.h"
-#include "libdic_a-ery.h" /* generated by bison */
-#include "libdic_a-erl.h" /* generated by flex */
#include "automaton.h"
+#include "grammar.h"
static const unsigned int DEFAULT_VECT_ALLOC = 100;
-/**
- * Function prototype for bison generated parser
- */
-int regexpparse(yyscan_t scanner, NODE** root,
- struct search_RegE_list_t *iList,
- struct regexp_error_report_t *err);
-
-
template <typename DAWG_EDGE>
const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE
*eptr) const
{
@@ -469,7 +454,7 @@
int maxlength;
Automaton *automaton_field;
struct search_RegE_list_t *charlist;
- char word[DIC_WORD_MAX];
+ wchar_t word[DIC_WORD_MAX];
int wordlen;
};
@@ -478,7 +463,7 @@
void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
int state,
const DAWG_EDGE *edgeptr,
- vector<string> &oWordList,
+ vector<wstring> &oWordList,
unsigned int iMaxResults) const
{
if (iMaxResults && oWordList.size() >= iMaxResults)
@@ -488,7 +473,7 @@
/* if we have a valid word we store it */
if (params->automaton_field->accept(state) && edgeptr->term)
{
- int l = strlen(params->word);
+ int l = wcslen(params->word);
if (params->minlength <= l &&
params->maxlength >= l)
{
@@ -504,21 +489,24 @@
/* 1: the letter appears in the automaton as is */
if (next_state)
{
- params->word[params->wordlen] = current->chr + 'a' - 1;
+ params->word[params->wordlen] = current->chr + L'a' - 1;
params->wordlen ++;
searchRegexpRecTempl(params, next_state, current, oWordList,
iMaxResults);
params->wordlen --;
- params->word[params->wordlen] = '\0';
+ params->word[params->wordlen] = L'\0';
}
} while (!(*current++).last);
}
-void Dictionary::searchRegExpInner(const string &iRegexp,
- vector<string> &oWordList,
+void Dictionary::searchRegExp(const wstring &iRegexp,
+ vector<wstring> &oWordList,
struct search_RegE_list_t *iList,
unsigned int iMaxResults) const
{
+ if (iRegexp == L"")
+ return;
+
// Allocate room for all the results
if (iMaxResults)
oWordList.reserve(iMaxResults);
@@ -528,9 +516,6 @@
int ptl[REGEXP_MAX+1];
int PS [REGEXP_MAX+1];
- /* (expr)# */
- char stringbuf[250];
- sprintf(stringbuf, "(%s)#", iRegexp.c_str());
for (int i = 0; i < REGEXP_MAX; i++)
{
PS[i] = 0;
@@ -543,33 +528,28 @@
report.msg[0] = '\0';
/* parsing */
- yyscan_t scanner;
- regexplex_init( &scanner );
- YY_BUFFER_STATE buf = regexp_scan_string(stringbuf, scanner);
- NODE *root = NULL;
- int value = regexpparse(scanner , &root, iList, &report);
- regexp_delete_buffer(buf, scanner);
- regexplex_destroy(scanner);
+ Node *root = NULL;
+ bool parsingOk = parseRegexp(*this, (iRegexp + L"#").c_str(), &root,
iList);
- if (value)
+ if (!parsingOk)
{
-#ifdef DEBUG_FLEX_IS_BROKEN
+#if 0
fprintf(stderr, "parser error at pos %d - %d: %s\n",
report.pos1, report.pos2, report.msg);
#endif
- regexp_delete_tree(root);
- return ;
+ delete root;
+ return;
}
int n = 1;
int p = 1;
- regexp_parcours(root, &p, &n, ptl);
+ root->traverse(p, n, ptl);
PS [0] = p - 1;
ptl[0] = p - 1;
- regexp_possuivante(root, PS);
+ root->nextPos(PS);
- Automaton *a = new Automaton(root->PP, ptl, PS, iList);
+ Automaton *a = new Automaton(root->getFirstPos(), ptl, PS, iList);
if (a)
{
struct params_regexp_t params;
@@ -577,7 +557,7 @@
params.maxlength = iList->maxlength;
params.automaton_field = a;
params.charlist = iList;
- memset(params.word, '\0', sizeof(params.word));
+ memset(params.word, L'\0', sizeof(params.word));
params.wordlen = 0;
if (getHeader().getVersion() == 0)
{
@@ -592,32 +572,6 @@
delete a;
}
- regexp_delete_tree(root);
-}
-
-
-void Dictionary::searchRegExp(const wstring &iRegexp,
- vector<wstring> &oWordList,
- struct search_RegE_list_t *iList,
- unsigned int iMaxResults) const
-{
- if (iRegexp == L"")
- return;
-
- // Allocate room for all the results
- if (iMaxResults)
- oWordList.reserve(iMaxResults);
- else
- oWordList.reserve(DEFAULT_VECT_ALLOC);
-
- vector<string> tmpWordList;
- // Do the actual work
- searchRegExpInner(convertToMb(iRegexp), tmpWordList, iList, iMaxResults);
-
- vector<string>::const_iterator it;
- for (it = tmpWordList.begin(); it != tmpWordList.end(); it++)
- {
- oWordList.push_back(convertToWc(*it));
- }
+ delete root;
}
Index: dic/regexp.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- dic/regexp.cpp 8 Jan 2008 13:52:36 -0000 1.2
+++ dic/regexp.cpp 7 Jul 2008 17:30:01 -0000 1.3
@@ -39,115 +39,80 @@
#include "regexp.h"
#include "automaton.h"
-#ifndef PDBG
-#ifdef DEBUG_RE2
-#define PDBG(x) x
-#else
-#define PDBG(x)
-#endif
-#endif
-
-NODE* regexp_createNODE(int type, char v, NODE *fg, NODE *fd)
+Node::Node(int type, char v, Node *fg, Node *fd)
+ : m_type(type), m_var(v), m_fg(fg), m_fd(fd), m_number(0), m_position(0),
+ m_annulable(false), m_PP(0), m_DP(0)
{
- NODE *x;
- x=(NODE *)malloc(sizeof(NODE));
- x->type = type;
- x->var = v;
- x->fd = fd;
- x->fg = fg;
- x->number = 0;
- x->position = 0;
- x->annulable = 0;
- x->PP = 0;
- x->DP = 0;
- return x;
}
-void regexp_delete_tree(NODE *root)
+Node::~Node()
{
- if (root == NULL)
- return;
- regexp_delete_tree(root->fg);
- regexp_delete_tree(root->fd);
- free(root);
+ delete m_fg;
+ delete m_fd;
}
-#ifdef DEBUG_RE
-static void print_node(FILE*, NODE *n, int detail);
-#endif
/**
- * computes position, annulable, PP, DP attributes
- * @param r = root
- * @param p = current leaf position
- * @param n = current node number
- * @param ptl = position to letter
+ * p is the current leaf position
+ * n is the current node number
*/
-
-void regexp_parcours(NODE* r, int *p, int *n, int ptl[])
+void Node::traverse(int &p, int &n, int ptl[])
{
- if (r == NULL)
- return;
+ if (m_fg)
+ m_fg->traverse(p, n, ptl);
+ if (m_fd)
+ m_fd->traverse(p, n, ptl);
- regexp_parcours(r->fg, p, n, ptl);
- regexp_parcours(r->fd, p, n, ptl);
+ m_number = n;
+ ++n;
- switch (r->type)
+ switch (m_type)
{
case NODE_VAR:
- r->position = *p;
- ptl[*p] = r->var;
- *p = *p + 1;
- r->annulable = 0;
- r->PP = 1 << (r->position - 1);
- r->DP = 1 << (r->position - 1);
+ m_position = p;
+ ptl[p] = m_var;
+ ++p;
+ m_annulable = false;
+ m_PP = 1 << (m_position - 1);
+ m_DP = 1 << (m_position - 1);
break;
case NODE_OR:
- r->position = 0;
- r->annulable = r->fg->annulable || r->fd->annulable;
- r->PP = r->fg->PP | r->fd->PP;
- r->DP = r->fg->DP | r->fd->DP;
+ m_position = 0;
+ m_annulable = m_fg->m_annulable || m_fd->m_annulable;
+ m_PP = m_fg->m_PP | m_fd->m_PP;
+ m_DP = m_fg->m_DP | m_fd->m_DP;
break;
case NODE_AND:
- r->position = 0;
- r->annulable = r->fg->annulable && r->fd->annulable;
- r->PP = (r->fg->annulable) ? (r->fg->PP | r->fd->PP) : r->fg->PP;
- r->DP = (r->fd->annulable) ? (r->fg->DP | r->fd->DP) : r->fd->DP;
+ m_position = 0;
+ m_annulable = m_fg->m_annulable && m_fd->m_annulable;
+ m_PP = (m_fg->m_annulable) ? (m_fg->m_PP | m_fd->m_PP) :
m_fg->m_PP;
+ m_DP = (m_fd->m_annulable) ? (m_fg->m_DP | m_fd->m_DP) :
m_fd->m_DP;
break;
case NODE_PLUS:
- r->position = 0;
- r->annulable = 0;
- r->PP = r->fg->PP;
- r->DP = r->fg->DP;
+ m_position = 0;
+ m_annulable = false;
+ m_PP = m_fg->m_PP;
+ m_DP = m_fg->m_DP;
break;
case NODE_STAR:
- r->position = 0;
- r->annulable = 1;
- r->PP = r->fg->PP;
- r->DP = r->fg->DP;
+ m_position = 0;
+ m_annulable = true;
+ m_PP = m_fg->m_PP;
+ m_DP = m_fg->m_DP;
break;
}
-
- r->number = *n;
- *n = *n + 1;
}
-/**
- * computes possuivante
- * @param r = root
- * @param PS = next position
- */
-void regexp_possuivante(NODE* r, int PS[])
+void Node::nextPos(int PS[])
{
- if (r == NULL)
- return;
+ if (m_fg)
+ m_fg->nextPos(PS);
+ if (m_fd)
+ m_fd->nextPos(PS);
- regexp_possuivante(r->fg, PS);
- regexp_possuivante(r->fd, PS);
-
- switch (r->type)
+ switch (m_type)
{
case NODE_AND:
/************************************/
@@ -156,8 +121,8 @@
/************************************/
for (int pos = 1; pos <= PS[0]; pos++)
{
- if (r->fg->DP & (1 << (pos-1)))
- PS[pos] |= r->fd->PP;
+ if (m_fg->m_DP & (1 << (pos-1)))
+ PS[pos] |= m_fd->m_PP;
}
break;
case NODE_PLUS:
@@ -168,8 +133,8 @@
/************************************/
for (int pos = 1; pos <= PS[0]; pos++)
{
- if (r->DP & (1 << (pos-1)))
- PS[pos] |= r->PP;
+ if (m_DP & (1 << (pos-1)))
+ PS[pos] |= m_PP;
}
break;
case NODE_STAR:
@@ -179,32 +144,27 @@
/************************************/
for (int pos = 1; pos <= PS[0]; pos++)
{
- if (r->DP & (1 << (pos-1)))
- PS[pos] |= r->PP;
+ if (m_DP & (1 << (pos-1)))
+ PS[pos] |= m_PP;
}
break;
}
}
-/*////////////////////////////////////////////////
+////////////////////////////////////////////////
// DEBUG only fonctions
-////////////////////////////////////////////////*/
+////////////////////////////////////////////////
#ifdef DEBUG_RE
-void regexp_print_PS(int PS[])
+void printPS(int PS[])
{
- printf("** positions suivantes **\n");
+ printf("** next positions **\n");
for (int i = 1; i <= PS[0]; i++)
{
printf("%02d: 0x%08x\n", i, PS[i]);
}
}
-#endif
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
-
-#ifdef DEBUG_RE
void regexp_print_ptl(int ptl[])
{
printf("** pos -> lettre: ");
@@ -216,8 +176,6 @@
}
#endif
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
void regexp_print_letter(FILE* f, char l)
{
@@ -239,8 +197,6 @@
}
}
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
void regexp_print_letter2(FILE* f, char l)
{
@@ -262,19 +218,14 @@
}
}
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
#ifdef DEBUG_RE
-static void print_node(FILE* f, NODE *n, int detail)
+void Node::printNode(FILE* f, int detail) const
{
- if (n == NULL)
- return;
-
- switch (n->type)
+ switch (m_type)
{
case NODE_VAR:
- regexp_print_letter(f, n->var);
+ regexp_print_letter(f, m_var);
break;
case NODE_OR:
fprintf(f, "OR");
@@ -292,71 +243,54 @@
if (detail == 2)
{
fprintf(f, "\\n pos=%d\\n annul=%d\\n PP=0x%04x\\n DP=0x%04x",
- n->position, n->annulable, n->PP, n->DP);
+ m_position, m_annulable, m_PP, m_DP);
}
}
-#endif
-
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
-#ifdef DEBUG_RE
-static void print_tree_nodes(FILE* f, NODE* n, int detail)
+void Node::printNodesRec(FILE* f, int detail) const
{
- if (n == NULL)
- return;
+ if (m_fg)
+ m_fg->printNodesRec(f, detail);
+ if (m_fd)
+ m_fd->printNodesRec(f, detail);
- print_tree_nodes(f, n->fg, detail);
- print_tree_nodes(f, n->fd, detail);
-
- fprintf(f, "%d [ label=\"", n->number);
- print_node(f, n, detail);
+ fprintf(f, "%d [ label=\"", m_number);
+ printNode(f, detail);
fprintf(f, "\"];\n");
}
-#endif
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
-
-#ifdef DEBUG_RE
-static void print_tree_edges(FILE *f, NODE *n)
+void Node::printEdgesRec(FILE *f) const
{
- if (n == NULL)
- return;
-
- print_tree_edges(f, n->fg);
- print_tree_edges(f, n->fd);
+ if (m_fg)
+ m_fg->printEdgesRec(f);
+ if (m_fd)
+ m_fd->printEdgesRec(f);
- switch (n->type)
+ switch (m_type)
{
case NODE_OR:
- fprintf(f, "%d -> %d;", n->number, n->fg->number);
- fprintf(f, "%d -> %d;", n->number, n->fd->number);
+ fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
+ fprintf(f, "%d -> %d;", m_number, m_fd->m_number);
break;
case NODE_AND:
- fprintf(f, "%d -> %d;", n->number, n->fg->number);
- fprintf(f, "%d -> %d;", n->number, n->fd->number);
+ fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
+ fprintf(f, "%d -> %d;", m_number, m_fd->m_number);
break;
case NODE_PLUS:
case NODE_STAR:
- fprintf(f, "%d -> %d;", n->number, n->fg->number);
+ fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
break;
}
}
-#endif
-
-/*////////////////////////////////////////////////
-////////////////////////////////////////////////*/
-#ifdef DEBUG_RE
-void regexp_print_tree(NODE* n, const string &iName, int detail)
+void Node::printTreeDot(const string &iFileName, int detail) const
{
- FILE *f = fopen(iName.c_str(), "w");
+ FILE *f = fopen(iFileName.c_str(), "w");
if (f == NULL)
return;
- fprintf(f, "digraph %s {\n", iName.c_str());
- print_tree_nodes(f, n, detail);
- print_tree_edges(f, n);
+ fprintf(f, "digraph %s {\n", iFileName.c_str());
+ printNodesRec(f, detail);
+ printEdgesRec(f);
fprintf(f, "fontsize=20;\n");
fprintf(f, "}\n");
fclose(f);
@@ -369,7 +303,7 @@
}
else if (pid == 0)
{
- execlp("dotty", "dotty", iName.c_str(), NULL);
+ execlp("dotty", "dotty", iFileName.c_str(), NULL);
printf("exec dotty failed\n");
exit(1);
}
Index: dic/regexp.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.h,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -b -r1.13 -r1.14
--- dic/regexp.h 8 Jan 2008 13:52:36 -0000 1.13
+++ dic/regexp.h 7 Jul 2008 17:30:01 -0000 1.14
@@ -28,6 +28,8 @@
#ifndef _REGEXP_H_
#define _REGEXP_H_
+#include <string>
+
#define NODE_TOP 0
#define NODE_VAR 1
#define NODE_OR 2
@@ -35,31 +37,86 @@
#define NODE_STAR 4
#define NODE_PLUS 5
+using std::string;
-typedef struct node
+class Node
{
- int type;
- char var;
- struct node *fg;
- struct node *fd;
- int number;
- int position;
- int annulable;
- int PP;
- int DP;
-} NODE;
+public:
+ /**
+ * Create a node for the syntactic tree used for
+ * parsing regular expressions
+ */
+ Node(int type, char v, Node *fg, Node *fd);
/**
- * different letters in the dictionary
+ * Delete regexp syntactic tree
+ */
+ ~Node();
+
+ /**
+ * Computes positions, first positions (PP), last position (DP),
+ * and annulable attribute
+ *
+ * @param p : max position found in the tree (must be initialized to 1)
+ * @param n : number of nodes in the tree (must be initialized to 1)
+ * @param ptl : position to letter translation table
+ */
+ void traverse(int &p, int &n, int ptl[]);
+
+ /**
+ * Computes 'next position' table used for building the
+ * automaton
+ * @param r : root node of the syntactic tree
+ * @param PS : next position table, PS[0] must contain the
+ * number of terminals contained in the regular expression
*/
-#define DIC_LETTERS 27
+ void nextPos(int PS[]);
+
+ /// Return the first position
+ int getFirstPos() const { return m_PP; }
+#ifdef DEBUG_RE
/**
+ * Print the tree rooted at the current node to a file suitable
+ * for dot (Graphviz)
+ */
+ void printTreeDot(const string &iFileName, int detail) const;
+#endif
+
+private:
+ int m_type;
+ char m_var;
+ Node *m_fg;
+ Node *m_fd;
+ int m_number;
+ int m_position;
+ bool m_annulable;
+ int m_PP;
+ int m_DP;
+
+#ifdef DEBUG_RE
+ /// Print the current node to file
+ void printNode(FILE* f, int detail) const;
+
+ /// Print recursively the current node and its subnodes to file
+ void printNodesRec(FILE *f, int detail) const;
+
+ /// Print recursively the edges of the tree rooted at the current node
+ void printEdgesRec(FILE *f) const;
+#endif
+};
+
+/**
+ * different letters in the dictionary
+ */
+#define DIC_LETTERS 63
+
+/**
* maximum number of accepted terminals in regular expressions
*/
#define REGEXP_MAX 32
- /**
+/**
* special terminals that should not appear in the dictionary
*/
#define RE_EPSILON (DIC_LETTERS + 0)
@@ -70,7 +127,7 @@
#define RE_USR1_MATCH (DIC_LETTERS + 5)
#define RE_USR2_MATCH (DIC_LETTERS + 6)
- /**
+/**
* number of lists for regexp letter match \n
* 0 : all tiles \n
* 1 : vowels \n
@@ -81,12 +138,13 @@
*/
#define DIC_SEARCH_REGE_LIST (REGEXP_MAX)
- /**
+/**
* Structure used for Dic_search_RegE \n
* this structure is used to explicit letters list that will be matched
* against special tokens in the regular expression search
*/
-struct search_RegE_list_t {
+struct search_RegE_list_t
+{
/** maximum length for results */
int minlength;
/** maximum length for results */
@@ -94,9 +152,9 @@
/** special symbol associated with the list */
char symbl[DIC_SEARCH_REGE_LIST];
/** 0 or 1 if list is valid */
- int valid[DIC_SEARCH_REGE_LIST];
+ bool valid[DIC_SEARCH_REGE_LIST];
/** 0 or 1 if letter is present in the list */
- char letters[DIC_SEARCH_REGE_LIST][DIC_LETTERS];
+ bool letters[DIC_SEARCH_REGE_LIST][DIC_LETTERS];
};
#define RE_LIST_ALL_MATCH 0
@@ -105,39 +163,10 @@
#define RE_LIST_USER_BEGIN 3
#define RE_LIST_USER_END 4
- /**
- * Create a node for the syntactic tree used for
- * parsing regular expressions \n
- * The fonction is called by bison grammar rules
- */
-NODE* regexp_createNODE(int type,char v,NODE *fg,NODE *fd);
-
- /**
- * delete regexp syntactic tree
- */
-void regexp_delete_tree(NODE * root);
-
- /**
- * Computes positions, first positions (PP), last position (DP)
- * and translation table 'position to letter' (ptl)
- * @param p : max position found in the tree (must be initialized to 1)
- * @param n : number of nodes in the tree (must be initialized to 1)
- * @param ptl : position to letter translation table
- */
-void regexp_parcours(NODE* r, int *p, int *n, int ptl[]);
-
- /**
- * Computes 'next position' table used for building the
- * automaton
- * @param r : root node of the syntactic tree
- * @param PS : next position table, PS[0] must contain the
- * number of terminals contained in the regular expression
- */
-void regexp_possuivante(NODE* r, int PS[]);
-
#define MAX_REGEXP_ERROR_LENGTH 500
-struct regexp_error_report_t {
+struct regexp_error_report_t
+{
int pos1;
int pos2;
char msg[MAX_REGEXP_ERROR_LENGTH];
@@ -149,7 +178,6 @@
void regexp_print_letter2(FILE* f, char l);
void regexp_print_PS(int PS[]);
void regexp_print_ptl(int ptl[]);
-void regexp_print_tree(NODE* n, char* name, int detail);
#endif /* _REGEXP_H_ */
Index: dic/regexpmain.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexpmain.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- dic/regexpmain.cpp 2 Mar 2008 18:45:11 -0000 1.3
+++ dic/regexpmain.cpp 7 Jul 2008 17:30:01 -0000 1.4
@@ -40,62 +40,40 @@
#endif
#include "dic.h"
+#include "header.h"
#include "regexp.h"
#include "encoding.h"
-#define __UNUSED__ __attribute__((unused))
-
-/********************************************************/
-/********************************************************/
-/********************************************************/
-
-const unsigned int all_letter[DIC_LETTERS] =
-{
- /* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 */
- /* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 */
- /* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
- 0,1,1,1,1, 1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1, 1, 1, 1, 1
-};
-
-const unsigned int vowels[DIC_LETTERS] =
-{
- /* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
- 0,1,0,0,0, 1,0,0,0,1,0, 0,0,0,0,1,0,0,0,0,0,1,0, 0, 0, 1, 0
-};
-
-const unsigned int consonants[DIC_LETTERS] =
+void init_letter_lists(const Dictionary &iDic, struct search_RegE_list_t
*iList)
{
- /* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
- 0,0,1,1,1, 0,1,1,1,0,1, 1,1,1,1,0,1,1,1,1,1,0,1, 1, 1, 1, 1
-};
-
-void init_letter_lists(struct search_RegE_list_t *iList)
-{
- memset (iList, 0, sizeof(*iList));
+ memset(iList, 0, sizeof(*iList));
iList->minlength = 1;
iList->maxlength = 15;
- iList->valid[0] = 1; // all letters
+ iList->valid[0] = true; // all letters
iList->symbl[0] = RE_ALL_MATCH;
- iList->valid[1] = 1; // vowels
+ iList->valid[1] = true; // vowels
iList->symbl[1] = RE_VOWL_MATCH;
- iList->valid[2] = 1; // consonants
+ iList->valid[2] = true; // consonants
iList->symbl[2] = RE_CONS_MATCH;
- for (int i = 0; i < DIC_LETTERS; i++)
- {
- iList->letters[0][i] = all_letter[i];
- iList->letters[1][i] = vowels[i];
- iList->letters[2][i] = consonants[i];
+ iList->letters[0][0] = false;
+ iList->letters[1][0] = false;
+ iList->letters[2][0] = false;
+ const wstring &allLetters = iDic.getHeader().getLetters();
+ for (size_t i = 1; i <= allLetters.size(); ++i)
+ {
+ iList->letters[0][i] = true;
+ iList->letters[1][i] = iDic.getHeader().isVowel(i);
+ iList->letters[2][i] = iDic.getHeader().isConsonant(i);
}
- iList->valid[3] = 0; // user defined list 1
+
+ iList->valid[3] = false; // user defined list 1
iList->symbl[3] = RE_USR1_MATCH;
- iList->valid[4] = 0; // user defined list 2
+ iList->valid[4] = false; // user defined list 2
iList->symbl[4] = RE_USR2_MATCH;
}
-/********************************************************/
-/********************************************************/
-/********************************************************/
+
void usage(const char *iBinaryName)
{
cerr << _("usage: %s dictionary") << iBinaryName << endl;
@@ -142,7 +120,7 @@
break;
/* automaton */
- init_letter_lists(&regList);
+ init_letter_lists(dic, &regList);
vector<wstring> wordList;
dic.searchRegExp(convertToWc(er), wordList, &regList);
@@ -163,7 +141,7 @@
}
catch (...)
{
- std::cerr << "Unkown exception taken" << endl;
+ std::cerr << "Unknown exception taken" << endl;
return 1;
}
}
Index: test/regexp.input
===================================================================
RCS file: /cvsroot/eliot/eliot/test/regexp.input,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- test/regexp.input 1 Jan 2006 19:25:10 -0000 1.1
+++ test/regexp.input 7 Jul 2008 17:30:02 -0000 1.2
@@ -12,5 +12,11 @@
x .*(cba)+b
x .*(nn)+.*
x .*(nn)+.*x 200
+x ne.
+x ne:v:
+x ne:v:?
+x ne:c:s
+x (ass)+..
+x c:v:+p
q
Index: test/regexp.ref
===================================================================
RCS file: /cvsroot/eliot/eliot/test/regexp.ref,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- test/regexp.ref 8 Jan 2008 13:52:41 -0000 1.2
+++ test/regexp.ref 7 Jul 2008 17:30:02 -0000 1.3
@@ -539,4 +539,52 @@
vanneaux
vicennaux
57 printed results
+commande> x ne.
+search for ne. (50,1,15)
+nee
+nef
+nem
+neo
+nes
+net
+ney
+nez
+8 printed results
+commande> x ne:v:
+search for ne:v: (50,1,15)
+nee
+neo
+ney
+3 printed results
+commande> x ne:v:?
+search for ne:v:? (50,1,15)
+ne
+nee
+neo
+ney
+4 printed results
+commande> x ne:c:s
+search for ne:c:s (50,1,15)
+nefs
+nems
+nets
+news
+neys
+5 printed results
+commande> x (ass)+..
+search for (ass)+.. (50,1,15)
+assai
+assassin
+assec
+asses
+assez
+assis
+assit
+7 printed results
+commande> x c:v:+p
+search for c:v:+p (50,1,15)
+cap
+cep
+coup
+3 printed results
commande> q
Index: utils/eliottxt.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/utils/eliottxt.cpp,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -b -r1.21 -r1.22
--- utils/eliottxt.cpp 2 Mar 2008 18:45:11 -0000 1.21
+++ utils/eliottxt.cpp 7 Jul 2008 17:30:02 -0000 1.22
@@ -800,11 +800,11 @@
llist.symbl[3] = RE_USR1_MATCH;
llist.symbl[5] = RE_USR2_MATCH;
- llist.valid[0] = 1; // all letters
- llist.valid[1] = 1; // vowels
- llist.valid[2] = 1; // consonants
- llist.valid[3] = 0; // user defined list 1
- llist.valid[4] = 0; // user defined list 2
+ llist.valid[0] = true; // all letters
+ llist.valid[1] = true; // vowels
+ llist.valid[2] = true; // consonants
+ llist.valid[3] = false; // user defined list 1
+ llist.valid[4] = false; // user defined list 2
for (int i = 0; i < DIC_SEARCH_REGE_LIST; i++)
{
Index: dic/grammar.cpp
===================================================================
RCS file: dic/grammar.cpp
diff -N dic/grammar.cpp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ dic/grammar.cpp 7 Jul 2008 17:30:00 -0000 1.1
@@ -0,0 +1,340 @@
+/*****************************************************************************
+ * Eliot
+ * Copyright (C) 2008 Olivier Teulière
+ * Authors: Olivier Teulière <ipkiss @@ gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *****************************************************************************/
+
+#include <string>
+#include <stack>
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/utility/chset.hpp>
+#include <boost/spirit/tree/ast.hpp>
+#ifdef DEBUG_RE
+#include <boost/spirit/tree/tree_to_xml.hpp>
+#include <map>
+#include <iostream>
+#endif
+
+#include "dic.h"
+#include "header.h"
+#include "regexp.h"
+
+using namespace boost::spirit;
+using namespace std;
+
+// TODO:
+// - error handling
+
+// A few typedefs to simplify things
+typedef const wchar_t *iterator_t;
+typedef tree_match<iterator_t> parse_tree_match_t;
+typedef parse_tree_match_t::const_tree_iterator iter_t;
+
+
+struct RegexpGrammar : grammar<RegexpGrammar>
+{
+ static const int wrapperId = 0;
+ static const int exprId = 1;
+ static const int repeatId = 2;
+ static const int groupId = 3;
+ static const int varId = 4;
+ static const int choiceId = 5;
+ static const int alphavarId = 6;
+
+ RegexpGrammar(const wstring &letters)
+ {
+ wstring lower = letters;
+ std::transform(lower.begin(), lower.end(), lower.begin(), towlower);
+ m_allLetters = letters + lower;
+ }
+
+ template <typename ScannerT>
+ struct definition
+ {
+ // Constructor
+ definition(const RegexpGrammar &self)
+ {
+ wrapper
+ = expr >> L"#"
+ ;
+
+ expr
+ = repeat >> *expr;
+ ;
+
+ repeat
+ = group >> root_node_d[ch_p(L'?')]
+ | group >> root_node_d[ch_p(L'*')]
+ | group >> root_node_d[ch_p(L'+')]
+ | group
+ ;
+
+ group
+ = var
+ | root_node_d[str_p(L"[^")] >> choice >> no_node_d[ch_p(L']')]
+ | root_node_d[ch_p(L'[')] >> choice >> no_node_d[ch_p(L']')]
+ | root_node_d[ch_p(L'(')] >> +repeat >> no_node_d[ch_p(L')')] // XXX: 'expr' instead of '+repeat' doesn't work. Why?
+ ;
+
+ var
+ = alphavar
+ | ch_p(L'.')
+ | str_p(L":v:")
+ | str_p(L":c:")
+ | str_p(L":1:")
+ | str_p(L":2:")
+ ;
+
+ choice
+ = leaf_node_d[+alphavar]
+ ;
+
+ alphavar
+ = chset<>(self.m_allLetters.c_str())
+ ;
+ }
+
+ rule<ScannerT, parser_context<>, parser_tag<wrapperId> > wrapper;
+ rule<ScannerT, parser_context<>, parser_tag<exprId> > expr;
+ rule<ScannerT, parser_context<>, parser_tag<repeatId> > repeat;
+ rule<ScannerT, parser_context<>, parser_tag<groupId> > group;
+ rule<ScannerT, parser_context<>, parser_tag<varId> > var;
+ rule<ScannerT, parser_context<>, parser_tag<choiceId> > choice;
+ rule<ScannerT, parser_context<>, parser_tag<alphavarId> > alphavar;
+
+ const rule<ScannerT, parser_context<>, parser_tag<wrapperId> > & start() const { return wrapper; }
+ };
+
+ wstring m_allLetters;
+};
+
+
+void evaluate(const Header &iHeader, iter_t const& i, stack<Node*> &evalStack,
+ struct search_RegE_list_t *iList, bool negate = false)
+{
+ if (i->value.id() == RegexpGrammar::alphavarId)
+ {
+ assert(i->children.size() == 0);
+
+ // Extract the character and convert it to its internal code
+ uint8_t code = iHeader.getCodeFromChar(*i->value.begin());
+ Node *n = new Node(NODE_VAR, code, NULL, NULL);
+ evalStack.push(n);
+ }
+ else if (i->value.id() == RegexpGrammar::choiceId)
+ {
+#if 0
+ assert(i->children.size() == 0);
+
+ string choiceLetters(i->value.begin(), i->value.end());
+ int j;
+ for (j = RE_LIST_USER_END + 1; j < DIC_SEARCH_REGE_LIST; j++)
+ {
+ if (!iList->valid[j])
+ {
+ iList->valid[j] = true;
+ iList->symbl[j] = RE_ALL_MATCH + j;
+ iList->letters[j][0] = false;
+ for (int k = 1; k < DIC_LETTERS; k++)
+ {
+ bool contains = (choiceLetters.find(k + L'a' - 1) != string::npos);
+ iList->letters[j][k] = (contains ? !negate : negate);
+ }
+ break;
+ }
+ }
+ Node *node = new Node(NODE_VAR, iList->symbl[j], NULL, NULL);
+ evalStack.push(node);
+#endif
+#if 1
+ assert(i->children.size() == 0);
+
+ wstring choiceLetters(i->value.begin(), i->value.end());
+ // Make sure the letters are in upper case
+ std::transform(choiceLetters.begin(), choiceLetters.end(),
+ choiceLetters.begin(), towupper);
+ // The dictionary letters are already in upper case
+ const wstring &letters = iHeader.getLetters();
+ wstring::const_iterator itLetter;
+ int j;
+ for (j = RE_LIST_USER_END + 1; j < DIC_SEARCH_REGE_LIST; ++j)
+ {
+ if (!iList->valid[j])
+ {
+ iList->valid[j] = true;
+ iList->symbl[j] = RE_ALL_MATCH + j;
+ iList->letters[j][0] = false;
+ for (itLetter = letters.begin(); itLetter != letters.end(); ++itLetter)
+ {
+ bool contains = (choiceLetters.find(*itLetter) != string::npos);
+ iList->letters[j][iHeader.getCodeFromChar(*itLetter)] =
+ (contains ? !negate : negate);
+ }
+ break;
+ }
+ }
+ Node *node = new Node(NODE_VAR, iList->symbl[j], NULL, NULL);
+ evalStack.push(node);
+#endif
+ }
+ else if (i->value.id() == RegexpGrammar::varId)
+ {
+ assert(i->children.size() == 0);
+
+ string var(i->value.begin(), i->value.end());
+ Node *node = NULL;
+ if (var == ":v:")
+ node = new Node(NODE_VAR, RE_VOWL_MATCH, NULL, NULL);
+ else if (var == ":c:")
+ node = new Node(NODE_VAR, RE_CONS_MATCH, NULL, NULL);
+ else if (var == ":1:")
+ node = new Node(NODE_VAR, RE_USR1_MATCH, NULL, NULL);
+ else if (var == ":2:")
+ node = new Node(NODE_VAR, RE_USR2_MATCH, NULL, NULL);
+ else if (var == ".")
+ node = new Node(NODE_VAR, RE_ALL_MATCH, NULL, NULL);
+ else
+ assert(0);
+
+ evalStack.push(node);
+ }
+ else if (i->value.id() == RegexpGrammar::groupId)
+ {
+ if (*i->value.begin() == L'(')
+ {
+ assert(i->children.size() != 0);
+ // Create a node for each child
+ iter_t iter;
+ for (iter = i->children.begin(); iter != i->children.end(); ++iter)
+ evaluate(iHeader, iter, evalStack, iList);
+ // "Concatenate" the created child nodes with AND nodes
+ for (uint j = 0; j < i->children.size() - 1; ++j)
+ {
+ Node *old2 = evalStack.top();
+ evalStack.pop();
+ Node *old1 = evalStack.top();
+ evalStack.pop();
+ Node *node = new Node(NODE_AND, '\0', old1, old2);
+ evalStack.push(node);
+ }
+ }
+ else if (*i->value.begin() == L'[')
+ {
+ assert(i->children.size() == 1);
+ bool hasCaret = (i->value.begin() + 1 != i->value.end());
+ evaluate(iHeader, i->children.begin(), evalStack, iList, hasCaret);
+ }
+ else
+ assert(0);
+ }
+ else if (i->value.id() == RegexpGrammar::repeatId)
+ {
+ assert(i->children.size() == 1);
+ evaluate(iHeader, i->children.begin(), evalStack, iList);
+
+ if (*i->value.begin() == L'*')
+ {
+ assert(i->children.size() == 1);
+ Node *old = evalStack.top();
+ evalStack.pop();
+ Node *node = new Node(NODE_STAR, '\0', old, NULL);
+ evalStack.push(node);
+ }
+ else if (*i->value.begin() == L'+')
+ {
+ assert(i->children.size() == 1);
+ Node *old = evalStack.top();
+ evalStack.pop();
+ Node *node = new Node(NODE_PLUS, '\0', old, NULL);
+ evalStack.push(node);
+ }
+ else if (*i->value.begin() == L'?')
+ {
+ assert(i->children.size() == 1);
+ Node *old = evalStack.top();
+ evalStack.pop();
+ Node *epsilon = new Node(NODE_VAR, RE_EPSILON, NULL, NULL);
+ Node *node = new Node(NODE_OR, '\0', old, epsilon);
+ evalStack.push(node);
+ }
+ else
+ assert(0);
+ }
+ else if (i->value.id() == RegexpGrammar::exprId)
+ {
+ assert(i->children.size() == 2);
+ evaluate(iHeader, i->children.begin(), evalStack, iList);
+ evaluate(iHeader, i->children.begin() + 1, evalStack, iList);
+
+ Node *old2 = evalStack.top();
+ evalStack.pop();
+ Node *old1 = evalStack.top();
+ evalStack.pop();
+ Node *node = new Node(NODE_AND, '\0', old1, old2);
+ evalStack.push(node);
+ }
+ else if (i->value.id() == RegexpGrammar::wrapperId)
+ {
+ assert(i->children.size() == 2);
+ evaluate(iHeader, i->children.begin(), evalStack, iList);
+ Node *old = evalStack.top();
+ evalStack.pop();
+ Node* sharp = new Node(NODE_VAR, RE_FINAL_TOK, NULL, NULL);
+ Node *node = new Node(NODE_AND, '\0', old, sharp);
+ evalStack.push(node);
+ }
+ else
+ {
+ assert(0);
+ }
+}
+
+
+bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList)
+{
+ // Create a grammar object
+ RegexpGrammar g(iDic.getHeader().getLetters());
+ // Parse the input and generate an Abstract Syntax Tree (AST)
+ tree_parse_info<const wchar_t*> info = ast_parse(input, g);
+
+ if (info.full)
+ {
+#ifdef DEBUG_RE
+ // Dump parse tree as XML
+ std::map<parser_id, std::string> rule_names;
+ rule_names[RegexpGrammar::wrapperId] = "wrapper";
+ rule_names[RegexpGrammar::exprId] = "expr";
+ rule_names[RegexpGrammar::repeatId] = "repeat";
+ rule_names[RegexpGrammar::groupId] = "group";
+ rule_names[RegexpGrammar::varId] = "var";
+ rule_names[RegexpGrammar::choiceId] = "choice";
+ rule_names[RegexpGrammar::alphavarId] = "alphavar";
+ tree_to_xml(cout, info.trees);
+#endif
+
+ stack<Node*> evalStack;
+ evaluate(iDic.getHeader(), info.trees.begin(), evalStack, iList);
+ assert(evalStack.size() == 1);
+ *root = evalStack.top();
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+}
+
Index: dic/grammar.h
===================================================================
RCS file: dic/grammar.h
diff -N dic/grammar.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ dic/grammar.h 7 Jul 2008 17:30:01 -0000 1.1
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * Eliot
+ * Copyright (C) 2008 Olivier Teulière
+ * Authors: Olivier Teulière <ipkiss @@ gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *****************************************************************************/
+
+#ifndef _GRAMMAR_H_
+#define _GRAMMAR_H_
+
+class Dictionary;
+class Node;
+struct search_RegE_list_t;
+
+bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList);
+
+#endif
+
Index: dic/erl.lpp
===================================================================
RCS file: dic/erl.lpp
diff -N dic/erl.lpp
--- dic/erl.lpp 8 Jan 2008 13:52:35 -0000 1.2
+++ /dev/null 1 Jan 1970 00:00:00 -0000
@@ -1,59 +0,0 @@
-%{
-/*****************************************************************************
- * Eliot
- * Copyright (C) 2005-2007 Antoine Fraboulet
- * Authors: Antoine Fraboulet
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *****************************************************************************/
-
-#include "dic.h"
-#include "regexp.h"
-#include "libdic_a-ery.h"
-
-#define MASK_TO_REMOVE 0x1F
-
-%}
-%option prefix="regexp"
-%option outfile="lex.yy.c"
-%option header-file="libdic_a-erl.h"
-%option reentrant bison-bridge
-%option bison-locations
-%option noyywrap nounput
-
-/* TODO : remove lexer translation */
-alphabet [a-zA-Z]
-%%
-
-{alphabet} {yylval_param->c=(yytext[0]&MASK_TO_REMOVE); return LEX_CHAR;}
-"[" {return LEX_L_SQBRACKET;}
-"]" {return LEX_R_SQBRACKET;}
-"(" {return LEX_L_BRACKET;}
-")" {return LEX_R_BRACKET;}
-"^" {return LEX_HAT;}
-
-"." {return LEX_ALL;}
-":v:" {return LEX_VOWL;}
-":c:" {return LEX_CONS;}
-":1:" {return LEX_USER1;}
-":2:" {return LEX_USER2;}
-
-"?" {return LEX_QMARK;}
-"+" {return LEX_PLUS;}
-"*" {return LEX_STAR;}
-
-"#" {return LEX_SHARP;}
-%%
-
Index: dic/ery.ypp
===================================================================
RCS file: dic/ery.ypp
diff -N dic/ery.ypp
--- dic/ery.ypp 8 Jan 2008 13:52:35 -0000 1.2
+++ /dev/null 1 Jan 1970 00:00:00 -0000
@@ -1,295 +0,0 @@
-%{
-/*****************************************************************************
- * Eliot
- * Copyright (C) 2005-2007 Antoine Fraboulet
- * Authors: Antoine Fraboulet
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *****************************************************************************/
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <malloc.h>
-
-#include "dic.h"
-#include "regexp.h"
-#include "libdic_a-ery.h"
-#include "libdic_a-erl.h"
-
-/* ************************************************** */
-/* ************************************************** */
-/* ************************************************** */
-
- /**
- * function prototype for parser generated by bison
- */
-int regexpparse(yyscan_t scanner, NODE** root,
- struct search_RegE_list_t *list,
- struct regexp_error_report_t *err);
-
- /**
- * function prototype for error reporting
- */
-void regexperror(YYLTYPE *llocp, yyscan_t scanner, NODE** root,
- struct search_RegE_list_t *list,
- struct regexp_error_report_t *err,
- char const *msg);
-
-/* ************************************************** */
-/* ************************************************** */
-/* ************************************************** */
-
-%}
-%union {
- char c;
- NODE *NODE_TYPE;
- char letters[DIC_LETTERS];
-};
-
-%defines
-%name-prefix="regexp"
-%pure-parser
-%locations
-%parse-param {yyscan_t yyscanner}
-%parse-param {NODE **root}
-%parse-param {struct search_RegE_list_t *list}
-%parse-param {struct regexp_error_report_t *err}
-%lex-param {yyscan_t yyscanner}
-
-%token <c> LEX_CHAR
-%token LEX_ALL
-%token LEX_VOWL
-%token LEX_CONS
-%token LEX_USER1
-%token LEX_USER2
-
-%token LEX_L_SQBRACKET LEX_R_SQBRACKET
-%token LEX_L_BRACKET LEX_R_BRACKET
-%token LEX_HAT
-
-%token LEX_QMARK
-%token LEX_PLUS
-%token LEX_STAR
-%token LEX_SHARP
-
-%type <NODE_TYPE> var
-%type <NODE_TYPE> expr
-%type <letters> vardis
-%type <letters> exprdis
-%type <NODE_TYPE> exprdisnode
-%start start
-%%
-
-start: LEX_L_BRACKET expr LEX_R_BRACKET LEX_SHARP
- {
- NODE* sharp = regexp_createNODE(NODE_VAR,RE_FINAL_TOK,NULL,NULL);
- *root = regexp_createNODE(NODE_AND,'\0',$2,sharp);
- YYACCEPT;
- }
- ;
-
-
-expr : var
- {
- $$=$1;
- }
- | expr expr
- {
- $$=regexp_createNODE(NODE_AND,'\0',$1,$2);
- }
- | var LEX_QMARK
- {
- NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
- $$=regexp_createNODE(NODE_OR,'\0',$1,epsilon);
- }
- | var LEX_PLUS
- {
- $$=regexp_createNODE(NODE_PLUS,'\0',$1,NULL);
- }
- | var LEX_STAR
- {
- $$=regexp_createNODE(NODE_STAR,'\0',$1,NULL);
- }
-/* () */
- | LEX_L_BRACKET expr LEX_R_BRACKET
- {
- $$=$2;
- }
- | LEX_L_BRACKET expr LEX_R_BRACKET LEX_QMARK
- {
- NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
- $$=regexp_createNODE(NODE_OR,'\0',$2,epsilon);
- }
- | LEX_L_BRACKET expr LEX_R_BRACKET LEX_PLUS
- {
- $$=regexp_createNODE(NODE_PLUS,'\0',$2,NULL);
- }
- | LEX_L_BRACKET expr LEX_R_BRACKET LEX_STAR
- {
- $$=regexp_createNODE(NODE_STAR,'\0',$2,NULL);
- }
-/* [] */
- | LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET
- {
- $$=$2;
- }
- | LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_QMARK
- {
- NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
- $$=regexp_createNODE(NODE_OR,'\0',$2,epsilon);
- }
- | LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_PLUS
- {
- $$=regexp_createNODE(NODE_PLUS,'\0',$2,NULL);
- }
- | LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_STAR
- {
- $$=regexp_createNODE(NODE_STAR,'\0',$2,NULL);
- }
- ;
-
-
-
-var : LEX_CHAR
- {
-#ifdef DEBUG_RE_PARSE
- printf("var : lecture %c\n",$1 + 'a' -1);
-#endif
- $$=regexp_createNODE(NODE_VAR,$1,NULL,NULL);
- }
- | LEX_ALL
- {
- $$=regexp_createNODE(NODE_VAR,RE_ALL_MATCH,NULL,NULL);
- }
- | LEX_VOWL
- {
- $$=regexp_createNODE(NODE_VAR,RE_VOWL_MATCH,NULL,NULL);
- }
- | LEX_CONS
- {
- $$=regexp_createNODE(NODE_VAR,RE_CONS_MATCH,NULL,NULL);
- }
- | LEX_USER1
- {
- $$=regexp_createNODE(NODE_VAR,RE_USR1_MATCH,NULL,NULL);
- }
- | LEX_USER2
- {
- $$=regexp_createNODE(NODE_VAR,RE_USR2_MATCH,NULL,NULL);
- }
- ;
-
-
-exprdisnode : exprdis
- {
- int i,j;
-#ifdef DEBUG_RE_PARSE
- printf("exprdisnode : exprdis : ");
-#endif
- for(i=RE_LIST_USER_END + 1; i < DIC_SEARCH_REGE_LIST; i++)
- {
- if (list->valid[i] == 0)
- {
- list->valid[i] = 1;
- list->symbl[i] = RE_ALL_MATCH + i;
- list->letters[i][0] = 0;
- for(j=1; j < DIC_LETTERS; j++)
- list->letters[i][j] = $1[j] ? 1 : 0;
-#ifdef DEBUG_RE_PARSE
- printf("list %d symbl x%02x : ",i,list->symbl[i]);
- for(j=0; j < DIC_LETTERS; j++)
- if (list->letters[i][j])
- printf("%c",j+'a'-1);
- printf("\n");
-#endif
- break;
- }
- }
- $$=regexp_createNODE(NODE_VAR,list->symbl[i],NULL,NULL);
- }
- | LEX_HAT exprdis
- {
- int i,j;
-#ifdef DEBUG_RE_PARSE
- printf("exprdisnode : HAT exprdis : ");
-#endif
- for(i=RE_LIST_USER_END + 1; i < DIC_SEARCH_REGE_LIST; i++)
- {
- if (list->valid[i] == 0)
- {
- list->valid[i] = 1;
- list->symbl[i] = RE_ALL_MATCH + i;
- list->letters[i][0] = 0;
- for(j=1; j < DIC_LETTERS; j++)
- list->letters[i][j] = $2[j] ? 0 : 1;
-#ifdef DEBUG_RE_PARSE
- printf("list %d symbl x%02x : ",i,list->symbl[i]);
- for(j=0; j < DIC_LETTERS; j++)
- if (list->letters[i][j])
- printf("%c",j+'a'-1);
- printf("\n");
-#endif
- break;
- }
- }
- $$=regexp_createNODE(NODE_VAR,list->symbl[i],NULL,NULL);
- }
- ;
-
-
-exprdis: vardis
- {
- memcpy($$,$1,sizeof(char)*DIC_LETTERS);
- }
- | vardis exprdis
- {
- int i;
- for(i=0; i < DIC_LETTERS; i++)
- $$[i] = $1[i] | $2[i];
- }
- ;
-
-
-
-vardis: LEX_CHAR
- {
- int c = $1;
- memset($$,0,sizeof(char)*DIC_LETTERS);
-#ifdef DEBUG_RE_PARSE
- printf("vardis : lecture %c\n",c + 'a' -1);
-#endif
- $$[c] = 1;
- }
- ;
-
-
-%%
-
-#define UNUSED __attribute__((unused))
-
-void regexperror(YYLTYPE *llocp, yyscan_t UNUSED yyscanner, NODE UNUSED **root,
- struct search_RegE_list_t UNUSED *list,
- struct regexp_error_report_t *err, char const *msg)
-{
- err->pos1 = llocp->first_column;
- err->pos2 = llocp->last_column;
- strncpy(err->msg,msg,sizeof(err->msg));
-}
-
-/*
- * shut down the compiler
- */
-//int yy_init_globals (yyscan_t yyscanner);
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot configure.in dic/Makefile.am dic/dic.h di...,
eliot-dev <=