[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot/dic automaton.cpp automaton.h dic_search....
From: |
eliot-dev |
Subject: |
[Eliot-dev] eliot/dic automaton.cpp automaton.h dic_search.... |
Date: |
Sun, 13 Jul 2008 07:55:48 +0000 |
CVSROOT: /cvsroot/eliot
Module name: eliot
Changes by: Olivier Teulière <ipkiss> 08/07/13 07:55:48
Modified files:
dic : automaton.cpp automaton.h dic_search.cpp
regexp.cpp regexp.h regexpmain.cpp
Log message:
Various little fixes to the regexp engine to support wide characters.
It now works fine on any dictionary using non-ASCII characters!
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/automaton.cpp?cvsroot=eliot&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/automaton.h?cvsroot=eliot&r1=1.12&r2=1.13
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic_search.cpp?cvsroot=eliot&r1=1.5&r2=1.6
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.cpp?cvsroot=eliot&r1=1.3&r2=1.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.h?cvsroot=eliot&r1=1.14&r2=1.15
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexpmain.cpp?cvsroot=eliot&r1=1.4&r2=1.5
Patches:
Index: automaton.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/automaton.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- automaton.cpp 8 Jan 2008 13:52:33 -0000 1.2
+++ automaton.cpp 13 Jul 2008 07:55:47 -0000 1.3
@@ -71,7 +71,7 @@
void dump(const string &iFileName) const;
#endif
- static AutomatonHelper *ps2nfa(int iInitState, int *ptl, int *PS);
+ static AutomatonHelper *ps2nfa(uint64_t iInitState, int *ptl, uint64_t
*PS);
static AutomatonHelper *nfa2dfa(const AutomatonHelper &iNfa,
struct search_RegE_list_t *iList);
@@ -83,11 +83,11 @@
astate m_initState;
void addState(astate s);
- astate getState(const set<int> &iId) const;
+ astate getState(const set<uint64_t> &iId) const;
void printNodes(FILE* f) const;
void printEdges(FILE* f) const;
void setAccept(astate s) const;
- set<int> getSuccessor(const set<int> &S, int letter, struct
search_RegE_list_t *iList) const;
+ set<uint64_t> getSuccessor(const set<uint64_t> &S, int letter, struct
search_RegE_list_t *iList) const;
};
@@ -95,14 +95,14 @@
State handling
* ************************************************** */
-static set<int> s_state_id_create(int id);
-static string s_state_id_to_str(const set<int> &iId);
-static astate s_state_create (const set<int> &iId);
+static set<uint64_t> s_state_id_create(uint64_t id);
+static string s_state_id_to_str(const set<uint64_t> &iId);
+static astate s_state_create (const set<uint64_t> &iId);
struct automaton_state_t
{
- set<int> id;
- int accept;
+ set<uint64_t> id;
+ bool accept;
int id_static;
astate next[MAX_TRANSITION_LETTERS];
};
@@ -112,7 +112,7 @@
Definition of the Automaton class
* ************************************************** */
-Automaton::Automaton(int iInitState, int *ptl, int *PS, struct
search_RegE_list_t *iList)
+Automaton::Automaton(uint64_t iInitState, int *ptl, uint64_t *PS, struct
search_RegE_list_t *iList)
{
AutomatonHelper *nfa = AutomatonHelper::ps2nfa(iInitState, ptl, PS);
DMSG(printf("\n non deterministic automaton OK \n\n"));
@@ -172,7 +172,7 @@
if (s == iHelper.getInitState())
m_init = i;
- if (s->accept == 1)
+ if (s->accept)
m_acceptors[i] = true;
for (int l = 0; l < MAX_TRANSITION_LETTERS; l++)
@@ -234,34 +234,34 @@
Definition of the state handling methods
* ************************************************** */
-static set<int> s_state_id_create(int id)
+static set<uint64_t> s_state_id_create(uint64_t id)
{
- set<int> l;
+ set<uint64_t> l;
l.insert(id);
return l;
}
-static string s_state_id_to_str(const set<int> &iId)
+static string s_state_id_to_str(const set<uint64_t> &iId)
{
string s;
- set<int>::const_iterator it;
+ set<uint64_t>::const_iterator it;
for (it = iId.begin(); it != iId.end(); it++)
{
char tmp[50];
- sprintf(tmp, "%d ", *it);
+ sprintf(tmp, "%llu ", *it);
s += tmp;
}
return s;
}
-static astate s_state_create(const set<int> &iId)
+static astate s_state_create(const set<uint64_t> &iId)
{
astate s = new automaton_state_t();
// TODO: use copy constructor
s->id = iId;
- s->accept = 0;
+ s->accept = false;
memset(s->next, 0, sizeof(astate)*MAX_TRANSITION_LETTERS);
DMSG(printf("** state %s creation\n", s_state_id_to_str(iId).c_str()));
return s;
@@ -295,7 +295,7 @@
}
-astate AutomatonHelper::getState(const set<int> &iId) const
+astate AutomatonHelper::getState(const set<uint64_t> &iId) const
{
list<astate>::const_iterator it;
for (it = m_states.begin(); it != m_states.end(); it++)
@@ -314,15 +314,15 @@
* ************************************************** *
* ************************************************** */
-AutomatonHelper *AutomatonHelper::ps2nfa(int init_state_id, int *ptl, int *PS)
+AutomatonHelper *AutomatonHelper::ps2nfa(uint64_t init_state_id, int *ptl,
uint64_t *PS)
{
- int maxpos = PS[0];
+ uint64_t maxpos = PS[0];
astate current_state;
char used_letter[MAX_TRANSITION_LETTERS];
/* 1: init_state = root->PP */
- set<int> temp_id0 = s_state_id_create(init_state_id);
+ set<uint64_t> temp_id0 = s_state_id_create(init_state_id);
astate temp_state = s_state_create(temp_id0);
AutomatonHelper *nfa = new AutomatonHelper(temp_state);
nfa->addState(temp_state);
@@ -336,14 +336,14 @@
DMSG(printf("** current state = %s\n",
s_state_id_to_str(current_state->id).c_str()));
memset(used_letter, 0, sizeof(used_letter));
/* 3: \foreach l in \sigma | l \neq # */
- for (int p = 1; p < maxpos; p++)
+ for (uint32_t p = 1; p < maxpos; p++)
{
int current_letter = ptl[p];
if (used_letter[current_letter] == 0)
{
/* 4: int set = \cup { PS(pos) | pos \in state \wedge pos == l
} */
- int ens = 0;
- for (int pos = 1; pos <= maxpos; pos++)
+ uint64_t ens = 0;
+ for (uint32_t pos = 1; pos <= maxpos; pos++)
{
if (ptl[pos] == current_letter &&
(unsigned int)*(current_state->id.begin()) & (1 <<
(pos - 1)))
@@ -352,7 +352,7 @@
/* 5: transition from current_state to temp_state */
if (ens)
{
- set<int> temp_id = s_state_id_create(ens);
+ set<uint64_t> temp_id = s_state_id_create(ens);
temp_state = nfa->getState(temp_id);
if (temp_state == NULL)
{
@@ -376,7 +376,7 @@
{
astate s = *it;
if (*(s->id.begin()) & (1 << (maxpos - 1)))
- s->accept = 1;
+ s->accept = true;
}
return nfa;
@@ -386,20 +386,20 @@
* ************************************************** *
* ************************************************** */
-set<int> AutomatonHelper::getSuccessor(const set<int> &S,
+set<uint64_t> AutomatonHelper::getSuccessor(const set<uint64_t> &S,
int letter,
struct search_RegE_list_t *iList) const
{
- set<int> R, r;
- set<int>::const_iterator it;
+ set<uint64_t> R, r;
+ set<uint64_t>::const_iterator it;
for (it = S.begin(); it != S.end(); it++) /* \forall y \in
S */
{
astate y, z;
- set<int> t = s_state_id_create(*it);
+ set<uint64_t> t = s_state_id_create(*it);
assert(y = getState(t));
- set<int> Ry; /* Ry = \empty
*/
+ set<uint64_t> Ry; /* Ry =
\empty */
if ((z = y->next[letter]) != NULL) /* \delta (y,z) =
l */
{
@@ -460,7 +460,7 @@
if (ns->accept && (find(s->id.begin(), s->id.end(), idx) !=
s->id.end()))
{
DMSG(printf("(ok) "));
- s->accept = 1;
+ s->accept = true;
}
}
DMSG(printf("\n"));
@@ -475,7 +475,7 @@
list<astate> L;
// Clone the list
- set<int> temp_id0 = iNfa.m_initState->id;
+ set<uint64_t> temp_id0 = iNfa.m_initState->id;
astate temp_state = s_state_create(temp_id0);
AutomatonHelper *dfa = new AutomatonHelper(temp_state);
dfa->addState(temp_state);
@@ -489,7 +489,7 @@
{
// DMSG(printf("*** start successor of %s\n",
s_state_id_to_str(current_state->id).c_str()));
- set<int> temp_id = iNfa.getSuccessor(current_state->id, letter,
iList);
+ set<uint64_t> temp_id = iNfa.getSuccessor(current_state->id,
letter, iList);
if (! temp_id.empty())
{
Index: automaton.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/automaton.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -b -r1.12 -r1.13
--- automaton.h 8 Jan 2008 13:52:33 -0000 1.12
+++ automaton.h 13 Jul 2008 07:55:47 -0000 1.13
@@ -38,7 +38,7 @@
* Build a static deterministic finite automaton from
* "init_state", "ptl" and "PS" given by the parser
*/
- Automaton(int init_state, int *ptl, int *PS, struct search_RegE_list_t
*iList);
+ Automaton(uint64_t init_state, int *ptl, uint64_t *PS, struct
search_RegE_list_t *iList);
/// Destructor
~Automaton();
@@ -59,13 +59,13 @@
* Query the acceptor flag for the given state
* @return true/false
*/
- bool accept(int state) const { return m_acceptors[state]; }
+ bool accept(uint64_t state) const { return m_acceptors[state]; }
/**
* Return the next state when the transition is taken
* @returns next state id (1 <= id <= nstate, 0 = invalid id)
*/
- int getNextState(int start, char l) const
+ uint64_t getNextState(uint64_t start, char l) const
{
return m_transitions[start][(int)l];
}
Index: dic_search.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic_search.cpp,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -b -r1.5 -r1.6
--- dic_search.cpp 7 Jul 2008 17:30:00 -0000 1.5
+++ dic_search.cpp 13 Jul 2008 07:55:47 -0000 1.6
@@ -489,7 +489,8 @@
/* 1: the letter appears in the automaton as is */
if (next_state)
{
- params->word[params->wordlen] = current->chr + L'a' - 1;
+ params->word[params->wordlen] =
+ towlower(getHeader().getCharFromCode(current->chr));
params->wordlen ++;
searchRegexpRecTempl(params, next_state, current, oWordList,
iMaxResults);
params->wordlen --;
@@ -513,15 +514,6 @@
else
oWordList.reserve(DEFAULT_VECT_ALLOC);
- int ptl[REGEXP_MAX+1];
- int PS [REGEXP_MAX+1];
-
- for (int i = 0; i < REGEXP_MAX; i++)
- {
- PS[i] = 0;
- ptl[i] = 0;
- }
-
struct regexp_error_report_t report;
report.pos1 = 0;
report.pos2 = 0;
@@ -541,6 +533,15 @@
return;
}
+ int ptl[REGEXP_MAX+1];
+ uint64_t PS [REGEXP_MAX+1];
+
+ for (int i = 0; i < REGEXP_MAX; i++)
+ {
+ PS[i] = 0;
+ ptl[i] = 0;
+ }
+
int n = 1;
int p = 1;
root->traverse(p, n, ptl);
Index: regexp.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- regexp.cpp 7 Jul 2008 17:30:01 -0000 1.3
+++ regexp.cpp 13 Jul 2008 07:55:47 -0000 1.4
@@ -105,7 +105,7 @@
}
-void Node::nextPos(int PS[])
+void Node::nextPos(uint64_t PS[])
{
if (m_fg)
m_fg->nextPos(PS);
@@ -119,7 +119,7 @@
/* \forall p \in DP(left) */
/* PS[p] = PS[p] \cup PP(right) */
/************************************/
- for (int pos = 1; pos <= PS[0]; pos++)
+ for (uint32_t pos = 1; pos <= PS[0]; pos++)
{
if (m_fg->m_DP & (1 << (pos-1)))
PS[pos] |= m_fd->m_PP;
@@ -131,7 +131,7 @@
/* \forall p \in DP(left) */
/* PS[p] = PS[p] \cup PP(left) */
/************************************/
- for (int pos = 1; pos <= PS[0]; pos++)
+ for (uint32_t pos = 1; pos <= PS[0]; pos++)
{
if (m_DP & (1 << (pos-1)))
PS[pos] |= m_PP;
@@ -142,7 +142,7 @@
/* \forall p \in DP(left) */
/* PS[p] = PS[p] \cup PP(left) */
/************************************/
- for (int pos = 1; pos <= PS[0]; pos++)
+ for (uint32_t pos = 1; pos <= PS[0]; pos++)
{
if (m_DP & (1 << (pos-1)))
PS[pos] |= m_PP;
Index: regexp.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.h,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- regexp.h 7 Jul 2008 17:30:01 -0000 1.14
+++ regexp.h 13 Jul 2008 07:55:47 -0000 1.15
@@ -70,7 +70,7 @@
* @param PS : next position table, PS[0] must contain the
* number of terminals contained in the regular expression
*/
- void nextPos(int PS[]);
+ void nextPos(uint64_t PS[]);
/// Return the first position
int getFirstPos() const { return m_PP; }
@@ -91,8 +91,8 @@
int m_number;
int m_position;
bool m_annulable;
- int m_PP;
- int m_DP;
+ uint64_t m_PP;
+ uint64_t m_DP;
#ifdef DEBUG_RE
/// Print the current node to file
Index: regexpmain.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexpmain.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- regexpmain.cpp 7 Jul 2008 17:30:01 -0000 1.4
+++ regexpmain.cpp 13 Jul 2008 07:55:47 -0000 1.5
@@ -29,8 +29,6 @@
#include <exception>
#include <iostream>
-#include <cstdlib>
-#include <cstring>
#if ENABLE_NLS
# include <libintl.h>
@@ -104,25 +102,20 @@
{
Dictionary dic(argv[1]);
- char er[200];
- strcpy(er, ".");
-
struct search_RegE_list_t regList;
- while (strcmp(er, ""))
- {
+ string line;
cout <<
"**************************************************************" << endl;
cout <<
"**************************************************************" << endl;
cout << _("enter a regular expression:") << endl;
- fgets(er, sizeof(er), stdin);
- /* strip \n */
- er[strlen(er) - 1] = '\0';
- if (strcmp(er, "") == 0)
+ while (getline(cin, line))
+ {
+ if (line == "")
break;
- /* automaton */
+ /* Automaton */
init_letter_lists(dic, &regList);
vector<wstring> wordList;
- dic.searchRegExp(convertToWc(er), wordList, &regList);
+ dic.searchRegExp(convertToWc(line), wordList, &regList);
cout << _("result:") << endl;
vector<wstring>::const_iterator it;
@@ -130,6 +123,9 @@
{
cerr << convertToMb(*it) << endl;
}
+ cout <<
"**************************************************************" << endl;
+ cout <<
"**************************************************************" << endl;
+ cout << _("enter a regular expression:") << endl;
}
return 0;
- [Eliot-dev] eliot/dic automaton.cpp automaton.h dic_search....,
eliot-dev <=