eliot-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Eliot-dev] eliot/dic automaton.cpp automaton.h dic_search....


From: eliot-dev
Subject: [Eliot-dev] eliot/dic automaton.cpp automaton.h dic_search....
Date: Sun, 13 Jul 2008 07:55:48 +0000

CVSROOT:        /cvsroot/eliot
Module name:    eliot
Changes by:     Olivier Teulière <ipkiss>      08/07/13 07:55:48

Modified files:
        dic            : automaton.cpp automaton.h dic_search.cpp 
                         regexp.cpp regexp.h regexpmain.cpp 

Log message:
        Various little fixes to the regexp engine to support wide characters.
        It now works fine on any dictionary using non-ASCII characters!

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/automaton.cpp?cvsroot=eliot&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/automaton.h?cvsroot=eliot&r1=1.12&r2=1.13
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic_search.cpp?cvsroot=eliot&r1=1.5&r2=1.6
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.cpp?cvsroot=eliot&r1=1.3&r2=1.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexp.h?cvsroot=eliot&r1=1.14&r2=1.15
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/regexpmain.cpp?cvsroot=eliot&r1=1.4&r2=1.5

Patches:
Index: automaton.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/automaton.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- automaton.cpp       8 Jan 2008 13:52:33 -0000       1.2
+++ automaton.cpp       13 Jul 2008 07:55:47 -0000      1.3
@@ -71,7 +71,7 @@
     void dump(const string &iFileName) const;
 #endif
 
-    static AutomatonHelper *ps2nfa(int iInitState, int *ptl, int *PS);
+    static AutomatonHelper *ps2nfa(uint64_t iInitState, int *ptl, uint64_t *PS);
     static AutomatonHelper *nfa2dfa(const AutomatonHelper &iNfa,
                                     struct search_RegE_list_t *iList);
 
@@ -83,11 +83,11 @@
     astate m_initState;
 
     void addState(astate s);
-    astate getState(const set<int> &iId) const;
+    astate getState(const set<uint64_t> &iId) const;
     void printNodes(FILE* f) const;
     void printEdges(FILE* f) const;
     void setAccept(astate s) const;
-    set<int> getSuccessor(const set<int> &S, int letter, struct search_RegE_list_t *iList) const;
+    set<uint64_t> getSuccessor(const set<uint64_t> &S, int letter, struct search_RegE_list_t *iList) const;
 };
 
 
@@ -95,14 +95,14 @@
    State handling
  * ************************************************** */
 
-static set<int> s_state_id_create(int id);
-static string   s_state_id_to_str(const set<int> &iId);
-static astate   s_state_create   (const set<int> &iId);
+static set<uint64_t> s_state_id_create(uint64_t id);
+static string   s_state_id_to_str(const set<uint64_t> &iId);
+static astate   s_state_create   (const set<uint64_t> &iId);
 
 struct automaton_state_t
 {
-    set<int> id;
-    int      accept;
+    set<uint64_t> id;
+    bool accept;
     int      id_static;
     astate   next[MAX_TRANSITION_LETTERS];
 };
@@ -112,7 +112,7 @@
    Definition of the Automaton class
  * ************************************************** */
 
-Automaton::Automaton(int iInitState, int *ptl, int *PS, struct search_RegE_list_t *iList)
+Automaton::Automaton(uint64_t iInitState, int *ptl, uint64_t *PS, struct search_RegE_list_t *iList)
 {
     AutomatonHelper *nfa = AutomatonHelper::ps2nfa(iInitState, ptl, PS);
     DMSG(printf("\n non deterministic automaton OK \n\n"));
@@ -172,7 +172,7 @@
 
         if (s == iHelper.getInitState())
             m_init = i;
-        if (s->accept == 1)
+        if (s->accept)
             m_acceptors[i] = true;
 
         for (int l = 0; l < MAX_TRANSITION_LETTERS; l++)
@@ -234,34 +234,34 @@
    Definition of the state handling methods
  * ************************************************** */
 
-static set<int> s_state_id_create(int id)
+static set<uint64_t> s_state_id_create(uint64_t id)
 {
-    set<int> l;
+    set<uint64_t> l;
     l.insert(id);
     return l;
 }
 
 
-static string s_state_id_to_str(const set<int> &iId)
+static string s_state_id_to_str(const set<uint64_t> &iId)
 {
     string s;
-    set<int>::const_iterator it;
+    set<uint64_t>::const_iterator it;
     for (it = iId.begin(); it != iId.end(); it++)
     {
         char tmp[50];
-        sprintf(tmp, "%d ", *it);
+        sprintf(tmp, "%llu ", *it);
         s += tmp;
     }
     return s;
 }
 
 
-static astate s_state_create(const set<int> &iId)
+static astate s_state_create(const set<uint64_t> &iId)
 {
     astate s = new automaton_state_t();
     // TODO: use copy constructor
     s->id      = iId;
-    s->accept  = 0;
+    s->accept = false;
     memset(s->next, 0, sizeof(astate)*MAX_TRANSITION_LETTERS);
     DMSG(printf("** state %s creation\n", s_state_id_to_str(iId).c_str()));
     return s;
@@ -295,7 +295,7 @@
 }
 
 
-astate AutomatonHelper::getState(const set<int> &iId) const
+astate AutomatonHelper::getState(const set<uint64_t> &iId) const
 {
     list<astate>::const_iterator it;
     for (it = m_states.begin(); it != m_states.end(); it++)
@@ -314,15 +314,15 @@
  * ************************************************** *
  * ************************************************** */
 
-AutomatonHelper *AutomatonHelper::ps2nfa(int init_state_id, int *ptl, int *PS)
+AutomatonHelper *AutomatonHelper::ps2nfa(uint64_t init_state_id, int *ptl, uint64_t *PS)
 {
-    int maxpos = PS[0];
+    uint64_t maxpos = PS[0];
     astate current_state;
     char used_letter[MAX_TRANSITION_LETTERS];
 
 
     /* 1: init_state = root->PP */
-    set<int> temp_id0 = s_state_id_create(init_state_id);
+    set<uint64_t> temp_id0 = s_state_id_create(init_state_id);
     astate temp_state = s_state_create(temp_id0);
     AutomatonHelper *nfa = new AutomatonHelper(temp_state);
     nfa->addState(temp_state);
@@ -336,14 +336,14 @@
         DMSG(printf("** current state = %s\n", s_state_id_to_str(current_state->id).c_str()));
         memset(used_letter, 0, sizeof(used_letter));
         /* 3: \foreach l in \sigma | l \neq # */
-        for (int p = 1; p < maxpos; p++)
+        for (uint32_t p = 1; p < maxpos; p++)
         {
             int current_letter = ptl[p];
             if (used_letter[current_letter] == 0)
             {
                 /* 4: int set = \cup { PS(pos) | pos \in state \wedge pos == l } */
-                int ens = 0;
-                for (int pos = 1; pos <= maxpos; pos++)
+                uint64_t ens = 0;
+                for (uint32_t pos = 1; pos <= maxpos; pos++)
                 {
                     if (ptl[pos] == current_letter &&
                         (unsigned int)*(current_state->id.begin()) & (1 << (pos - 1)))
@@ -352,7 +352,7 @@
                 /* 5: transition from current_state to temp_state */
                 if (ens)
                 {
-                    set<int> temp_id = s_state_id_create(ens);
+                    set<uint64_t> temp_id = s_state_id_create(ens);
                     temp_state = nfa->getState(temp_id);
                     if (temp_state == NULL)
                     {
@@ -376,7 +376,7 @@
     {
         astate s = *it;
         if (*(s->id.begin()) & (1 << (maxpos - 1)))
-            s->accept = 1;
+            s->accept = true;
     }
 
     return nfa;
@@ -386,20 +386,20 @@
  * ************************************************** *
  * ************************************************** */
 
-set<int> AutomatonHelper::getSuccessor(const set<int> &S,
+set<uint64_t> AutomatonHelper::getSuccessor(const set<uint64_t> &S,
                                        int letter,
                                        struct search_RegE_list_t *iList) const
 {
-    set<int> R, r;
-    set<int>::const_iterator it;
+    set<uint64_t> R, r;
+    set<uint64_t>::const_iterator it;
     for (it = S.begin(); it != S.end(); it++)                /* \forall y \in S */
     {
         astate y, z;
 
-        set<int> t = s_state_id_create(*it);
+        set<uint64_t> t = s_state_id_create(*it);
         assert(y = getState(t));
 
-        set<int> Ry;                                        /* Ry = \empty             */
+        set<uint64_t> Ry;                                        /* Ry = \empty             */
 
         if ((z = y->next[letter]) != NULL)                   /* \delta (y,z) = l        */
         {
@@ -460,7 +460,7 @@
         if (ns->accept && (find(s->id.begin(), s->id.end(), idx) != s->id.end()))
         {
             DMSG(printf("(ok) "));
-            s->accept = 1;
+            s->accept = true;
         }
     }
     DMSG(printf("\n"));
@@ -475,7 +475,7 @@
     list<astate> L;
 
     // Clone the list
-    set<int> temp_id0 = iNfa.m_initState->id;
+    set<uint64_t> temp_id0 = iNfa.m_initState->id;
     astate temp_state = s_state_create(temp_id0);
     AutomatonHelper *dfa = new AutomatonHelper(temp_state);
     dfa->addState(temp_state);
@@ -489,7 +489,7 @@
         {
             // DMSG(printf("*** start successor of %s\n", s_state_id_to_str(current_state->id).c_str()));
 
-            set<int> temp_id = iNfa.getSuccessor(current_state->id, letter, iList);
+            set<uint64_t> temp_id = iNfa.getSuccessor(current_state->id, letter, iList);
 
             if (! temp_id.empty())
             {

Index: automaton.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/automaton.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -b -r1.12 -r1.13
--- automaton.h 8 Jan 2008 13:52:33 -0000       1.12
+++ automaton.h 13 Jul 2008 07:55:47 -0000      1.13
@@ -38,7 +38,7 @@
      * Build a static deterministic finite automaton from
      * "init_state", "ptl" and "PS" given by the parser
      */
-    Automaton(int init_state, int *ptl, int *PS, struct search_RegE_list_t *iList);
+    Automaton(uint64_t init_state, int *ptl, uint64_t *PS, struct search_RegE_list_t *iList);
 
     /// Destructor
     ~Automaton();
@@ -59,13 +59,13 @@
      * Query the acceptor flag for the given state
      * @return true/false
      */
-    bool accept(int state) const { return m_acceptors[state]; }
+    bool accept(uint64_t state) const { return m_acceptors[state]; }
 
     /**
      * Return the next state when the transition is taken
      * @returns next state id (1 <= id <= nstate, 0 = invalid id)
      */
-    int getNextState(int start, char l) const
+    uint64_t getNextState(uint64_t start, char l) const
     {
         return m_transitions[start][(int)l];
     }

Index: dic_search.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic_search.cpp,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -b -r1.5 -r1.6
--- dic_search.cpp      7 Jul 2008 17:30:00 -0000       1.5
+++ dic_search.cpp      13 Jul 2008 07:55:47 -0000      1.6
@@ -489,7 +489,8 @@
         /* 1: the letter appears in the automaton as is */
         if (next_state)
         {
-            params->word[params->wordlen] = current->chr + L'a' - 1;
+            params->word[params->wordlen] =
+                towlower(getHeader().getCharFromCode(current->chr));
             params->wordlen ++;
             searchRegexpRecTempl(params, next_state, current, oWordList, iMaxResults);
             params->wordlen --;
@@ -513,15 +514,6 @@
     else
         oWordList.reserve(DEFAULT_VECT_ALLOC);
 
-    int ptl[REGEXP_MAX+1];
-    int PS [REGEXP_MAX+1];
-
-    for (int i = 0; i < REGEXP_MAX; i++)
-    {
-        PS[i] = 0;
-        ptl[i] = 0;
-    }
-
     struct regexp_error_report_t report;
     report.pos1 = 0;
     report.pos2 = 0;
@@ -541,6 +533,15 @@
         return;
     }
 
+    int ptl[REGEXP_MAX+1];
+    uint64_t PS [REGEXP_MAX+1];
+
+    for (int i = 0; i < REGEXP_MAX; i++)
+    {
+        PS[i] = 0;
+        ptl[i] = 0;
+    }
+
     int n = 1;
     int p = 1;
     root->traverse(p, n, ptl);

Index: regexp.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- regexp.cpp  7 Jul 2008 17:30:01 -0000       1.3
+++ regexp.cpp  13 Jul 2008 07:55:47 -0000      1.4
@@ -105,7 +105,7 @@
 }
 
 
-void Node::nextPos(int PS[])
+void Node::nextPos(uint64_t PS[])
 {
     if (m_fg)
         m_fg->nextPos(PS);
@@ -119,7 +119,7 @@
             /* \forall p \in DP(left)           */
             /*     PS[p] = PS[p] \cup PP(right) */
             /************************************/
-            for (int pos = 1; pos <= PS[0]; pos++)
+            for (uint32_t pos = 1; pos <= PS[0]; pos++)
             {
                 if (m_fg->m_DP & (1 << (pos-1)))
                     PS[pos] |= m_fd->m_PP;
@@ -131,7 +131,7 @@
             /* \forall p \in DP(left)           */
             /*     PS[p] = PS[p] \cup PP(left)  */
             /************************************/
-            for (int pos = 1; pos <= PS[0]; pos++)
+            for (uint32_t pos = 1; pos <= PS[0]; pos++)
             {
                 if (m_DP & (1 << (pos-1)))
                     PS[pos] |= m_PP;
@@ -142,7 +142,7 @@
             /* \forall p \in DP(left)           */
             /*     PS[p] = PS[p] \cup PP(left)  */
             /************************************/
-            for (int pos = 1; pos <= PS[0]; pos++)
+            for (uint32_t pos = 1; pos <= PS[0]; pos++)
             {
                 if (m_DP & (1 << (pos-1)))
                     PS[pos] |= m_PP;

Index: regexp.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexp.h,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- regexp.h    7 Jul 2008 17:30:01 -0000       1.14
+++ regexp.h    13 Jul 2008 07:55:47 -0000      1.15
@@ -70,7 +70,7 @@
      * @param PS : next position table, PS[0] must contain the
      * number of terminals contained in the regular expression
      */
-    void nextPos(int PS[]);
+    void nextPos(uint64_t PS[]);
 
     /// Return the first position
     int getFirstPos() const { return m_PP; }
@@ -91,8 +91,8 @@
     int m_number;
     int m_position;
     bool m_annulable;
-    int m_PP;
-    int m_DP;
+    uint64_t m_PP;
+    uint64_t m_DP;
 
 #ifdef DEBUG_RE
     /// Print the current node to file

Index: regexpmain.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/regexpmain.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- regexpmain.cpp      7 Jul 2008 17:30:01 -0000       1.4
+++ regexpmain.cpp      13 Jul 2008 07:55:47 -0000      1.5
@@ -29,8 +29,6 @@
 
 #include <exception>
 #include <iostream>
-#include <cstdlib>
-#include <cstring>
 
 #if ENABLE_NLS
 #   include <libintl.h>
@@ -104,25 +102,20 @@
     {
         Dictionary dic(argv[1]);
 
-        char er[200];
-        strcpy(er, ".");
-
         struct search_RegE_list_t regList;
-        while (strcmp(er, ""))
-        {
+        string line;
             cout << "**************************************************************" << endl;
             cout << "**************************************************************" << endl;
             cout << _("enter a regular expression:") << endl;
-            fgets(er, sizeof(er), stdin);
-            /* strip \n */
-            er[strlen(er) - 1] = '\0';
-            if (strcmp(er, "") == 0)
+        while (getline(cin, line))
+        {
+            if (line == "")
                 break;
 
-            /* automaton */
+            /* Automaton */
             init_letter_lists(dic, &regList);
             vector<wstring> wordList;
-            dic.searchRegExp(convertToWc(er), wordList, &regList);
+            dic.searchRegExp(convertToWc(line), wordList, &regList);
 
             cout << _("result:") << endl;
             vector<wstring>::const_iterator it;
@@ -130,6 +123,9 @@
             {
                 cerr << convertToMb(*it) << endl;
             }
+            cout << "**************************************************************" << endl;
+            cout << "**************************************************************" << endl;
+            cout << _("enter a regular expression:") << endl;
         }
 
         return 0;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]