diff --git a/dic/automaton.cpp b/dic/automaton.cpp index cdd6283..de465b5 100644 --- a/dic/automaton.cpp +++ b/dic/automaton.cpp @@ -18,13 +18,6 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ -/** - * \file automaton.c - * \brief (Non)Deterministic Finite AutomatonHelper for Regexp - * \author Antoine Fraboulet - * \date 2005 - */ - #include "config.h" #include @@ -79,6 +72,11 @@ public: State * m_next[MAX_TRANSITION_LETTERS]; private: + /** + * Id of the state. For the first automaton, each ID contains only 1 + * integer, but the ID of the deterministic automaton will contain + * several integers, according to the usual "determinization" algorithm. + */ set m_id; void init() @@ -107,7 +105,7 @@ public: static AutomatonHelper *ps2nfa(uint64_t iInitState, int *ptl, uint64_t *PS); static AutomatonHelper *nfa2dfa(const AutomatonHelper &iNfa, - struct search_RegE_list_t *iList); + const searchRegExpLists &iList); /// List of states list m_states; @@ -121,7 +119,8 @@ private: void printNodes(FILE* f) const; void printEdges(FILE* f) const; void setAccept(State * s) const; - set getSuccessor(const set &S, int letter, struct search_RegE_list_t *iList) const; + set getSuccessor(const set &S, int letter, + const searchRegExpLists &iList) const; }; @@ -129,7 +128,8 @@ private: Definition of the Automaton class * ************************************************** */ -Automaton::Automaton(uint64_t iInitState, int *ptl, uint64_t *PS, struct search_RegE_list_t *iList) +Automaton::Automaton(uint64_t iInitState, int *ptl, uint64_t *PS, + const searchRegExpLists &iList) { AutomatonHelper *nfa = AutomatonHelper::ps2nfa(iInitState, ptl, PS); DMSG(printf("\n non deterministic automaton OK \n\n")); @@ -151,7 +151,7 @@ Automaton::Automaton(uint64_t iInitState, int *ptl, uint64_t *PS, struct search_ Automaton::~Automaton() { delete[] m_acceptors; - for (int i = 0; i <= m_nbStates; i++) + for (unsigned int i = 0; i <= m_nbStates; i++) { delete[] m_transitions[i]; } @@ -166,7 +166,7 @@ void Automaton::finalize(const AutomatonHelper &iHelper) m_acceptors = new bool[m_nbStates + 1]; memset(m_acceptors, 0, (m_nbStates + 1) * sizeof(bool)); m_transitions = new int*[m_nbStates + 1]; - for (int i = 0; i <= m_nbStates; i++) + for (unsigned int i = 0; i <= m_nbStates; i++) { m_transitions[i] = new int[MAX_TRANSITION_LETTERS]; memset(m_transitions[i], 0, MAX_TRANSITION_LETTERS * sizeof(int)); @@ -205,7 +205,7 @@ void Automaton::dump(const string &iFileName) const { FILE *f = fopen(iFileName.c_str(), "w"); fprintf(f, "digraph automaton {\n"); - for (int i = 1; i <= m_nbStates; i++) + for (unsigned int i = 1; i <= m_nbStates; i++) { fprintf(f, "\t%d [label = \"%d\"", i, i); if (i == m_init) @@ -215,7 +215,7 @@ void Automaton::dump(const string &iFileName) const fprintf(f, "];\n"); } fprintf(f, "\n"); - for (int i = 1; i <= m_nbStates; i++) + for (unsigned int i = 1; i <= m_nbStates; i++) { for (int l = 0; l < MAX_TRANSITION_LETTERS; l++) { @@ -363,7 +363,7 @@ AutomatonHelper *AutomatonHelper::ps2nfa(uint64_t init_state_id, int *ptl, uint6 set AutomatonHelper::getSuccessor(const set &S, int letter, - struct search_RegE_list_t *iList) const + const searchRegExpLists &iList) const { set R, r; set::const_iterator it; @@ -394,26 +394,23 @@ set AutomatonHelper::getSuccessor(const set &S, if (letter < RE_FINAL_TOK) { - for (int i = 0; i < DIC_SEARCH_REGE_LIST; i++) + for (unsigned int i = 0; i < iList.symbl.size(); i++) { - if (iList->valid[i]) + if (iList.letters[i][letter] && (z = y->m_next[(int)iList.symbl[i]]) != NULL) { - if (iList->letters[i][letter] && (z = y->m_next[(int)iList->symbl[i]]) != NULL) - { - DMSG(printf("*** letter ")); - DMSG(regexp_print_letter(stdout, letter)); - DMSG(printf("is in ")); - DMSG(regexp_print_letter(stdout, i)); + DMSG(printf("*** letter ")); + DMSG(regexp_print_letter(stdout, letter)); + DMSG(printf("is in ")); + DMSG(regexp_print_letter(stdout, i)); - r = getSuccessor(z->getId(), RE_EPSILON, iList); - Ry.insert(r.begin(), r.end()); - Ry.insert(z->getId().begin(), z->getId().end()); - } + r = getSuccessor(z->getId(), RE_EPSILON, iList); + Ry.insert(r.begin(), r.end()); + Ry.insert(z->getId().begin(), z->getId().end()); } } } - R.insert(Ry.begin(), Ry.end()); /* R = R \cup Ry */ + R.insert(Ry.begin(), Ry.end()); /* R = R \cup Ry */ } return R; @@ -440,7 +437,7 @@ void AutomatonHelper::setAccept(State * s) const AutomatonHelper *AutomatonHelper::nfa2dfa(const AutomatonHelper &iNfa, - struct search_RegE_list_t *iList) + const searchRegExpLists &iList) { State * current_state; diff --git a/dic/automaton.h b/dic/automaton.h index 11648c1..dfc0db4 100644 --- a/dic/automaton.h +++ b/dic/automaton.h @@ -18,17 +18,11 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ -/** - * \file automaton.h - * \brief (Non)Deterministic Finite Automaton for Regexp - * \author Antoine Fraboulet - * \date 2005 - */ - #ifndef _DIC_AUTOMATON_H_ #define _DIC_AUTOMATON_H_ class AutomatonHelper; +struct searchRegExpLists; class Automaton { @@ -38,7 +32,8 @@ public: * Build a static deterministic finite automaton from * "init_state", "ptl" and "PS" given by the parser */ - Automaton(uint64_t init_state, int *ptl, uint64_t *PS, struct search_RegE_list_t *iList); + Automaton(uint64_t init_state, int *ptl, uint64_t *PS, + const searchRegExpLists &iList); /// Destructor ~Automaton(); @@ -77,10 +72,10 @@ public: private: /// Number of states - int m_nbStates; + unsigned int m_nbStates; /// ID of the init state - int m_init; + uint64_t m_init; /// Array of booleans, one for each state bool *m_acceptors; diff --git a/dic/dic_search.cpp b/dic/dic_search.cpp index 97e839b..3f1aa2b 100644 --- a/dic/dic_search.cpp +++ b/dic/dic_search.cpp @@ -499,18 +499,27 @@ void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params, } -static void init_letter_lists(const Dictionary &iDic, struct search_RegE_list_t &iList) +/** + * Initialize the lists of letters with pre-defined lists + + * 0: all tiles + * 1: vowels + * 2: consonants + * 3: user defined 1 + * 4: user defined 2 + * x: lists used during parsing + */ +static void initLetterLists(const Dictionary &iDic, + searchRegExpLists &iList) { memset(&iList, 0, sizeof(iList)); // Prepare the space for 5 items iList.symbl.assign(5, 0); + iList.letters.assign(5, vector(DIC_LETTERS, false)); - iList.valid[0] = true; // all letters - iList.symbl[0] = RE_ALL_MATCH; - iList.valid[1] = true; // vowels - iList.symbl[1] = RE_VOWL_MATCH; - iList.valid[2] = true; // consonants - iList.symbl[2] = RE_CONS_MATCH; + iList.symbl[0] = RE_ALL_MATCH; // All letters + iList.symbl[1] = RE_VOWL_MATCH; // Vowels + iList.symbl[2] = RE_CONS_MATCH; // Consonants iList.letters[0][0] = false; iList.letters[1][0] = false; iList.letters[2][0] = false; @@ -522,10 +531,8 @@ static void init_letter_lists(const Dictionary &iDic, struct search_RegE_list_t iList.letters[2][i] = iDic.getHeader().isConsonant(i); } - iList.valid[3] = false; // user defined list 1 - iList.symbl[3] = RE_USR1_MATCH; - iList.valid[4] = false; // user defined list 2 - iList.symbl[4] = RE_USR2_MATCH; + iList.symbl[3] = RE_USR1_MATCH; // User defined list 1 + iList.symbl[4] = RE_USR2_MATCH; // User defined list 2 } @@ -546,9 +553,10 @@ void Dictionary::searchRegExp(const wstring &iRegexp, // Parsing Node *root = NULL; - struct search_RegE_list_t llist; - init_letter_lists(*this, llist); - bool parsingOk = parseRegexp(*this, (iRegexp + L"#").c_str(), &root, &llist); + searchRegExpLists llist; + // Initialize the lists of letters + initLetterLists(*this, llist); + bool parsingOk = parseRegexp(*this, (iRegexp + L"#").c_str(), &root, llist); if (!parsingOk) { @@ -574,7 +582,7 @@ void Dictionary::searchRegExp(const wstring &iRegexp, root->nextPos(PS); - Automaton *a = new Automaton(root->getFirstPos(), ptl, PS, &llist); + Automaton *a = new Automaton(root->getFirstPos(), ptl, PS, llist); if (a) { struct params_regexp_t params; diff --git a/dic/grammar.cpp b/dic/grammar.cpp index a7b1aa5..882ae2c 100644 --- a/dic/grammar.cpp +++ b/dic/grammar.cpp @@ -124,7 +124,7 @@ struct RegexpGrammar : grammar void evaluate(const Header &iHeader, iter_t const& i, stack &evalStack, - struct search_RegE_list_t *iList, bool negate = false) + searchRegExpLists &iList, bool negate = false) { if (i->value.id() == RegexpGrammar::alphavarId) { @@ -146,24 +146,17 @@ void evaluate(const Header &iHeader, iter_t const& i, stack &evalStack, // The dictionary letters are already in upper case const wstring &letters = iHeader.getLetters(); wstring::const_iterator itLetter; - int j; - for (j = RE_LIST_USER_END + 1; j < DIC_SEARCH_REGE_LIST; ++j) + // j is the index of the new list we create + size_t j = iList.symbl.size(); + iList.symbl.push_back(RE_ALL_MATCH + j); + iList.letters.push_back(vector(DIC_LETTERS, false)); + for (itLetter = letters.begin(); itLetter != letters.end(); ++itLetter) { - if (!iList->valid[j]) - { - iList->valid[j] = true; - iList->symbl.push_back(RE_ALL_MATCH + j); - iList->letters[j][0] = false; - for (itLetter = letters.begin(); itLetter != letters.end(); ++itLetter) - { - bool contains = (choiceLetters.find(*itLetter) != string::npos); - iList->letters[j][iHeader.getCodeFromChar(*itLetter)] = - (contains ? !negate : negate); - } - break; - } + bool contains = (choiceLetters.find(*itLetter) != string::npos); + iList.letters[j][iHeader.getCodeFromChar(*itLetter)] = + (contains ? !negate : negate); } - Node *node = new Node(NODE_VAR, iList->symbl[j], NULL, NULL); + Node *node = new Node(NODE_VAR, iList.symbl[j], NULL, NULL); evalStack.push(node); } else if (i->value.id() == RegexpGrammar::varId) @@ -279,7 +272,8 @@ void evaluate(const Header &iHeader, iter_t const& i, stack &evalStack, } -bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList) +bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, + searchRegExpLists &iList) { // Create a grammar object RegexpGrammar g(iDic.getHeader().getLetters()); diff --git a/dic/grammar.h b/dic/grammar.h index 1be6dcb..8448e51 100644 --- a/dic/grammar.h +++ b/dic/grammar.h @@ -23,9 +23,12 @@ class Dictionary; class Node; -struct search_RegE_list_t; +class searchRegExpLists; -bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList); +bool parseRegexp(const Dictionary &iDic, + const wchar_t *input, + Node **root, + searchRegExpLists &iList); #endif diff --git a/dic/regexp.h b/dic/regexp.h index 1dc1c2d..681ba2d 100644 --- a/dic/regexp.h +++ b/dic/regexp.h @@ -127,30 +127,20 @@ private: #define RE_USR1_MATCH (DIC_LETTERS + 5) #define RE_USR2_MATCH (DIC_LETTERS + 6) -/** - * number of lists for regexp letter match \n - * 0 : all tiles \n - * 1 : vowels \n - * 2 : consonants \n - * 3 : user defined 1 \n - * 4 : user defined 2 \n - * x : lists used during parsing \n - */ -#define DIC_SEARCH_REGE_LIST (REGEXP_MAX) - /** * Structure used for dic.searchRegExp * This structure is used to explicit letters list that will be matched * against special tokens in the regular expression search */ -struct search_RegE_list_t +struct searchRegExpLists { /** special symbol associated with the list */ vector symbl; - /** 0 or 1 if list is valid */ - bool valid[DIC_SEARCH_REGE_LIST]; - /** 0 or 1 if letter is present in the list */ - bool letters[DIC_SEARCH_REGE_LIST][DIC_LETTERS]; + /** + * 0 or 1 if letter is present in the list. + * The inner vector should have a length of DIC_LETTERS (it is a bitmask) + */ + vector > letters; }; #define RE_LIST_ALL_MATCH 0