/***************************************************************************** * Eliot * Copyright (C) 2002-2007 Antoine Fraboulet * Authors: Antoine Fraboulet * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ /** * \file dic_search.c * \brief Dictionary lookup functions * \author Antoine Fraboulet * \date 2002 */ #include #include #include #include #include "dic_internals.h" #include "dic.h" #include "header.h" #include "encoding.h" #include "regexp.h" #include "libdic_a-ery.h" /* generated by bison */ #include "libdic_a-erl.h" /* generated by flex */ #include "automaton.h" /** * Function prototype for bison generated parser */ int regexpparse(yyscan_t scanner, NODE** root, struct search_RegE_list_t *iList, struct regexp_error_report_t *err); template const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE *eptr) const { if (*s) { const DAWG_EDGE *p = getEdgeAt(eptr->ptr); do { if (p->chr == getHeader().getCodeFromChar(*s)) return seekEdgePtr(s + 1, p); } while (!(*p++).last); return getEdgeAt(0); } else return eptr; } bool Dictionary::searchWord(const wstring &iWord) const { if (!validateLetters(iWord)) return false; if (getHeader().getVersion() == 0) { const DicEdgeOld *e = seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot())); return e->term; } else { const DicEdge *e = seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot())); return e->term; } } /** * Global variables for searchWordByLen: * * A pointer to the structure is passed as a parameter * so that all the search_* variables appear to the functions * as global but the code remains re-entrant. * Should be better to change the algorithm ... */ struct params_7plus1_t { wchar_t added_char; map > *results; int search_len; wchar_t search_wordtst[DIC_WORD_MAX]; char search_letters[63]; }; template void Dictionary::searchWordByLen(struct params_7plus1_t *params, int i, const DAWG_EDGE *edgeptr) const { /* depth first search in the dictionary */ do { /* the test is false only when reach the end-node */ if (edgeptr->chr) { /* is the letter available in search_letters */ if (params->search_letters[edgeptr->chr]) { params->search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr); params->search_letters[edgeptr->chr] --; if (i == params->search_len) { if (edgeptr->term) { (*params->results)[params->added_char].push_back(params->search_wordtst); } } else { searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr)); } params->search_letters[edgeptr->chr] ++; params->search_wordtst[i] = L'\0'; } /* the letter is of course available if we have a joker available */ if (params->search_letters[0]) { params->search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr); params->search_letters[0] --; if (i == params->search_len) { if (edgeptr->term) { (*params->results)[params->added_char].push_back(params->search_wordtst); } } else { searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr)); } params->search_letters[0] ++; params->search_wordtst[i] = L'\0'; } } } while (! (*edgeptr++).last); } template void Dictionary::search7pl1Templ(const wstring &iRack, map > &oWordList, bool joker) const { if (iRack == L"" || iRack.size() > DIC_WORD_MAX) return; struct params_7plus1_t params; for (unsigned int i = 0; i < sizeof(params.search_letters); i++) params.search_letters[i] = 0; /* * the letters are verified and changed to the dic internal * representation (using getCodeFromChar(*r)) */ int wordlen = 0; for (const wchar_t* r = iRack.c_str(); *r; r++) { if (iswalpha(*r)) { params.search_letters[getHeader().getCodeFromChar(*r)]++; wordlen++; } else if (*r == L'?') { if (joker) { params.search_letters[0]++; wordlen++; } else { oWordList[0].push_back(L"** joker **"); return; } } } if (wordlen < 1) return; const DAWG_EDGE *root_edge = getEdgeAt(getRoot()); root_edge = getEdgeAt(root_edge->ptr); params.results = &oWordList; /* search for all the words that can be done with the letters */ params.added_char = L'\0'; params.search_len = wordlen - 1; params.search_wordtst[wordlen] = L'\0'; searchWordByLen(¶ms, 0, root_edge); /* search for all the words that can be done with the letters +1 */ params.search_len = wordlen; params.search_wordtst[wordlen + 1] = L'\0'; const wstring &letters = getHeader().getLetters(); for (unsigned int i = 0; i < letters.size(); i++) { params.added_char = letters[i]; unsigned int code = getHeader().getCodeFromChar(letters[i]); params.search_letters[code]++; searchWordByLen(¶ms, 0, root_edge); params.search_letters[code]--; } } void Dictionary::search7pl1(const wstring &iRack, map > &oWordList, bool joker) const { if (getHeader().getVersion() == 0) search7pl1Templ(iRack, oWordList, joker); else search7pl1Templ(iRack, oWordList, joker); } /****************************************/ /****************************************/ template void Dictionary::searchRaccTempl(const wstring &iWord, list &oWordList) const { if (iWord == L"") return; /* search_racc will try to add a letter in front and at the end of a word */ /* let's try for the front */ wchar_t wordtst[DIC_WORD_MAX]; wcscpy(wordtst + 1, iWord.c_str()); const wstring &letters = getHeader().getLetters(); for (unsigned int i = 0; i <= letters.size(); i++) { wordtst[0] = letters[i]; if (searchWord(wordtst)) oWordList.push_back(wordtst); } /* add a letter at the end */ int i; for (i = 0; iWord[i]; i++) wordtst[i] = iWord[i]; wordtst[i ] = '\0'; wordtst[i+1] = '\0'; const DAWG_EDGE *edge_seek = seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot())); /* points to what the next letter can be */ const DAWG_EDGE *edge = getEdgeAt(edge_seek->ptr); if (edge != getEdgeAt(0)) { do { if (edge->term) { wordtst[i] = getHeader().getCharFromCode(edge->chr); oWordList.push_back(wordtst); } } while (!(*edge++).last); } } void Dictionary::searchRacc(const wstring &iWord, list &oWordList) const { if (getHeader().getVersion() == 0) searchRaccTempl(iWord, oWordList); else searchRaccTempl(iWord, oWordList); } /****************************************/ /****************************************/ template void Dictionary::searchBenjTempl(const wstring &iWord, list &oWordList) const { if (iWord == L"") return; wchar_t wordtst[DIC_WORD_MAX]; wcscpy(wordtst + 3, iWord.c_str()); const DAWG_EDGE *edge0, *edge1, *edge2, *edgetst; edge0 = getEdgeAt(getRoot()); edge0 = getEdgeAt(edge0->ptr); do { wordtst[0] = getHeader().getCharFromCode(edge0->chr); edge1 = getEdgeAt(edge0->ptr); do { wordtst[1] = getHeader().getCharFromCode(edge1->chr); edge2 = getEdgeAt(edge1->ptr); do { edgetst = seekEdgePtr(iWord.c_str(), edge2); if (edgetst->term) { wordtst[2] = getHeader().getCharFromCode(edge2->chr); oWordList.push_back(wordtst); } } while (!(*edge2++).last); } while (!(*edge1++).last); } while (!(*edge0++).last); } void Dictionary::searchBenj(const wstring &iWord, list &oWordList) const { if (getHeader().getVersion() == 0) searchBenjTempl(iWord, oWordList); else searchBenjTempl(iWord, oWordList); } /****************************************/ /****************************************/ struct params_cross_t { int wordlen; wchar_t mask[DIC_WORD_MAX]; }; template void Dictionary::searchCrossRecTempl(struct params_cross_t *params, list &oWordList, const DAWG_EDGE *edgeptr) const { const DAWG_EDGE *current = getEdgeAt(edgeptr->ptr); if (params->mask[params->wordlen] == '\0' && edgeptr->term) { oWordList.push_back(params->mask); } else if (params->mask[params->wordlen] == '.') { do { params->mask[params->wordlen] = getHeader().getCharFromCode(current->chr); params->wordlen ++; searchCrossRecTempl(params, oWordList, current); params->wordlen --; params->mask[params->wordlen] = '.'; } while (!(*current++).last); } else { do { if (current->chr == getHeader().getCodeFromChar(params->mask[params->wordlen])) { params->wordlen ++; searchCrossRecTempl(params, oWordList, current); params->wordlen --; break; } } while (!(*current++).last); } } void Dictionary::searchCross(const wstring &iMask, list &oWordList) const { if (iMask == L"") return; struct params_cross_t params; int i; for (i = 0; i < DIC_WORD_MAX && iMask[i]; i++) { if (iswalpha(iMask[i])) params.mask[i] = towupper(iMask[i]); else params.mask[i] = '.'; } params.mask[i] = '\0'; params.wordlen = 0; if (getHeader().getVersion() == 0) { searchCrossRecTempl(¶ms, oWordList, getEdgeAt(getRoot())); } else { searchCrossRecTempl(¶ms, oWordList, getEdgeAt(getRoot())); } } /****************************************/ /****************************************/ struct params_regexp_t { int minlength; int maxlength; Automaton *automaton_field; struct search_RegE_list_t *charlist; char word[DIC_WORD_MAX]; int wordlen; }; template void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params, int state, const DAWG_EDGE *edgeptr, list &oWordList) const { int next_state; /* if we have a valid word we store it */ if (params->automaton_field->accept(state) && edgeptr->term) { int l = strlen(params->word); if (params->minlength <= l && params->maxlength >= l) { oWordList.push_back(params->word); } } /* we now drive the search by exploring the dictionary */ const DAWG_EDGE *current = getEdgeAt(edgeptr->ptr); do { /* the current letter is current->chr */ next_state = params->automaton_field->getNextState(state, current->chr); /* 1: the letter appears in the automaton as is */ if (next_state) { params->word[params->wordlen] = current->chr + 'a' - 1; params->wordlen ++; searchRegexpRecTempl(params, next_state, current, oWordList); params->wordlen --; params->word[params->wordlen] = '\0'; } } while (!(*current++).last); } void Dictionary::searchRegExpInner(const string &iRegexp, list &oWordList, struct search_RegE_list_t *iList) const { int ptl[REGEXP_MAX+1]; int PS [REGEXP_MAX+1]; /* (expr)# */ char stringbuf[250]; sprintf(stringbuf, "(%s)#", iRegexp.c_str()); for (int i = 0; i < REGEXP_MAX; i++) { PS[i] = 0; ptl[i] = 0; } struct regexp_error_report_t report; report.pos1 = 0; report.pos2 = 0; report.msg[0] = '\0'; /* parsing */ yyscan_t scanner; regexplex_init( &scanner ); YY_BUFFER_STATE buf = regexp_scan_string(stringbuf, scanner); NODE *root = NULL; int value = regexpparse(scanner , &root, iList, &report); regexp_delete_buffer(buf, scanner); regexplex_destroy(scanner); if (value) { #ifdef DEBUG_FLEX_IS_BROKEN fprintf(stderr, "parser error at pos %d - %d: %s\n", report.pos1, report.pos2, report.msg); #endif regexp_delete_tree(root); return ; } int n = 1; int p = 1; regexp_parcours(root, &p, &n, ptl); PS [0] = p - 1; ptl[0] = p - 1; regexp_possuivante(root, PS); Automaton *a = new Automaton(root->PP, ptl, PS, iList); if (a) { struct params_regexp_t params; params.minlength = iList->minlength; params.maxlength = iList->maxlength; params.automaton_field = a; params.charlist = iList; memset(params.word, '\0', sizeof(params.word)); params.wordlen = 0; if (getHeader().getVersion() == 0) { searchRegexpRecTempl(¶ms, a->getInitId(), getEdgeAt(getRoot()), oWordList); } else { searchRegexpRecTempl(¶ms, a->getInitId(), getEdgeAt(getRoot()), oWordList); } delete a; } regexp_delete_tree(root); } void Dictionary::searchRegExp(const wstring &iRegexp, list &oWordList, struct search_RegE_list_t *iList) const { if (iRegexp == L"") return; list tmpWordList; // Do the actual work searchRegExpInner(convertToMb(iRegexp), tmpWordList, iList); list::const_iterator it; for (it = tmpWordList.begin(); it != tmpWordList.end(); it++) { oWordList.push_back(convertToWc(*it)); } }