mirror of
git://git.savannah.nongnu.org/eliot.git
synced 2025-01-18 10:26:15 +01:00
567 lines
16 KiB
C++
567 lines
16 KiB
C++
|
/*****************************************************************************
|
||
|
* Eliot
|
||
|
* Copyright (C) 2002-2007 Antoine Fraboulet
|
||
|
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
|
||
|
*
|
||
|
* This program is free software; you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation; either version 2 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* This program is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with this program; if not, write to the Free Software
|
||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||
|
*****************************************************************************/
|
||
|
|
||
|
/**
|
||
|
* \file dic_search.cpp
|
||
|
* \brief Dictionary lookup functions
|
||
|
* \author Antoine Fraboulet
|
||
|
* \date 2002
|
||
|
*/
|
||
|
|
||
|
#include <cstdlib>
|
||
|
#include <cstring>
|
||
|
#include <cwchar>
|
||
|
#include <cwctype>
|
||
|
|
||
|
#include "dic_internals.h"
|
||
|
#include "dic.h"
|
||
|
#include "header.h"
|
||
|
#include "encoding.h"
|
||
|
#include "regexp.h"
|
||
|
#include "libdic_a-ery.h" /* generated by bison */
|
||
|
#include "libdic_a-erl.h" /* generated by flex */
|
||
|
#include "automaton.h"
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Function prototype for bison generated parser
|
||
|
*/
|
||
|
int regexpparse(yyscan_t scanner, NODE** root,
|
||
|
struct search_RegE_list_t *iList,
|
||
|
struct regexp_error_report_t *err);
|
||
|
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE *eptr) const
|
||
|
{
|
||
|
if (*s)
|
||
|
{
|
||
|
const DAWG_EDGE *p = getEdgeAt<DAWG_EDGE>(eptr->ptr);
|
||
|
do
|
||
|
{
|
||
|
if (p->chr == getHeader().getCodeFromChar(*s))
|
||
|
return seekEdgePtr(s + 1, p);
|
||
|
} while (!(*p++).last);
|
||
|
return getEdgeAt<DAWG_EDGE>(0);
|
||
|
}
|
||
|
else
|
||
|
return eptr;
|
||
|
}
|
||
|
|
||
|
|
||
|
bool Dictionary::searchWord(const wstring &iWord) const
|
||
|
{
|
||
|
if (!validateLetters(iWord))
|
||
|
return false;
|
||
|
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
{
|
||
|
const DicEdgeOld *e =
|
||
|
seekEdgePtr(iWord.c_str(), getEdgeAt<DicEdgeOld>(getRoot()));
|
||
|
return e->term;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const DicEdge *e =
|
||
|
seekEdgePtr(iWord.c_str(), getEdgeAt<DicEdge>(getRoot()));
|
||
|
return e->term;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Global variables for searchWordByLen:
|
||
|
*
|
||
|
* A pointer to the structure is passed as a parameter
|
||
|
* so that all the search_* variables appear to the functions
|
||
|
* as global but the code remains re-entrant.
|
||
|
* Should be better to change the algorithm ...
|
||
|
*/
|
||
|
|
||
|
struct params_7plus1_t
|
||
|
{
|
||
|
wchar_t added_char;
|
||
|
map<wchar_t, list<wstring> > *results;
|
||
|
int search_len;
|
||
|
wchar_t search_wordtst[DIC_WORD_MAX];
|
||
|
char search_letters[63];
|
||
|
};
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::searchWordByLen(struct params_7plus1_t *params,
|
||
|
int i, const DAWG_EDGE *edgeptr) const
|
||
|
{
|
||
|
/* depth first search in the dictionary */
|
||
|
do
|
||
|
{
|
||
|
/* the test is false only when reach the end-node */
|
||
|
if (edgeptr->chr)
|
||
|
{
|
||
|
/* is the letter available in search_letters */
|
||
|
if (params->search_letters[edgeptr->chr])
|
||
|
{
|
||
|
params->search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr);
|
||
|
params->search_letters[edgeptr->chr] --;
|
||
|
if (i == params->search_len)
|
||
|
{
|
||
|
if (edgeptr->term)
|
||
|
{
|
||
|
(*params->results)[params->added_char].push_back(params->search_wordtst);
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
searchWordByLen(params, i + 1, getEdgeAt<DAWG_EDGE>(edgeptr->ptr));
|
||
|
}
|
||
|
params->search_letters[edgeptr->chr] ++;
|
||
|
params->search_wordtst[i] = L'\0';
|
||
|
}
|
||
|
|
||
|
/* the letter is of course available if we have a joker available */
|
||
|
if (params->search_letters[0])
|
||
|
{
|
||
|
params->search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr);
|
||
|
params->search_letters[0] --;
|
||
|
if (i == params->search_len)
|
||
|
{
|
||
|
if (edgeptr->term)
|
||
|
{
|
||
|
(*params->results)[params->added_char].push_back(params->search_wordtst);
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
searchWordByLen(params, i + 1, getEdgeAt<DAWG_EDGE>(edgeptr->ptr));
|
||
|
}
|
||
|
params->search_letters[0] ++;
|
||
|
params->search_wordtst[i] = L'\0';
|
||
|
}
|
||
|
}
|
||
|
} while (! (*edgeptr++).last);
|
||
|
}
|
||
|
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::search7pl1Templ(const wstring &iRack,
|
||
|
map<wchar_t, list<wstring> > &oWordList,
|
||
|
bool joker) const
|
||
|
{
|
||
|
if (iRack == L"" || iRack.size() > DIC_WORD_MAX)
|
||
|
return;
|
||
|
|
||
|
struct params_7plus1_t params;
|
||
|
|
||
|
for (unsigned int i = 0; i < sizeof(params.search_letters); i++)
|
||
|
params.search_letters[i] = 0;
|
||
|
|
||
|
/*
|
||
|
* the letters are verified and changed to the dic internal
|
||
|
* representation (using getCodeFromChar(*r))
|
||
|
*/
|
||
|
int wordlen = 0;
|
||
|
for (const wchar_t* r = iRack.c_str(); *r; r++)
|
||
|
{
|
||
|
if (iswalpha(*r))
|
||
|
{
|
||
|
params.search_letters[getHeader().getCodeFromChar(*r)]++;
|
||
|
wordlen++;
|
||
|
}
|
||
|
else if (*r == L'?')
|
||
|
{
|
||
|
if (joker)
|
||
|
{
|
||
|
params.search_letters[0]++;
|
||
|
wordlen++;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
oWordList[0].push_back(L"** joker **");
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (wordlen < 1)
|
||
|
return;
|
||
|
|
||
|
const DAWG_EDGE *root_edge = getEdgeAt<DAWG_EDGE>(getRoot());
|
||
|
root_edge = getEdgeAt<DAWG_EDGE>(root_edge->ptr);
|
||
|
|
||
|
params.results = &oWordList;
|
||
|
|
||
|
/* search for all the words that can be done with the letters */
|
||
|
params.added_char = L'\0';
|
||
|
params.search_len = wordlen - 1;
|
||
|
params.search_wordtst[wordlen] = L'\0';
|
||
|
searchWordByLen(¶ms, 0, root_edge);
|
||
|
|
||
|
/* search for all the words that can be done with the letters +1 */
|
||
|
params.search_len = wordlen;
|
||
|
params.search_wordtst[wordlen + 1] = L'\0';
|
||
|
const wstring &letters = getHeader().getLetters();
|
||
|
for (unsigned int i = 0; i < letters.size(); i++)
|
||
|
{
|
||
|
params.added_char = letters[i];
|
||
|
unsigned int code = getHeader().getCodeFromChar(letters[i]);
|
||
|
params.search_letters[code]++;
|
||
|
|
||
|
searchWordByLen(¶ms, 0, root_edge);
|
||
|
|
||
|
params.search_letters[code]--;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::search7pl1(const wstring &iRack,
|
||
|
map<wchar_t, list<wstring> > &oWordList,
|
||
|
bool joker) const
|
||
|
{
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
search7pl1Templ<DicEdgeOld>(iRack, oWordList, joker);
|
||
|
else
|
||
|
search7pl1Templ<DicEdge>(iRack, oWordList, joker);
|
||
|
}
|
||
|
|
||
|
/****************************************/
|
||
|
/****************************************/
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::searchRaccTempl(const wstring &iWord, list<wstring> &oWordList) const
|
||
|
{
|
||
|
if (iWord == L"")
|
||
|
return;
|
||
|
|
||
|
/* search_racc will try to add a letter in front and at the end of a word */
|
||
|
|
||
|
/* let's try for the front */
|
||
|
wchar_t wordtst[DIC_WORD_MAX];
|
||
|
wcscpy(wordtst + 1, iWord.c_str());
|
||
|
const wstring &letters = getHeader().getLetters();
|
||
|
for (unsigned int i = 0; i <= letters.size(); i++)
|
||
|
{
|
||
|
wordtst[0] = letters[i];
|
||
|
if (searchWord(wordtst))
|
||
|
oWordList.push_back(wordtst);
|
||
|
}
|
||
|
|
||
|
/* add a letter at the end */
|
||
|
int i;
|
||
|
for (i = 0; iWord[i]; i++)
|
||
|
wordtst[i] = iWord[i];
|
||
|
|
||
|
wordtst[i ] = '\0';
|
||
|
wordtst[i+1] = '\0';
|
||
|
|
||
|
const DAWG_EDGE *edge_seek =
|
||
|
seekEdgePtr(iWord.c_str(), getEdgeAt<DAWG_EDGE>(getRoot()));
|
||
|
|
||
|
/* points to what the next letter can be */
|
||
|
const DAWG_EDGE *edge = getEdgeAt<DAWG_EDGE>(edge_seek->ptr);
|
||
|
|
||
|
if (edge != getEdgeAt<DAWG_EDGE>(0))
|
||
|
{
|
||
|
do
|
||
|
{
|
||
|
if (edge->term)
|
||
|
{
|
||
|
wordtst[i] = getHeader().getCharFromCode(edge->chr);
|
||
|
oWordList.push_back(wordtst);
|
||
|
}
|
||
|
} while (!(*edge++).last);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::searchRacc(const wstring &iWord, list<wstring> &oWordList) const
|
||
|
{
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
searchRaccTempl<DicEdgeOld>(iWord, oWordList);
|
||
|
else
|
||
|
searchRaccTempl<DicEdge>(iWord, oWordList);
|
||
|
}
|
||
|
|
||
|
/****************************************/
|
||
|
/****************************************/
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::searchBenjTempl(const wstring &iWord, list<wstring> &oWordList) const
|
||
|
{
|
||
|
if (iWord == L"")
|
||
|
return;
|
||
|
|
||
|
wchar_t wordtst[DIC_WORD_MAX];
|
||
|
wcscpy(wordtst + 3, iWord.c_str());
|
||
|
const DAWG_EDGE *edge0, *edge1, *edge2, *edgetst;
|
||
|
edge0 = getEdgeAt<DAWG_EDGE>(getRoot());
|
||
|
edge0 = getEdgeAt<DAWG_EDGE>(edge0->ptr);
|
||
|
do
|
||
|
{
|
||
|
wordtst[0] = getHeader().getCharFromCode(edge0->chr);
|
||
|
edge1 = getEdgeAt<DAWG_EDGE>(edge0->ptr);
|
||
|
do
|
||
|
{
|
||
|
wordtst[1] = getHeader().getCharFromCode(edge1->chr);
|
||
|
edge2 = getEdgeAt<DAWG_EDGE>(edge1->ptr);
|
||
|
do
|
||
|
{
|
||
|
edgetst = seekEdgePtr(iWord.c_str(), edge2);
|
||
|
if (edgetst->term)
|
||
|
{
|
||
|
wordtst[2] = getHeader().getCharFromCode(edge2->chr);
|
||
|
oWordList.push_back(wordtst);
|
||
|
}
|
||
|
} while (!(*edge2++).last);
|
||
|
} while (!(*edge1++).last);
|
||
|
} while (!(*edge0++).last);
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::searchBenj(const wstring &iWord, list<wstring> &oWordList) const
|
||
|
{
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
searchBenjTempl<DicEdgeOld>(iWord, oWordList);
|
||
|
else
|
||
|
searchBenjTempl<DicEdge>(iWord, oWordList);
|
||
|
}
|
||
|
|
||
|
/****************************************/
|
||
|
/****************************************/
|
||
|
|
||
|
struct params_cross_t
|
||
|
{
|
||
|
int wordlen;
|
||
|
wchar_t mask[DIC_WORD_MAX];
|
||
|
};
|
||
|
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::searchCrossRecTempl(struct params_cross_t *params,
|
||
|
list<wstring> &oWordList,
|
||
|
const DAWG_EDGE *edgeptr) const
|
||
|
{
|
||
|
const DAWG_EDGE *current = getEdgeAt<DAWG_EDGE>(edgeptr->ptr);
|
||
|
|
||
|
if (params->mask[params->wordlen] == '\0' && edgeptr->term)
|
||
|
{
|
||
|
oWordList.push_back(params->mask);
|
||
|
}
|
||
|
else if (params->mask[params->wordlen] == '.')
|
||
|
{
|
||
|
do
|
||
|
{
|
||
|
params->mask[params->wordlen] = getHeader().getCharFromCode(current->chr);
|
||
|
params->wordlen ++;
|
||
|
searchCrossRecTempl(params, oWordList, current);
|
||
|
params->wordlen --;
|
||
|
params->mask[params->wordlen] = '.';
|
||
|
}
|
||
|
while (!(*current++).last);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
do
|
||
|
{
|
||
|
if (current->chr == getHeader().getCodeFromChar(params->mask[params->wordlen]))
|
||
|
{
|
||
|
params->wordlen ++;
|
||
|
searchCrossRecTempl(params, oWordList, current);
|
||
|
params->wordlen --;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
while (!(*current++).last);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::searchCross(const wstring &iMask, list<wstring> &oWordList) const
|
||
|
{
|
||
|
if (iMask == L"")
|
||
|
return;
|
||
|
|
||
|
struct params_cross_t params;
|
||
|
|
||
|
int i;
|
||
|
for (i = 0; i < DIC_WORD_MAX && iMask[i]; i++)
|
||
|
{
|
||
|
if (iswalpha(iMask[i]))
|
||
|
params.mask[i] = towupper(iMask[i]);
|
||
|
else
|
||
|
params.mask[i] = '.';
|
||
|
}
|
||
|
params.mask[i] = '\0';
|
||
|
|
||
|
params.wordlen = 0;
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
{
|
||
|
searchCrossRecTempl(¶ms, oWordList,
|
||
|
getEdgeAt<DicEdgeOld>(getRoot()));
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
searchCrossRecTempl(¶ms, oWordList,
|
||
|
getEdgeAt<DicEdge>(getRoot()));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/****************************************/
|
||
|
/****************************************/
|
||
|
|
||
|
struct params_regexp_t
|
||
|
{
|
||
|
int minlength;
|
||
|
int maxlength;
|
||
|
Automaton *automaton_field;
|
||
|
struct search_RegE_list_t *charlist;
|
||
|
char word[DIC_WORD_MAX];
|
||
|
int wordlen;
|
||
|
};
|
||
|
|
||
|
|
||
|
template <typename DAWG_EDGE>
|
||
|
void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
|
||
|
int state,
|
||
|
const DAWG_EDGE *edgeptr,
|
||
|
list<string> &oWordList) const
|
||
|
{
|
||
|
int next_state;
|
||
|
/* if we have a valid word we store it */
|
||
|
if (params->automaton_field->accept(state) && edgeptr->term)
|
||
|
{
|
||
|
int l = strlen(params->word);
|
||
|
if (params->minlength <= l &&
|
||
|
params->maxlength >= l)
|
||
|
{
|
||
|
oWordList.push_back(params->word);
|
||
|
}
|
||
|
}
|
||
|
/* we now drive the search by exploring the dictionary */
|
||
|
const DAWG_EDGE *current = getEdgeAt<DAWG_EDGE>(edgeptr->ptr);
|
||
|
do
|
||
|
{
|
||
|
/* the current letter is current->chr */
|
||
|
next_state = params->automaton_field->getNextState(state, current->chr);
|
||
|
/* 1: the letter appears in the automaton as is */
|
||
|
if (next_state)
|
||
|
{
|
||
|
params->word[params->wordlen] = current->chr + 'a' - 1;
|
||
|
params->wordlen ++;
|
||
|
searchRegexpRecTempl(params, next_state, current, oWordList);
|
||
|
params->wordlen --;
|
||
|
params->word[params->wordlen] = '\0';
|
||
|
}
|
||
|
} while (!(*current++).last);
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::searchRegExpInner(const string &iRegexp,
|
||
|
list<string> &oWordList,
|
||
|
struct search_RegE_list_t *iList) const
|
||
|
{
|
||
|
int ptl[REGEXP_MAX+1];
|
||
|
int PS [REGEXP_MAX+1];
|
||
|
|
||
|
/* (expr)# */
|
||
|
char stringbuf[250];
|
||
|
sprintf(stringbuf, "(%s)#", iRegexp.c_str());
|
||
|
for (int i = 0; i < REGEXP_MAX; i++)
|
||
|
{
|
||
|
PS[i] = 0;
|
||
|
ptl[i] = 0;
|
||
|
}
|
||
|
|
||
|
struct regexp_error_report_t report;
|
||
|
report.pos1 = 0;
|
||
|
report.pos2 = 0;
|
||
|
report.msg[0] = '\0';
|
||
|
|
||
|
/* parsing */
|
||
|
yyscan_t scanner;
|
||
|
regexplex_init( &scanner );
|
||
|
YY_BUFFER_STATE buf = regexp_scan_string(stringbuf, scanner);
|
||
|
NODE *root = NULL;
|
||
|
int value = regexpparse(scanner , &root, iList, &report);
|
||
|
regexp_delete_buffer(buf, scanner);
|
||
|
regexplex_destroy(scanner);
|
||
|
|
||
|
if (value)
|
||
|
{
|
||
|
#ifdef DEBUG_FLEX_IS_BROKEN
|
||
|
fprintf(stderr, "parser error at pos %d - %d: %s\n",
|
||
|
report.pos1, report.pos2, report.msg);
|
||
|
#endif
|
||
|
regexp_delete_tree(root);
|
||
|
return ;
|
||
|
}
|
||
|
|
||
|
int n = 1;
|
||
|
int p = 1;
|
||
|
regexp_parcours(root, &p, &n, ptl);
|
||
|
PS [0] = p - 1;
|
||
|
ptl[0] = p - 1;
|
||
|
|
||
|
regexp_possuivante(root, PS);
|
||
|
|
||
|
Automaton *a = new Automaton(root->PP, ptl, PS, iList);
|
||
|
if (a)
|
||
|
{
|
||
|
struct params_regexp_t params;
|
||
|
params.minlength = iList->minlength;
|
||
|
params.maxlength = iList->maxlength;
|
||
|
params.automaton_field = a;
|
||
|
params.charlist = iList;
|
||
|
memset(params.word, '\0', sizeof(params.word));
|
||
|
params.wordlen = 0;
|
||
|
if (getHeader().getVersion() == 0)
|
||
|
{
|
||
|
searchRegexpRecTempl(¶ms, a->getInitId(),
|
||
|
getEdgeAt<DicEdgeOld>(getRoot()), oWordList);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
searchRegexpRecTempl(¶ms, a->getInitId(),
|
||
|
getEdgeAt<DicEdge>(getRoot()), oWordList);
|
||
|
}
|
||
|
|
||
|
delete a;
|
||
|
}
|
||
|
regexp_delete_tree(root);
|
||
|
}
|
||
|
|
||
|
|
||
|
void Dictionary::searchRegExp(const wstring &iRegexp,
|
||
|
list<wstring> &oWordList,
|
||
|
struct search_RegE_list_t *iList) const
|
||
|
{
|
||
|
if (iRegexp == L"")
|
||
|
return;
|
||
|
|
||
|
list<string> tmpWordList;
|
||
|
// Do the actual work
|
||
|
searchRegExpInner(convertToMb(iRegexp), tmpWordList, iList);
|
||
|
|
||
|
list<string>::const_iterator it;
|
||
|
for (it = tmpWordList.begin(); it != tmpWordList.end(); it++)
|
||
|
{
|
||
|
oWordList.push_back(convertToWc(*it));
|
||
|
}
|
||
|
}
|
||
|
|