eliot/dic/dic_search.cpp

/*****************************************************************************
 * Eliot
 * Copyright (C) 2002-2009 Antoine Fraboulet & Olivier Teulière
 * Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
 *          Olivier Teulière  <ipkiss @@ gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *****************************************************************************/

#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <algorithm>

#include "dic_internals.h"
#include "dic_exception.h"
#include "dic.h"
#include "header.h"
#include "encoding.h"
#include "regexp.h"
#include "automaton.h"
#include "grammar.h"


// Capacity reserved in the result vectors by the search functions of this
// file when the caller does not limit the number of results (iMaxResults == 0)
static const unsigned int DEFAULT_VECT_ALLOC = 100;


/**
 * Follow the characters of a string through the dictionary graph.
 *
 * @param s     Null-terminated string; each character is converted with
 *              getHeader().getCodeFromChar() before being compared to the
 *              edge codes
 * @param eptr  Edge whose children are examined for the first character
 * @return the edge reached once the whole string has been consumed (eptr
 *         itself when the string is empty), or getEdgeAt(0) as soon as one
 *         character has no matching edge
 */
const DicEdge* Dictionary::seekEdgePtr(const wchar_t* s, const DicEdge *eptr) const
{
    const DicEdge *reached = eptr;
    for (const wchar_t *cursor = s; *cursor != L'\0'; ++cursor)
    {
        // Scan the children of the edge reached so far, up to and
        // including the one flagged as "last"
        const DicEdge *candidate = getEdgeAt(reached->ptr);
        bool matched = false;
        for (;;)
        {
            if (candidate->chr == getHeader().getCodeFromChar(*cursor))
            {
                matched = true;
                break;
            }
            if (candidate->last)
                break;
            ++candidate;
        }

        // No child carries this character: the string is not in the graph
        if (!matched)
            return getEdgeAt(0);

        reached = candidate;
    }
    return reached;
}


bool Dictionary::searchWord(const wstring &iWord) const
{
    if (!validateLetters(iWord))
        return false;

    const DicEdge *e = seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot()));
    return e->term;
}


/**
 * Search state shared by search7pl1() and searchWordByLen().
 *
 * A reference to this structure is passed down the recursion, so that
 * all the search_* variables look like globals to the search functions
 * while the code remains re-entrant.
 * It would be better to change the algorithm...
 */

struct params_7plus1_t
{
    // Key under which the solutions are stored in the results map:
    // 0 for the search without extra letter, otherwise the code of the
    // letter added to the rack
    unsigned int added_code;
    // Display string of the added letter (set by search7pl1(); not read by
    // the search code visible in this file)
    wdstring added_display;
    // Destination of the solutions, indexed by added_code (not owned)
    map<unsigned int, vector<wdstring> > *results;
    // Index of the last letter of the searched words (word length - 1)
    int search_len;
    // Word being built during the depth-first search, null-terminated
    wchar_t search_wordtst[DIC_WORD_MAX];
    // Number of occurrences still available for each letter code;
    // index 0 counts the jokers.
    // NOTE(review): 63 presumably matches the number of letter codes
    // supported by the dictionary format -- confirm
    char search_letters[63];
};

void Dictionary::searchWordByLen(struct params_7plus1_t &params,
                                 int i, const DicEdge *edgeptr) const
{
    /* depth first search in the dictionary */
    do
    {
        /* the test is false only when reach the end-node */
        if (edgeptr->chr)
        {
            /* is the letter available in search_letters */
            if (params.search_letters[edgeptr->chr])
            {
                params.search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr);
                params.search_letters[edgeptr->chr] --;
                if (i == params.search_len)
                {
                    if (edgeptr->term)
                    {
                        // Add the solution
                        vector<wdstring> &sols = (*params.results)[params.added_code];
                        if (sols.empty() || sols.back() != params.search_wordtst)
                            sols.push_back(convertToDisplay(params.search_wordtst));
                    }
                }
                else
                {
                    searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr));
                }
                params.search_letters[edgeptr->chr] ++;
                params.search_wordtst[i] = L'\0';
            }

            /* the letter is of course available if we have a joker available */
            if (params.search_letters[0])
            {
                params.search_wordtst[i] = getHeader().getCharFromCode(edgeptr->chr);
                params.search_letters[0] --;
                if (i == params.search_len)
                {
                    if (edgeptr->term)
                    {
                        // Add the solution
                        vector<wdstring> &sols = (*params.results)[params.added_code];
                        if (sols.empty() || sols.back() != params.search_wordtst)
                            sols.push_back(convertToDisplay(params.search_wordtst));
                    }
                }
                else
                {
                    searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr));
                }
                params.search_letters[0] ++;
                params.search_wordtst[i] = L'\0';
            }
        }
    } while (! (*edgeptr++).last);
}


/**
 * Search the words that can be written with all the letters of a rack,
 * and with all the letters of the rack plus one additional letter.
 *
 * Characters of the rack which are neither alphabetic (iswalpha) nor '?'
 * are ignored.
 *
 * @param iRack      Letters of the rack; '?' stands for a joker
 * @param oWordList  Receives the solutions: key 0 holds the words using
 *                   exactly the rack letters, any other key holds the words
 *                   obtained by adding the letter with that code
 * @param joker      Whether jokers are accepted in the rack. If not and the
 *                   rack contains '?', the string "** joker **" is appended
 *                   to oWordList[0] and the search is abandoned
 */
void Dictionary::search7pl1(const wstring &iRack,
                            map<unsigned int, vector<wdstring> > &oWordList,
                            bool joker) const
{
    if (iRack == L"" || iRack.size() > DIC_WORD_MAX)
        return;

    struct params_7plus1_t params;

    // Number of wchar_t slots of the work buffer (terminator included)
    const int bufferSize = static_cast<int>(
        sizeof(params.search_wordtst) / sizeof(params.search_wordtst[0]));

    for (unsigned int i = 0; i < sizeof(params.search_letters); i++)
        params.search_letters[i] = 0;

    /*
     * the letters are verified and changed to the dic internal
     * representation (using getCodeFromChar(*r))
     */
    // NOTE(review): the code returned by getCodeFromChar() is used as an
    // index in search_letters without range check -- confirm that it is
    // always smaller than the size of that array
    int wordlen = 0;
    for (const wchar_t* r = iRack.c_str(); *r; r++)
    {
        if (iswalpha(*r))
        {
            params.search_letters[getHeader().getCodeFromChar(*r)]++;
            wordlen++;
        }
        else if (*r == L'?')
        {
            if (joker)
            {
                params.search_letters[0]++;
                wordlen++;
            }
            else
            {
                oWordList[0].push_back(L"** joker **");
                return;
            }
        }
    }

    if (wordlen < 1)
        return;

    // A word of wordlen letters needs wordlen + 1 slots (terminator).
    // Without this check the terminator would be written past the end of
    // the buffer for the longest accepted racks.
    if (wordlen >= bufferSize)
        return;

    const DicEdge *root_edge = getEdgeAt(getRoot());
    root_edge = getEdgeAt(root_edge->ptr);

    params.results = &oWordList;

    /* search for all the words that can be done with the letters */
    params.added_code = 0;
    params.added_display = L"";
    params.search_len = wordlen - 1;
    params.search_wordtst[wordlen] = L'\0';
    searchWordByLen(params, 0, root_edge);

    // The "+1" search builds words of wordlen + 1 letters, which need
    // wordlen + 2 slots: skip it when they do not fit in the buffer
    if (wordlen + 1 >= bufferSize)
        return;

    /* search for all the words that can be done with the letters +1 */
    params.search_len = wordlen;
    params.search_wordtst[wordlen + 1] = L'\0';
    const wstring &letters = getHeader().getLetters();
    for (unsigned int i = 0; i < letters.size(); i++)
    {
        unsigned int code = getHeader().getCodeFromChar(letters[i]);
        params.added_code = code;
        params.added_display = getHeader().getDisplayStr(code);
        // Temporarily add the extra letter to the rack
        params.search_letters[code]++;

        searchWordByLen(params, 0, root_edge);

        params.search_letters[code]--;
    }
}

/****************************************/
/****************************************/

/**
 * Search the words obtained by adding one letter at the front or at the
 * end of a given word.
 *
 * @param iWord        Word to extend; nothing is done if it is empty
 * @param oWordList    Vector to which the solutions are appended, in display
 *                     form (the given word is shown in upper case). Front
 *                     extensions come first, then back extensions
 * @param iMaxResults  Maximum size allowed for oWordList; 0 means no limit
 */
void Dictionary::searchRacc(const wstring &iWord,
                            vector<wdstring> &oWordList,
                            unsigned int iMaxResults) const
{
    if (iWord == L"")
        return;

    // Allocate room for all the results
    if (iMaxResults)
        oWordList.reserve(iMaxResults);
    else
        oWordList.reserve(DEFAULT_VECT_ALLOC);

    // Transform the given word to make it suitable for display
    wdstring displayWord = convertToDisplay(iWord);
    // Make it uppercase
    std::transform(displayWord.begin(), displayWord.end(),
                   displayWord.begin(), towupper);

    // Try to add a letter at the front.
    // The bound is strict: letters[letters.size()] is the string terminator,
    // not a letter of the dictionary
    const wstring &letters = getHeader().getLetters();
    for (unsigned int i = 0; i < letters.size(); i++)
    {
        if (searchWord(letters[i] + iWord))
        {
            const wdstring &chr = getHeader().getDisplayStr(getHeader().getCodeFromChar(letters[i]));
            oWordList.push_back(chr + displayWord);
        }
        if (iMaxResults && oWordList.size() >= iMaxResults)
            return;
    }

    // Try to add a letter at the end
    const DicEdge *edge_seek =
        seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot()));

    // Point to what the next letter can be
    const DicEdge *edge = getEdgeAt(edge_seek->ptr);

    // Edge 0 is what seekEdgePtr() returns when the word is not found:
    // there is nothing to explore in that case
    if (edge != getEdgeAt(0))
    {
        do
        {
            if (edge->term)
            {
                oWordList.push_back(displayWord + getHeader().getDisplayStr(edge->chr));
                if (iMaxResults && oWordList.size() >= iMaxResults)
                    return;
            }
        } while (!(*edge++).last);
    }
}

/****************************************/
/****************************************/

void Dictionary::searchBenj(const wstring &iWord, vector<wdstring> &oWordList,
                            unsigned int iMaxResults) const
{
    if (iWord == L"")
        return;

    // Allocate room for all the results
    if (iMaxResults)
        oWordList.reserve(iMaxResults);
    else
        oWordList.reserve(DEFAULT_VECT_ALLOC);

    // Transform the given word to make it suitable for display
    wdstring displayWord = convertToDisplay(iWord);
    // Make it uppercase
    std::transform(displayWord.begin(), displayWord.end(),
                   displayWord.begin(), towupper);

    const DicEdge *edge0, *edge1, *edge2, *edgetst;
    edge0 = getEdgeAt(getRoot());
    edge0 = getEdgeAt(edge0->ptr);
    do
    {
        const wdstring &chr0 = getHeader().getDisplayStr(edge0->chr);
        edge1 = getEdgeAt(edge0->ptr);
        do
        {
            const wdstring &chr1 = getHeader().getDisplayStr(edge1->chr);
            edge2 = getEdgeAt(edge1->ptr);
            do
            {
                edgetst = seekEdgePtr(iWord.c_str(), edge2);
                if (edgetst->term)
                {
                    const wdstring &chr2 = getHeader().getDisplayStr(edge2->chr);
                    oWordList.push_back(chr0 + chr1 + chr2 + displayWord);
                    if (iMaxResults && oWordList.size() >= iMaxResults)
                        return;
                }
            } while (!(*edge2++).last);
        } while (!(*edge1++).last);
    } while (!(*edge0++).last);
}

/****************************************/
/****************************************/

/**
 * Constant parameters of a regular expression search, passed down the
 * recursion of searchRegexpRec()
 */
struct params_regexp_t
{
    // Minimal number of characters of the accepted words (inclusive)
    unsigned int minlength;
    // Maximal number of characters of the accepted words (inclusive)
    unsigned int maxlength;
    // Automaton driving the search; created and deleted by searchRegExp(),
    // not owned by this structure
    Automaton *automaton_field;
};


void Dictionary::searchRegexpRec(const struct params_regexp_t &params,
                                 int state,
                                 const DicEdge *edgeptr,
                                 vector<wdstring> &oWordList,
                                 unsigned int iMaxResults,
                                 const wdstring &iCurrWord,
                                 unsigned int iNbChars) const
{
    if (iMaxResults && oWordList.size() >= iMaxResults)
        return;

    int next_state;
    /* if we have a valid word we store it */
    if (params.automaton_field->accept(state) && edgeptr->term)
    {
        if (params.minlength <= iNbChars &&
            params.maxlength >= iNbChars)
        {
            oWordList.push_back(iCurrWord);
        }
    }
    /* we now drive the search by exploring the dictionary */
    const DicEdge *current = getEdgeAt(edgeptr->ptr);
    do
    {
        /* the current letter is current->chr */
        next_state = params.automaton_field->getNextState(state, current->chr);
        /* 1: the letter appears in the automaton as is */
        if (next_state)
        {
            searchRegexpRec(params, next_state, current, oWordList, iMaxResults,
                            iCurrWord + getHeader().getDisplayStr(current->chr), iNbChars + 1);
        }
    } while (!(*current++).last);
}


/**
 * Initialize the lists of letters with pre-defined lists
 * 0: all tiles
 * 1: vowels
 * 2: consonants
 * 3: user defined 1
 * 4: user defined 2
 * x: lists used during parsing
 */
static void initLetterLists(const Dictionary &iDic,
                            searchRegExpLists &iList)
{
    memset(&iList, 0, sizeof(iList));
    // Prepare the space for 5 items
    iList.symbl.assign(5, 0);
    iList.letters.assign(5, vector<bool>(DIC_LETTERS + 1, false));

    iList.symbl[0] = RE_ALL_MATCH; // All letters
    iList.symbl[1] = RE_VOWL_MATCH; // Vowels
    iList.symbl[2] = RE_CONS_MATCH; // Consonants
    iList.letters[0][0] = false;
    iList.letters[1][0] = false;
    iList.letters[2][0] = false;
    const wstring &allLetters = iDic.getHeader().getLetters();
    for (size_t i = 1; i <= allLetters.size(); ++i)
    {
        iList.letters[0][i] = true;
        iList.letters[1][i] = iDic.getHeader().isVowel(i);
        iList.letters[2][i] = iDic.getHeader().isConsonant(i);
    }

    iList.symbl[3] = RE_USR1_MATCH; // User defined list 1
    iList.symbl[4] = RE_USR2_MATCH; // User defined list 2
}


/**
 * Search the words of the dictionary matching a regular expression.
 *
 * @param iRegexp      Regular expression; an empty one gives no result
 * @param oWordList    Vector to which the matching words are appended
 * @param iMinLength   Minimal length of the words (inclusive)
 * @param iMaxLength   Maximal length of the words (inclusive)
 * @param iMaxResults  Maximum size allowed for oWordList; 0 means no limit
 * @return true if the search ran to completion, false if it was
 *         interrupted because the limit of iMaxResults was exceeded
 * @throw InvalidRegexpException if the regular expression cannot be parsed
 */
bool Dictionary::searchRegExp(const wstring &iRegexp,
                              vector<wdstring> &oWordList,
                              unsigned int iMinLength,
                              unsigned int iMaxLength,
                              unsigned int iMaxResults) const
{
    if (iRegexp == L"")
        return true;

    // Allocate room for all the results
    // XXX: is it really a good idea?
    if (iMaxResults)
        oWordList.reserve(iMaxResults);
    else
        oWordList.reserve(DEFAULT_VECT_ALLOC);

    // Parsing
    Node *root = NULL;
    searchRegExpLists llist;
    // Initialize the lists of letters
    initLetterLists(*this, llist);
    bool parsingOk = parseRegexp(*this, (iRegexp + L"#").c_str(), &root, llist);

    // A missing tree is handled like a parsing failure, because the tree is
    // dereferenced unconditionally below
    if (!parsingOk || root == NULL)
    {
        delete root;
        throw InvalidRegexpException(convertToMb(iRegexp));
    }

    Automaton *a = NULL;
    try
    {
        int ptl[REGEXP_MAX+1];
        uint64_t PS[REGEXP_MAX+1];

        // Clear both arrays entirely (they contain REGEXP_MAX + 1 elements)
        for (int i = 0; i <= REGEXP_MAX; i++)
        {
            PS[i] = 0;
            ptl[i] = 0;
        }

        int n = 1;
        int p = 1;
        root->traverse(p, n, ptl);
        PS [0] = p - 1;
        ptl[0] = p - 1;

        root->nextPos(PS);

        a = new Automaton(root->getFirstPos(), ptl, PS, llist);

        struct params_regexp_t params;
        params.minlength = iMinLength;
        params.maxlength = iMaxLength;
        params.automaton_field = a;
        // Ask for one result more than the limit, to be able to tell
        // whether the limit was exceeded
        searchRegexpRec(params, a->getInitId(),
                        getEdgeAt(getRoot()), oWordList,
                        iMaxResults ? iMaxResults + 1 : 0);
    }
    catch (...)
    {
        // Do not leak the automaton and the parsing tree on error
        delete a;
        delete root;
        throw;
    }
    delete a;
    delete root;

    // Check whether the maximum number of results was reached
    if (iMaxResults && oWordList.size() > iMaxResults)
    {
        // Drop the extra result, which was only used for the detection
        oWordList.pop_back();
        return false;
    }
    else
        return true;
}