Removed support for the old dictionary format. The code becomes simpler and (very slightly) faster.

This commit is contained in:
Olivier Teulière 2008-11-22 13:11:48 +00:00
parent 70be50c64e
commit 66538b4806
8 changed files with 195 additions and 432 deletions

View file

@ -50,24 +50,6 @@
const Dictionary *Dictionary::m_dic = NULL;
// Note: duplicated in header.cpp
#if defined(WORDS_BIGENDIAN)
/// Return the given 32-bit value with its four bytes in reverse order
static uint32_t swap4(uint32_t v)
{
    // Isolate each byte, from the least significant to the most significant
    const uint32_t byte0 = v & 0xFFu;
    const uint32_t byte1 = (v >> 8) & 0xFFu;
    const uint32_t byte2 = (v >> 16) & 0xFFu;
    const uint32_t byte3 = (v >> 24) & 0xFFu;

    // Reassemble them in the opposite order
    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
#endif
Dictionary::Dictionary(const string &iPath)
: m_dawg(NULL)
{
@ -112,21 +94,9 @@ Dictionary::~Dictionary()
void Dictionary::convertDataToArch()
{
if (m_header->getVersion() == 0)
for (unsigned int i = 0; i < (m_header->getNbEdgesUsed() + 1); i++)
{
#if defined(WORDS_BIGENDIAN)
for (unsigned int i = 0; i < (m_header->getNbEdgesUsed() + 1); i++)
{
m_dawg[i] = swap4(m_dawg[i]);
}
#endif
}
else
{
for (unsigned int i = 0; i < (m_header->getNbEdgesUsed() + 1); i++)
{
m_dawg[i] = ntohl(m_dawg[i]);
}
m_dawg[i] = ntohl(m_dawg[i]);
}
}
@ -164,10 +134,7 @@ dic_elt_t Dictionary::getNext(const dic_elt_t &e) const
dic_elt_t Dictionary::getSucc(const dic_elt_t &e) const
{
if (m_header->getVersion() == 0)
return reinterpret_cast<const DicEdgeOld*>(m_dawg + e)->ptr;
else
return reinterpret_cast<const DicEdge*>(m_dawg + e)->ptr;
return reinterpret_cast<const DicEdge*>(m_dawg + e)->ptr;
}
@ -179,10 +146,7 @@ dic_elt_t Dictionary::getRoot() const
dic_code_t Dictionary::getCode(const dic_elt_t &e) const
{
if (m_header->getVersion() == 0)
return reinterpret_cast<const DicEdgeOld*>(m_dawg + e)->chr;
else
return reinterpret_cast<const DicEdge*>(m_dawg + e)->chr;
return reinterpret_cast<const DicEdge*>(m_dawg + e)->chr;
}
@ -194,19 +158,13 @@ wchar_t Dictionary::getChar(const dic_elt_t &e) const
bool Dictionary::isLast(const dic_elt_t &e) const
{
if (m_header->getVersion() == 0)
return reinterpret_cast<const DicEdgeOld*>(m_dawg + e)->last;
else
return reinterpret_cast<const DicEdge*>(m_dawg + e)->last;
return reinterpret_cast<const DicEdge*>(m_dawg + e)->last;
}
bool Dictionary::isEndOfWord(const dic_elt_t &e) const
{
if (m_header->getVersion() == 0)
return reinterpret_cast<const DicEdgeOld*>(m_dawg + e)->term;
else
return reinterpret_cast<const DicEdge*>(m_dawg + e)->term;
return reinterpret_cast<const DicEdge*>(m_dawg + e)->term;
}

View file

@ -42,6 +42,7 @@ typedef unsigned char dic_code_t;
struct params_cross_t;
struct params_7plus1_t;
struct params_regexp_t;
class DicEdge;
class Dictionary
{
@ -162,7 +163,11 @@ public:
unsigned int charLookup(const dic_elt_t &iRoot, const wchar_t *iPattern) const;
/// Getter for the edge at the given position
const uint32_t *getEdgeAt(const dic_elt_t &iElt) const { return m_dawg + iElt; }
/// Return a DicEdge view of the element stored at index iElt of m_dawg
const DicEdge * getEdgeAt(const dic_elt_t &iElt) const
{
    return reinterpret_cast<const DicEdge*>(&m_dawg[iElt]);
}
/**
* Search for a word in the dictionary
@ -245,56 +250,29 @@ private:
void convertDataToArch();
void initializeTiles();
/// Return the element stored at index iElt of m_dawg, viewed as the
/// edge type chosen by the caller
template <typename DAWG_EDGE>
const DAWG_EDGE * getEdgeAt(const dic_elt_t &iElt) const
{
    return reinterpret_cast<const DAWG_EDGE*>(&m_dawg[iElt]);
}
/**
* Walk the dictionary until the end of the word
* @param s: current pointer to letters
* @param eptr: current edge in the dawg
*/
template <typename DAWG_EDGE>
const DAWG_EDGE * seekEdgePtr(const wchar_t *s, const DAWG_EDGE *eptr) const;
/// Helper for searchBenj()
template <typename DAWG_EDGE>
void searchBenjTempl(const wstring &iWord, vector<wstring> &oWordList,
unsigned int iMaxResults) const;
/// Helper for searchRacc()
template <typename DAWG_EDGE>
void searchRaccTempl(const wstring &iWord, vector<wstring> &oWordList,
unsigned int iMaxResults) const;
const DicEdge * seekEdgePtr(const wchar_t *s, const DicEdge *eptr) const;
/// Helper for searchCross()
template <typename DAWG_EDGE>
void searchCrossRecTempl(struct params_cross_t *params,
vector<wstring> &oWordList,
const DAWG_EDGE *edgeptr,
unsigned int iMaxResults) const;
void searchCrossRec(struct params_cross_t *params,
vector<wstring> &oWordList,
const DicEdge *edgeptr,
unsigned int iMaxResults) const;
/// Helper for search7pl1()
template <typename DAWG_EDGE>
void search7pl1Templ(const wstring &iRack,
map<wchar_t, vector<wstring> > &oWordList,
bool joker) const;
/// Second helper for search7pl1()
template <typename DAWG_EDGE>
void searchWordByLen(struct params_7plus1_t *params,
int i, const DAWG_EDGE *edgeptr) const;
int i, const DicEdge *edgeptr) const;
/// Helper for searchRegExp()
template <typename DAWG_EDGE>
void searchRegexpRecTempl(struct params_regexp_t *params,
int state,
const DAWG_EDGE *edgeptr,
vector<wstring> &oWordList,
unsigned int iMaxResults) const;
void searchRegexpRec(struct params_regexp_t *params,
int state,
const DicEdge *edgeptr,
vector<wstring> &oWordList,
unsigned int iMaxResults) const;
};
#endif /* _DIC_H_ */

View file

@ -38,22 +38,6 @@
* ----------------
*/
/// Edge used when the dictionary header reports version 0.
/// The bit-fields below add up to 32 bits.
struct __attribute__ ((packed)) DicEdgeOld
{
    uint32_t ptr  : 24; // index of another edge (passed to getEdgeAt())
    uint32_t term : 1;  // returned by Dictionary::isEndOfWord()
    uint32_t last : 1;  // returned by Dictionary::isLast()
    uint32_t fill : 1;  // presumably unused padding -- TODO confirm
    uint32_t chr  : 5;  // letter code, returned by Dictionary::getCode()

    /// Two edges are equal when their raw bytes are identical
    bool operator==(const DicEdgeOld &iOther) const
    {
        return 0 == memcmp(this, &iOther, sizeof(DicEdgeOld));
    }
};
struct __attribute__ ((packed)) DicEdge
{
public:

View file

@ -37,18 +37,17 @@
static const unsigned int DEFAULT_VECT_ALLOC = 100;
template <typename DAWG_EDGE>
const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE *eptr) const
const DicEdge* Dictionary::seekEdgePtr(const wchar_t* s, const DicEdge *eptr) const
{
if (*s)
{
const DAWG_EDGE *p = getEdgeAt<DAWG_EDGE>(eptr->ptr);
const DicEdge *p = getEdgeAt(eptr->ptr);
do
{
if (p->chr == getHeader().getCodeFromChar(*s))
return seekEdgePtr(s + 1, p);
} while (!(*p++).last);
return getEdgeAt<DAWG_EDGE>(0);
return getEdgeAt(0);
}
else
return eptr;
@ -60,18 +59,8 @@ bool Dictionary::searchWord(const wstring &iWord) const
if (!validateLetters(iWord))
return false;
if (getHeader().getVersion() == 0)
{
const DicEdgeOld *e =
seekEdgePtr(iWord.c_str(), getEdgeAt<DicEdgeOld>(getRoot()));
return e->term;
}
else
{
const DicEdge *e =
seekEdgePtr(iWord.c_str(), getEdgeAt<DicEdge>(getRoot()));
return e->term;
}
const DicEdge *e = seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot()));
return e->term;
}
@ -93,9 +82,8 @@ struct params_7plus1_t
char search_letters[63];
};
template <typename DAWG_EDGE>
void Dictionary::searchWordByLen(struct params_7plus1_t *params,
int i, const DAWG_EDGE *edgeptr) const
int i, const DicEdge *edgeptr) const
{
/* depth first search in the dictionary */
do
@ -120,7 +108,7 @@ void Dictionary::searchWordByLen(struct params_7plus1_t *params,
}
else
{
searchWordByLen(params, i + 1, getEdgeAt<DAWG_EDGE>(edgeptr->ptr));
searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr));
}
params->search_letters[edgeptr->chr] ++;
params->search_wordtst[i] = L'\0';
@ -143,7 +131,7 @@ void Dictionary::searchWordByLen(struct params_7plus1_t *params,
}
else
{
searchWordByLen(params, i + 1, getEdgeAt<DAWG_EDGE>(edgeptr->ptr));
searchWordByLen(params, i + 1, getEdgeAt(edgeptr->ptr));
}
params->search_letters[0] ++;
params->search_wordtst[i] = L'\0';
@ -153,10 +141,9 @@ void Dictionary::searchWordByLen(struct params_7plus1_t *params,
}
template <typename DAWG_EDGE>
void Dictionary::search7pl1Templ(const wstring &iRack,
map<wchar_t, vector<wstring> > &oWordList,
bool joker) const
void Dictionary::search7pl1(const wstring &iRack,
map<wchar_t, vector<wstring> > &oWordList,
bool joker) const
{
if (iRack == L"" || iRack.size() > DIC_WORD_MAX)
return;
@ -196,8 +183,8 @@ void Dictionary::search7pl1Templ(const wstring &iRack,
if (wordlen < 1)
return;
const DAWG_EDGE *root_edge = getEdgeAt<DAWG_EDGE>(getRoot());
root_edge = getEdgeAt<DAWG_EDGE>(root_edge->ptr);
const DicEdge *root_edge = getEdgeAt(getRoot());
root_edge = getEdgeAt(root_edge->ptr);
params.results = &oWordList;
@ -223,23 +210,12 @@ void Dictionary::search7pl1Templ(const wstring &iRack,
}
}
/// Forward the call to search7pl1Templ, instantiated with DicEdgeOld when the
/// header reports version 0 and with DicEdge otherwise
void Dictionary::search7pl1(const wstring &iRack,
                            map<wchar_t, vector<wstring> > &oWordList,
                            bool joker) const
{
    const bool isOldFormat = (getHeader().getVersion() == 0);
    if (!isOldFormat)
    {
        search7pl1Templ<DicEdge>(iRack, oWordList, joker);
        return;
    }
    search7pl1Templ<DicEdgeOld>(iRack, oWordList, joker);
}
/****************************************/
/****************************************/
template <typename DAWG_EDGE>
void Dictionary::searchRaccTempl(const wstring &iWord, vector<wstring> &oWordList,
unsigned int iMaxResults) const
void Dictionary::searchRacc(const wstring &iWord,
vector<wstring> &oWordList,
unsigned int iMaxResults) const
{
if (iWord == L"")
return;
@ -271,13 +247,13 @@ void Dictionary::searchRaccTempl(const wstring &iWord, vector<wstring> &oWordLis
wordtst[i ] = '\0';
wordtst[i+1] = '\0';
const DAWG_EDGE *edge_seek =
seekEdgePtr(iWord.c_str(), getEdgeAt<DAWG_EDGE>(getRoot()));
const DicEdge *edge_seek =
seekEdgePtr(iWord.c_str(), getEdgeAt(getRoot()));
/* points to what the next letter can be */
const DAWG_EDGE *edge = getEdgeAt<DAWG_EDGE>(edge_seek->ptr);
const DicEdge *edge = getEdgeAt(edge_seek->ptr);
if (edge != getEdgeAt<DAWG_EDGE>(0))
if (edge != getEdgeAt(0))
{
do
{
@ -292,21 +268,11 @@ void Dictionary::searchRaccTempl(const wstring &iWord, vector<wstring> &oWordLis
}
}
/// Forward the call to searchRaccTempl, instantiated with DicEdgeOld when the
/// header reports version 0 and with DicEdge otherwise
void Dictionary::searchRacc(const wstring &iWord,
                            vector<wstring> &oWordList,
                            unsigned int iMaxResults) const
{
    const bool isOldFormat = (getHeader().getVersion() == 0);
    if (!isOldFormat)
    {
        searchRaccTempl<DicEdge>(iWord, oWordList, iMaxResults);
        return;
    }
    searchRaccTempl<DicEdgeOld>(iWord, oWordList, iMaxResults);
}
/****************************************/
/****************************************/
template <typename DAWG_EDGE>
void Dictionary::searchBenjTempl(const wstring &iWord, vector<wstring> &oWordList,
unsigned int iMaxResults) const
void Dictionary::searchBenj(const wstring &iWord, vector<wstring> &oWordList,
unsigned int iMaxResults) const
{
if (iWord == L"")
return;
@ -319,17 +285,17 @@ void Dictionary::searchBenjTempl(const wstring &iWord, vector<wstring> &oWordLis
wchar_t wordtst[DIC_WORD_MAX];
wcscpy(wordtst + 3, iWord.c_str());
const DAWG_EDGE *edge0, *edge1, *edge2, *edgetst;
edge0 = getEdgeAt<DAWG_EDGE>(getRoot());
edge0 = getEdgeAt<DAWG_EDGE>(edge0->ptr);
const DicEdge *edge0, *edge1, *edge2, *edgetst;
edge0 = getEdgeAt(getRoot());
edge0 = getEdgeAt(edge0->ptr);
do
{
wordtst[0] = getHeader().getCharFromCode(edge0->chr);
edge1 = getEdgeAt<DAWG_EDGE>(edge0->ptr);
edge1 = getEdgeAt(edge0->ptr);
do
{
wordtst[1] = getHeader().getCharFromCode(edge1->chr);
edge2 = getEdgeAt<DAWG_EDGE>(edge1->ptr);
edge2 = getEdgeAt(edge1->ptr);
do
{
edgetst = seekEdgePtr(iWord.c_str(), edge2);
@ -345,16 +311,6 @@ void Dictionary::searchBenjTempl(const wstring &iWord, vector<wstring> &oWordLis
} while (!(*edge0++).last);
}
/// Forward the call to searchBenjTempl, instantiated with DicEdgeOld when the
/// header reports version 0 and with DicEdge otherwise
void Dictionary::searchBenj(const wstring &iWord, vector<wstring> &oWordList,
                            unsigned int iMaxResults) const
{
    const bool isOldFormat = (getHeader().getVersion() == 0);
    if (!isOldFormat)
    {
        searchBenjTempl<DicEdge>(iWord, oWordList, iMaxResults);
        return;
    }
    searchBenjTempl<DicEdgeOld>(iWord, oWordList, iMaxResults);
}
/****************************************/
/****************************************/
@ -365,16 +321,15 @@ struct params_cross_t
};
template <typename DAWG_EDGE>
void Dictionary::searchCrossRecTempl(struct params_cross_t *params,
vector<wstring> &oWordList,
const DAWG_EDGE *edgeptr,
unsigned int iMaxResults) const
void Dictionary::searchCrossRec(struct params_cross_t *params,
vector<wstring> &oWordList,
const DicEdge *edgeptr,
unsigned int iMaxResults) const
{
if (iMaxResults && oWordList.size() >= iMaxResults)
return;
const DAWG_EDGE *current = getEdgeAt<DAWG_EDGE>(edgeptr->ptr);
const DicEdge *current = getEdgeAt(edgeptr->ptr);
if (params->mask[params->wordlen] == '\0')
{
@ -392,7 +347,7 @@ void Dictionary::searchCrossRecTempl(struct params_cross_t *params,
{
params->mask[params->wordlen] = getHeader().getCharFromCode(current->chr);
params->wordlen ++;
searchCrossRecTempl(params, oWordList, current, iMaxResults);
searchCrossRec(params, oWordList, current, iMaxResults);
params->wordlen --;
params->mask[params->wordlen] = '.';
}
@ -405,7 +360,7 @@ void Dictionary::searchCrossRecTempl(struct params_cross_t *params,
if (current->chr == getHeader().getCodeFromChar(params->mask[params->wordlen]))
{
params->wordlen ++;
searchCrossRecTempl(params, oWordList, current, iMaxResults);
searchCrossRec(params, oWordList, current, iMaxResults);
params->wordlen --;
break;
}
@ -440,16 +395,7 @@ void Dictionary::searchCross(const wstring &iMask, vector<wstring> &oWordList,
params.mask[i] = '\0';
params.wordlen = 0;
if (getHeader().getVersion() == 0)
{
searchCrossRecTempl(&params, oWordList,
getEdgeAt<DicEdgeOld>(getRoot()), iMaxResults);
}
else
{
searchCrossRecTempl(&params, oWordList,
getEdgeAt<DicEdge>(getRoot()), iMaxResults);
}
searchCrossRec(&params, oWordList, getEdgeAt(getRoot()), iMaxResults);
}
/****************************************/
@ -461,16 +407,15 @@ struct params_regexp_t
int maxlength;
Automaton *automaton_field;
wchar_t word[DIC_WORD_MAX];
int wordlen;
int wordlen;
};
template <typename DAWG_EDGE>
void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
int state,
const DAWG_EDGE *edgeptr,
vector<wstring> &oWordList,
unsigned int iMaxResults) const
void Dictionary::searchRegexpRec(struct params_regexp_t *params,
int state,
const DicEdge *edgeptr,
vector<wstring> &oWordList,
unsigned int iMaxResults) const
{
if (iMaxResults && oWordList.size() >= iMaxResults)
return;
@ -487,7 +432,7 @@ void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
}
}
/* we now drive the search by exploring the dictionary */
const DAWG_EDGE *current = getEdgeAt<DAWG_EDGE>(edgeptr->ptr);
const DicEdge *current = getEdgeAt(edgeptr->ptr);
do
{
/* the current letter is current->chr */
@ -498,7 +443,7 @@ void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
params->word[params->wordlen] =
getHeader().getCharFromCode(current->chr);
params->wordlen ++;
searchRegexpRecTempl(params, next_state, current, oWordList, iMaxResults);
searchRegexpRec(params, next_state, current, oWordList, iMaxResults);
params->wordlen --;
params->word[params->wordlen] = L'\0';
}
@ -599,19 +544,9 @@ bool Dictionary::searchRegExp(const wstring &iRegexp,
params.automaton_field = a;
memset(params.word, L'\0', sizeof(params.word));
params.wordlen = 0;
if (getHeader().getVersion() == 0)
{
searchRegexpRecTempl(&params, a->getInitId(),
getEdgeAt<DicEdgeOld>(getRoot()), oWordList,
iMaxResults ? iMaxResults + 1 : 0);
}
else
{
searchRegexpRecTempl(&params, a->getInitId(),
getEdgeAt<DicEdge>(getRoot()), oWordList,
iMaxResults ? iMaxResults + 1 : 0);
}
searchRegexpRec(&params, a->getInitId(),
getEdgeAt(getRoot()), oWordList,
iMaxResults ? iMaxResults + 1 : 0);
delete a;
}
delete root;

View file

@ -36,9 +36,6 @@
using namespace boost::spirit;
using namespace std;
// TODO:
// - error handling
// A few typedefs to simplify things
typedef const wchar_t *iterator_t;
typedef tree_match<iterator_t> parse_tree_match_t;

View file

@ -285,134 +285,78 @@ void Header::read(istream &iStream)
m_version = aHeader.version;
// Version 0 corresponds to the dictionary format in the first Eliot
// versions, supported until Eliot 1.8 (excluded).
// The new version (version 1) was introduced in Eliot 1.6.
if (m_version == 0)
{
throw DicException(_("Too old dictionary format. This format is not "
"supported anymore since Eliot 1.8. You can "
"create dictionaries in the new format with the "
"'compdic' tool provided with Eliot (since "
"version 1.6)."));
}
// Handle endianness
if (m_version == 0)
{
#if defined(WORDS_BIGENDIAN)
aHeader.root = swap4(aHeader.root);
aHeader.nwords = swap4(aHeader.nwords);
aHeader.nodesused = swap4(aHeader.nodesused);
aHeader.edgesused = swap4(aHeader.edgesused);
aHeader.nodessaved = swap4(aHeader.nodessaved);
aHeader.edgessaved = swap4(aHeader.edgessaved);
#endif
m_root = aHeader.root;
m_nbWords = aHeader.nwords;
m_nodesUsed = aHeader.nodesused;
m_edgesUsed = aHeader.edgesused;
m_nodesSaved = aHeader.nodessaved;
m_edgesSaved = aHeader.edgessaved;
}
m_root = ntohl(aHeader.root);
m_nbWords = ntohl(aHeader.nwords);
m_nodesUsed = ntohl(aHeader.nodesused);
m_edgesUsed = ntohl(aHeader.edgesused);
m_nodesSaved = ntohl(aHeader.nodessaved);
m_edgesSaved = ntohl(aHeader.edgessaved);
// After reading the old header, we now read the extension
Dict_header_ext aHeaderExt;
iStream.read((char*)&aHeaderExt, sizeof(Dict_header_ext));
if (iStream.gcount() != sizeof(Dict_header_ext))
throw DicException("Header::read: expected to read more bytes");
// Handle endianness in the extension
aHeaderExt.compressDate = ntohll(aHeaderExt.compressDate);
aHeaderExt.userHostSize = ntohl(aHeaderExt.userHostSize);
aHeaderExt.dicNameSize = ntohl(aHeaderExt.dicNameSize);
aHeaderExt.lettersSize = ntohl(aHeaderExt.lettersSize);
aHeaderExt.nbLetters = ntohl(aHeaderExt.nbLetters);
aHeaderExt.vowels = ntohll(aHeaderExt.vowels);
aHeaderExt.consonants = ntohll(aHeaderExt.consonants);
m_compressDate = aHeaderExt.compressDate;
if (aHeaderExt.algorithm == kDAWG)
m_type = kDAWG;
else if (aHeaderExt.algorithm == kGADDAG)
m_type = kGADDAG;
else
throw DicException("Header::read: unrecognized algorithm type");
m_userHost = readFromUTF8(aHeaderExt.userHost, aHeaderExt.userHostSize,
"user and host information");
// Convert the dictionary name from UTF-8 to wchar_t*
m_dicName = readFromUTF8(aHeaderExt.dicName, aHeaderExt.dicNameSize,
"dictionary name");
// Convert the dictionary letters from UTF-8 to wchar_t*
m_letters = readFromUTF8(aHeaderExt.letters, aHeaderExt.lettersSize,
"dictionary letters");
// Safety check: correct number of letters?
if (m_letters.size() != aHeaderExt.nbLetters)
{
m_root = ntohl(aHeader.root);
m_nbWords = ntohl(aHeader.nwords);
m_nodesUsed = ntohl(aHeader.nodesused);
m_edgesUsed = ntohl(aHeader.edgesused);
m_nodesSaved = ntohl(aHeader.nodessaved);
m_edgesSaved = ntohl(aHeader.edgessaved);
throw DicException("Header::read: inconsistent header");
}
if (m_version == 0)
// Letters points and frequency
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
m_compressDate = 0;
m_userHost = convertToWc(_("Unknown (old format)"));
m_dicName = convertToWc(_("Unknown (old format)"));
// In version 0, the letters, points, frequency,
// vowels and consonants were hard-coded...
m_letters = convertToWc("ABCDEFGHIJKLMNOPQRSTUVWXYZ?");
static const uint8_t Frenchpoints[] =
{
// A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ?
1,3,3,2, 1,4,2,4,1,8,10,1,2,1,1,3,8,1,1,1,1,4,10,10,10,10,0
};
static const uint8_t FrenchFrequency[] =
{
// A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ?
9,2,2,3,15,2,2,2,8,1, 1,5,3,6,6,2,1,6,6,6,6,2, 1, 1, 1, 1,2
};
// The jokers and the 'Y' can be considered both as vowels or consonants
static const uint8_t FrenchVowels[] =
{
// A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ?
1,0,0,0, 1,0,0,0,1,0, 0,0,0,0,1,0,0,0,0,0,1,0, 0, 0, 1, 0,1
};
static const uint8_t FrenchConsonants[] =
{
// A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ?
0,1,1,1, 0,1,1,1,0,1, 1,1,1,1,0,1,1,1,1,1,0,1, 1, 1, 1, 1,1
};
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
m_points.push_back(Frenchpoints[i]);
m_frequency.push_back(FrenchFrequency[i]);
m_vowels.push_back(FrenchVowels[i]);
m_consonants.push_back(FrenchConsonants[i]);
}
m_points.push_back(aHeaderExt.points[i]);
m_frequency.push_back(aHeaderExt.frequency[i]);
}
else
// Vowels and consonants
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
// This header doesn't use the old serialization format, so read the
// extension as well
Dict_header_ext aHeaderExt;
iStream.read((char*)&aHeaderExt, sizeof(Dict_header_ext));
if (iStream.gcount() != sizeof(Dict_header_ext))
throw DicException("Header::read: expected to read more bytes");
// Handle endianness in the extension
aHeaderExt.compressDate = ntohll(aHeaderExt.compressDate);
aHeaderExt.userHostSize = ntohl(aHeaderExt.userHostSize);
aHeaderExt.dicNameSize = ntohl(aHeaderExt.dicNameSize);
aHeaderExt.lettersSize = ntohl(aHeaderExt.lettersSize);
aHeaderExt.nbLetters = ntohl(aHeaderExt.nbLetters);
aHeaderExt.vowels = ntohll(aHeaderExt.vowels);
aHeaderExt.consonants = ntohll(aHeaderExt.consonants);
m_compressDate = aHeaderExt.compressDate;
if (aHeaderExt.algorithm == kDAWG)
m_type = kDAWG;
else if (aHeaderExt.algorithm == kGADDAG)
m_type = kGADDAG;
else
throw DicException("Header::read: unrecognized algorithm type");
m_userHost = readFromUTF8(aHeaderExt.userHost, aHeaderExt.userHostSize,
"user and host information");
// Convert the dictionary name from UTF-8 to wchar_t*
m_dicName = readFromUTF8(aHeaderExt.dicName, aHeaderExt.dicNameSize,
"dictionary name");
// Convert the dictionary letters from UTF-8 to wchar_t*
m_letters = readFromUTF8(aHeaderExt.letters, aHeaderExt.lettersSize,
"dictionary letters");
// Safety check: correct number of letters?
if (m_letters.size() != aHeaderExt.nbLetters)
{
throw DicException("Header::read: inconsistent header");
}
// Letters points and frequency
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
m_points.push_back(aHeaderExt.points[i]);
m_frequency.push_back(aHeaderExt.frequency[i]);
}
// Vowels and consonants
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
m_vowels.push_back(aHeaderExt.vowels & (1 << i));
m_consonants.push_back(aHeaderExt.consonants & (1 << i));
}
m_vowels.push_back(aHeaderExt.vowels & (1 << i));
m_consonants.push_back(aHeaderExt.consonants & (1 << i));
}
}
@ -434,81 +378,70 @@ void Header::write(ostream &oStream) const
if (!oStream.good())
throw DicException("Header::write: error when writing to file");
if (m_version != 0)
Dict_header_ext aHeaderExt;
aHeaderExt.compressDate = m_compressDate;
aHeaderExt.userHostSize =
writeInUTF8(m_userHost, aHeaderExt.userHost,
_MAX_USER_HOST_, "user and host information");
aHeaderExt.algorithm = m_type;
// Convert the dictionary name to UTF-8
aHeaderExt.dicNameSize =
writeInUTF8(m_dicName, aHeaderExt.dicName,
_MAX_DIC_NAME_SIZE_, "dictionary name");
// Convert the dictionary letters to UTF-8
aHeaderExt.lettersSize =
writeInUTF8(m_letters, aHeaderExt.letters,
_MAX_LETTERS_SIZE_, "dictionary letters");
aHeaderExt.nbLetters = (uint32_t)m_letters.size();
// Letters points and frequency
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
Dict_header_ext aHeaderExt;
aHeaderExt.compressDate = m_compressDate;
aHeaderExt.userHostSize =
writeInUTF8(m_userHost, aHeaderExt.userHost,
_MAX_USER_HOST_, "user and host information");
aHeaderExt.algorithm = m_type;
// Convert the dictionary name to UTF-8
aHeaderExt.dicNameSize =
writeInUTF8(m_dicName, aHeaderExt.dicName,
_MAX_DIC_NAME_SIZE_, "dictionary name");
// Convert the dictionary letters to UTF-8
aHeaderExt.lettersSize =
writeInUTF8(m_letters, aHeaderExt.letters,
_MAX_LETTERS_SIZE_, "dictionary letters");
aHeaderExt.nbLetters = (uint32_t)m_letters.size();
// Letters points and frequency
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
aHeaderExt.points[i] = m_points[i];
aHeaderExt.frequency[i] = m_frequency[i];
}
// Vowels and consonants
aHeaderExt.vowels = 0;
aHeaderExt.consonants = 0;
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
if (m_vowels[i])
aHeaderExt.vowels |= 1 << i;
if (m_consonants[i])
aHeaderExt.consonants |= 1 << i;
}
// Handle endianness in the extension
aHeaderExt.userHostSize = htonl(aHeaderExt.userHostSize);
aHeaderExt.compressDate = htonll(aHeaderExt.compressDate);
aHeaderExt.dicNameSize = htonl(aHeaderExt.dicNameSize);
aHeaderExt.lettersSize = htonl(aHeaderExt.lettersSize);
aHeaderExt.nbLetters = htonl(aHeaderExt.nbLetters);
aHeaderExt.vowels = htonll(aHeaderExt.vowels);
aHeaderExt.consonants = htonll(aHeaderExt.consonants);
// Write the extension
oStream.write((char*)&aHeaderExt, sizeof(Dict_header_ext));
if (!oStream.good())
throw DicException("Header::write: error when writing to file");
aHeaderExt.points[i] = m_points[i];
aHeaderExt.frequency[i] = m_frequency[i];
}
// Vowels and consonants
aHeaderExt.vowels = 0;
aHeaderExt.consonants = 0;
for (unsigned int i = 0; i < m_letters.size(); ++i)
{
if (m_vowels[i])
aHeaderExt.vowels |= 1 << i;
if (m_consonants[i])
aHeaderExt.consonants |= 1 << i;
}
// Handle endianness in the extension
aHeaderExt.userHostSize = htonl(aHeaderExt.userHostSize);
aHeaderExt.compressDate = htonll(aHeaderExt.compressDate);
aHeaderExt.dicNameSize = htonl(aHeaderExt.dicNameSize);
aHeaderExt.lettersSize = htonl(aHeaderExt.lettersSize);
aHeaderExt.nbLetters = htonl(aHeaderExt.nbLetters);
aHeaderExt.vowels = htonll(aHeaderExt.vowels);
aHeaderExt.consonants = htonll(aHeaderExt.consonants);
// Write the extension
oStream.write((char*)&aHeaderExt, sizeof(Dict_header_ext));
if (!oStream.good())
throw DicException("Header::write: error when writing to file");
}
void Header::print() const
{
printf(_("dictionary name: %s\n"), convertToMb(m_dicName).c_str());
if (m_version)
{
char buf[50];
strftime(buf, sizeof(buf), "%c", gmtime(&m_compressDate));
printf(_("compressed on: %s\n"), buf);
}
else
{
printf(_("compressed on: Unknown date (old format)\n"));
}
char buf[50];
strftime(buf, sizeof(buf), "%c", gmtime(&m_compressDate));
printf(_("compressed on: %s\n"), buf);
printf(_("compressed using a binary compiled by: %s\n"), convertToMb(m_userHost).c_str());
printf(_("dictionary type: %s\n"), m_type == kDAWG ? "DAWG" : "GADDAG");
printf(_("letters: %s\n"), convertToMb(m_letters).c_str());
printf(_("number of letters: %lu\n"), (long unsigned int)m_letters.size());
printf(_("number of words: %d\n"), m_nbWords);
long unsigned int size =
sizeof(Dict_header_old) + (m_version ? sizeof(Dict_header_ext) : 0);
long unsigned int size = sizeof(Dict_header_old) + sizeof(Dict_header_ext);
printf(_("header size: %lu bytes\n"), size);
printf(_("root: %d (edge)\n"), m_root);
printf(_("nodes: %d used + %d saved\n"), m_nodesUsed, m_nodesSaved);

View file

@ -46,8 +46,7 @@
using namespace std;
template <typename DAWG_EDGE>
static void print_dic_rec(ostream &out, const Dictionary &iDic, wchar_t *buf, wchar_t *s, DAWG_EDGE i)
static void print_dic_rec(ostream &out, const Dictionary &iDic, wchar_t *buf, wchar_t *s, DicEdge i)
{
if (i.term) /* edge points at a complete word */
{
@ -56,7 +55,7 @@ static void print_dic_rec(ostream &out, const Dictionary &iDic, wchar_t *buf, wc
}
if (i.ptr)
{ /* Compute index: is it non-zero ? */
const DAWG_EDGE *p = reinterpret_cast<const DAWG_EDGE*>(iDic.getEdgeAt(i.ptr));
const DicEdge *p = iDic.getEdgeAt(i.ptr);
do
{ /* for each edge out of this node */
*s = iDic.getHeader().getCharFromCode(p->chr);
@ -67,24 +66,22 @@ static void print_dic_rec(ostream &out, const Dictionary &iDic, wchar_t *buf, wc
}
template <typename DAWG_EDGE>
void print_dic_list(const Dictionary &iDic)
{
static wchar_t buf[80];
print_dic_rec(cout, iDic, buf, buf, *reinterpret_cast<const DAWG_EDGE*>(iDic.getEdgeAt(iDic.getRoot())));
print_dic_rec(cout, iDic, buf, buf, *iDic.getEdgeAt(iDic.getRoot()));
}
template <typename DAWG_EDGE>
static void print_node_hex(const Dictionary &dic, int i)
{
union edge_t
{
DAWG_EDGE e;
DicEdge e;
uint32_t s;
} ee;
ee.e = *reinterpret_cast<const DAWG_EDGE*>(dic.getEdgeAt(i));
ee.e = *reinterpret_cast<const DicEdge*>(dic.getEdgeAt(i));
printf("0x%04lx %08x |%4d ptr=%8d t=%d l=%d chr=%2d (%c)\n",
(unsigned long)i*sizeof(ee), (unsigned int)(ee.s),
@ -92,13 +89,12 @@ static void print_node_hex(const Dictionary &dic, int i)
}
template <typename DAWG_EDGE>
void print_dic_hex(const Dictionary &iDic)
{
printf(_("offset binary | structure\n"));
printf("------ -------- | --------------------\n");
for (unsigned int i = 0; i < (iDic.getHeader().getNbEdgesUsed() + 1); i++)
print_node_hex<DAWG_EDGE>(iDic, i);
print_node_hex(iDic, i);
}
@ -172,17 +168,11 @@ int main(int argc, char *argv[])
}
if (option_print_dic_hex || option_print_all)
{
if (dic.getHeader().getVersion() == 0)
print_dic_hex<DicEdgeOld>(dic);
else
print_dic_hex<DicEdge>(dic);
print_dic_hex(dic);
}
if (option_print_dic_list || option_print_all)
{
if (dic.getHeader().getVersion() == 0)
print_dic_list<DicEdgeOld>(dic);
else
print_dic_list<DicEdge>(dic);
print_dic_list(dic);
}
return 0;
}

View file

@ -230,21 +230,9 @@ void MainWindow::updateStatusBar(const Dictionary *iDic)
if (iDic == NULL)
m_dicNameLabel->setText("No dictionary");
else {
if (iDic->getHeader().getVersion() != 0)
{
QString dicName = qfw(m_dic->getHeader().getName());
m_dicNameLabel->setText(_q("Dictionary: %1").arg(dicName));
m_dicNameLabel->setToolTip("");
}
else
{
m_dicNameLabel->setText(_q("Dictionary: Unknown (old format)"));
QString warning = _q("The dictionary name cannot be "
"retrieved, because you are using an old dictionary format.\n"
"You can probably download a newer version of the dictionary "
"on http://www.nongnu.org/eliot/");
m_dicNameLabel->setToolTip(warning);
}
QString dicName = qfw(m_dic->getHeader().getName());
m_dicNameLabel->setText(_q("Dictionary: %1").arg(dicName));
m_dicNameLabel->setToolTip("");
}
}