mirror of
git://git.savannah.nongnu.org/eliot.git
synced 2024-12-27 09:58:08 +01:00
Sort the word list before processing it. This allows much better compression, and avoids problems when a word is a prefix of the word just before it.
Also, the word list is not a raw buffer anymore, which makes it a bit easier to understand the algorithm.
This commit is contained in:
parent
32a252c96b
commit
01297ebdb4
2 changed files with 94 additions and 109 deletions
177
dic/compdic.cpp
177
dic/compdic.cpp
|
@ -125,7 +125,7 @@ void CompDic::addLetter(wchar_t chr, int points, int frequency,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDicSize)
|
void CompDic::loadWordList(const string &iFileName, vector<wstring> &oWordList)
|
||||||
{
|
{
|
||||||
ifstream file(iFileName.c_str(), ios::in | ios::binary);
|
ifstream file(iFileName.c_str(), ios::in | ios::binary);
|
||||||
if (!file.is_open())
|
if (!file.is_open())
|
||||||
|
@ -135,42 +135,28 @@ const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDi
|
||||||
struct stat stat_buf;
|
struct stat stat_buf;
|
||||||
if (stat(iFileName.c_str(), &stat_buf) < 0)
|
if (stat(iFileName.c_str(), &stat_buf) < 0)
|
||||||
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
||||||
oDicSize = (unsigned int)stat_buf.st_size;
|
int dicSize = (unsigned int)stat_buf.st_size;
|
||||||
|
|
||||||
// Place the buffer in a vector to avoid worrying about memory handling
|
// Reserve some space (heuristic: the average length of words is 11)
|
||||||
vector<char> buffer(oDicSize);
|
oWordList.reserve(dicSize / 11);
|
||||||
// Load the file data, everything in one shot
|
|
||||||
file.read(&buffer.front(), oDicSize);
|
|
||||||
file.close();
|
|
||||||
|
|
||||||
// If there is a BOM in the file, use an offset to start reading after it
|
string line;
|
||||||
size_t bomOffset = 0;
|
while (getline(file, line))
|
||||||
if ((uint8_t)buffer[0] == 0xEF &&
|
|
||||||
(uint8_t)buffer[1] == 0xBB &&
|
|
||||||
(uint8_t)buffer[2] == 0xBF)
|
|
||||||
{
|
{
|
||||||
bomOffset = 3;
|
// If there is a BOM in the file, remove it from the first word
|
||||||
|
if (oWordList.empty() && line.size() >= 3 &&
|
||||||
|
(uint8_t)line[0] == 0xEF &&
|
||||||
|
(uint8_t)line[1] == 0xBB &&
|
||||||
|
(uint8_t)line[2] == 0xBF)
|
||||||
|
{
|
||||||
|
line = line.substr(3);
|
||||||
|
}
|
||||||
|
oWordList.push_back(readFromUTF8(line.data(),
|
||||||
|
line.size(), "loadWordList"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer for the wide characters (it will use at most as many characters
|
// Sort the word list, to perform a better compression
|
||||||
// as the utf-8 version)
|
sort(oWordList.begin(), oWordList.end());
|
||||||
wchar_t *wideBuf = new wchar_t[oDicSize];
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
unsigned int number = readFromUTF8(wideBuf, oDicSize,
|
|
||||||
(&buffer.front()) + bomOffset,
|
|
||||||
oDicSize - bomOffset,
|
|
||||||
"loadWordList");
|
|
||||||
oDicSize = number;
|
|
||||||
return wideBuf;
|
|
||||||
}
|
|
||||||
catch (...)
|
|
||||||
{
|
|
||||||
// Avoid leaks, and propagate the exception
|
|
||||||
delete[] wideBuf;
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -229,8 +215,11 @@ class IncDec
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
|
unsigned int CompDic::makeNode(ostream &outFile, const Header &iHeader,
|
||||||
const Header &iHeader)
|
vector<wstring>::const_iterator &itCurrWord,
|
||||||
|
const vector<wstring>::const_iterator &itLastWord,
|
||||||
|
wstring::const_iterator &itPosInWord,
|
||||||
|
const wchar_t *iPrefix)
|
||||||
{
|
{
|
||||||
#ifdef CHECK_RECURSION
|
#ifdef CHECK_RECURSION
|
||||||
IncDec inc(m_currentRec);
|
IncDec inc(m_currentRec);
|
||||||
|
@ -256,7 +245,9 @@ unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
|
||||||
newEdge.last = 0;
|
newEdge.last = 0;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++);
|
newEdge.chr = iHeader.getCodeFromChar(*m_endString = *itPosInWord);
|
||||||
|
++m_endString;
|
||||||
|
++itPosInWord;
|
||||||
}
|
}
|
||||||
catch (DicException &e)
|
catch (DicException &e)
|
||||||
{
|
{
|
||||||
|
@ -271,32 +262,31 @@ unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
|
||||||
edges.push_back(newEdge);
|
edges.push_back(newEdge);
|
||||||
|
|
||||||
// End of a word?
|
// End of a word?
|
||||||
if (*m_input == L'\n' || *m_input == L'\r')
|
if (itPosInWord == itCurrWord->end())
|
||||||
{
|
{
|
||||||
m_headerInfo.nwords++;
|
m_headerInfo.nwords++;
|
||||||
*m_endString = L'\0';
|
*m_endString = L'\0';
|
||||||
// Mark edge as word
|
// Mark edge as word
|
||||||
edges.back().term = 1;
|
edges.back().term = 1;
|
||||||
|
|
||||||
// Skip \r and/or \n
|
// Next word
|
||||||
while (m_input != m_endOfInput &&
|
++itCurrWord;
|
||||||
(*m_input == L'\n' || *m_input == L'\r'))
|
|
||||||
{
|
|
||||||
++m_input;
|
|
||||||
}
|
|
||||||
// At the end of input?
|
// At the end of input?
|
||||||
if (m_input == m_endOfInput)
|
if (itCurrWord == itLastWord)
|
||||||
break;
|
break;
|
||||||
|
itPosInWord = itCurrWord->begin();
|
||||||
|
|
||||||
m_endString = m_stringBuf;
|
m_endString = m_stringBuf;
|
||||||
while (*m_endString == *m_input)
|
// This assumes that a word cannot be a prefix of the previous one
|
||||||
|
while (*m_endString == *itPosInWord)
|
||||||
{
|
{
|
||||||
m_endString++;
|
++m_endString;
|
||||||
m_input++;
|
++itPosInWord;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Make dawg pointed to by this edge
|
// Make dawg pointed to by this edge
|
||||||
edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader);
|
edges.back().ptr = makeNode(outFile, iHeader, itCurrWord, itLastWord,
|
||||||
|
itPosInWord, iPrefix + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int numedges = edges.size();
|
int numedges = edges.size();
|
||||||
|
@ -348,56 +338,51 @@ Header CompDic::generateDawg(const string &iWordListFile,
|
||||||
throw DicException(oss.str());
|
throw DicException(oss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const wchar_t *wordList = NULL;
|
const clock_t startLoadTime = clock();
|
||||||
try
|
vector<wstring> wordList;
|
||||||
|
loadWordList(iWordListFile, wordList);
|
||||||
|
const clock_t endLoadTime = clock();
|
||||||
|
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
|
||||||
|
|
||||||
|
if (wordList.empty())
|
||||||
{
|
{
|
||||||
const clock_t startLoadTime = clock();
|
throw DicException(_("The word list is empty!"));
|
||||||
unsigned int dicSize;
|
|
||||||
wordList = loadWordList(iWordListFile, dicSize);
|
|
||||||
const clock_t endLoadTime = clock();
|
|
||||||
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
|
|
||||||
|
|
||||||
m_input = wordList;
|
|
||||||
m_endOfInput = m_input + dicSize;
|
|
||||||
|
|
||||||
// Write the header a first time, to reserve the space in the file
|
|
||||||
Header tempHeader = writeHeader(outFile);
|
|
||||||
|
|
||||||
DicEdge specialNode = {0, 0, 0, 0};
|
|
||||||
specialNode.last = 1;
|
|
||||||
// Temporary variable to avoid a warning when compiling with -O2
|
|
||||||
// (there is no warning with -O0... g++ bug?)
|
|
||||||
DicEdge *tmpPtr = &specialNode;
|
|
||||||
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
|
||||||
|
|
||||||
// Call makeNode with null (relative to stringbuf) prefix;
|
|
||||||
// Initialize string to null; Put index of start node on output
|
|
||||||
DicEdge rootNode = {0, 0, 0, 0};
|
|
||||||
m_endString = m_stringBuf;
|
|
||||||
const clock_t startBuildTime = clock();
|
|
||||||
rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
|
|
||||||
// Reuse the temporary variable
|
|
||||||
tmpPtr = &rootNode;
|
|
||||||
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
|
||||||
const clock_t endBuildTime = clock();
|
|
||||||
m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
|
|
||||||
|
|
||||||
// Write the header again, now that it is complete
|
|
||||||
m_headerInfo.root = m_headerInfo.edgesused;
|
|
||||||
const Header finalHeader = writeHeader(outFile);
|
|
||||||
|
|
||||||
// Clean up
|
|
||||||
delete[] wordList;
|
|
||||||
outFile.close();
|
|
||||||
|
|
||||||
return finalHeader;
|
|
||||||
}
|
|
||||||
catch (std::exception &e)
|
|
||||||
{
|
|
||||||
// Avoid memory leaks
|
|
||||||
if (wordList != NULL)
|
|
||||||
delete[] wordList;
|
|
||||||
throw;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write the header a first time, to reserve the space in the file
|
||||||
|
Header tempHeader = writeHeader(outFile);
|
||||||
|
|
||||||
|
DicEdge specialNode = {0, 0, 0, 0};
|
||||||
|
specialNode.last = 1;
|
||||||
|
// Temporary variable to avoid a warning when compiling with -O2
|
||||||
|
// (there is no warning with -O0... g++ bug?)
|
||||||
|
DicEdge *tmpPtr = &specialNode;
|
||||||
|
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
||||||
|
|
||||||
|
vector<wstring>::const_iterator firstWord = wordList.begin();
|
||||||
|
wstring::const_iterator initialPos = firstWord->begin();
|
||||||
|
|
||||||
|
// Call makeNode with null (relative to stringbuf) prefix;
|
||||||
|
// Initialize string to null; Put index of start node on output
|
||||||
|
DicEdge rootNode = {0, 0, 0, 0};
|
||||||
|
m_endString = m_stringBuf;
|
||||||
|
const clock_t startBuildTime = clock();
|
||||||
|
rootNode.ptr = makeNode(outFile, tempHeader,
|
||||||
|
firstWord, wordList.end(),
|
||||||
|
initialPos, m_endString);
|
||||||
|
// Reuse the temporary variable
|
||||||
|
tmpPtr = &rootNode;
|
||||||
|
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
||||||
|
const clock_t endBuildTime = clock();
|
||||||
|
m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
|
||||||
|
|
||||||
|
// Write the header again, now that it is complete
|
||||||
|
m_headerInfo.root = m_headerInfo.edgesused;
|
||||||
|
const Header finalHeader = writeHeader(outFile);
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
outFile.close();
|
||||||
|
|
||||||
|
return finalHeader;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -94,10 +94,6 @@ private:
|
||||||
wchar_t m_stringBuf[MAX_STRING_LENGTH];
|
wchar_t m_stringBuf[MAX_STRING_LENGTH];
|
||||||
/// Point to the end of the string
|
/// Point to the end of the string
|
||||||
wchar_t* m_endString;
|
wchar_t* m_endString;
|
||||||
/// Current position in the word list
|
|
||||||
const wchar_t *m_input;
|
|
||||||
/// Mark the end of the input
|
|
||||||
const wchar_t *m_endOfInput;
|
|
||||||
#ifdef CHECK_RECURSION
|
#ifdef CHECK_RECURSION
|
||||||
map<int, vector<DicEdge> > m_mapForDepth;
|
map<int, vector<DicEdge> > m_mapForDepth;
|
||||||
int m_currentRec;
|
int m_currentRec;
|
||||||
|
@ -110,15 +106,12 @@ private:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read the word list stored in iFileName, convert it to wide chars,
|
* Read the word list stored in iFileName, convert it to wide chars,
|
||||||
* and return it. The oDicSize parameter contains the size of the
|
* and return it (in the oWordList argument).
|
||||||
* returned array.
|
|
||||||
* In case of problem, an exception is thrown.
|
* In case of problem, an exception is thrown.
|
||||||
* @param iFileName: Name (and path) of the file containing the word list.
|
* @param iFileName: Name (and path) of the file containing the word list.
|
||||||
* @param oDicSize: Size of the returned array
|
* @param oWordList: Word list
|
||||||
* @return Word list as a wchar_t array
|
|
||||||
*/
|
*/
|
||||||
const wchar_t * loadWordList(const string &iFileName,
|
void loadWordList(const string &iFileName, vector<wstring> &oWordList);
|
||||||
unsigned int &oDicSize);
|
|
||||||
|
|
||||||
Header writeHeader(ostream &outFile) const;
|
Header writeHeader(ostream &outFile) const;
|
||||||
|
|
||||||
|
@ -137,13 +130,20 @@ private:
|
||||||
* the words beginning with that prefix. String is a pointer (relative
|
* the words beginning with that prefix. String is a pointer (relative
|
||||||
* to m_stringBuf) indicating how much of iPrefix is matched in the
|
* to m_stringBuf) indicating how much of iPrefix is matched in the
|
||||||
* input.
|
* input.
|
||||||
* @param iPrefix: prefix to work on
|
|
||||||
* @param outfile: stream where to write the nodes
|
* @param outfile: stream where to write the nodes
|
||||||
* @param iHeader: temporary header, used only to do the conversion between
|
* @param iHeader: temporary header, used only to do the conversion between
|
||||||
* the (wide) chars and their corresponding internal code
|
* the (wide) chars and their corresponding internal code
|
||||||
|
* @param itCurrWord: iterator on the word list
|
||||||
|
* @param itLastWord: end of the word list
|
||||||
|
* @param itPosInWord: iterator on the letters of the current word
|
||||||
|
* @param iPrefix: prefix to work on
|
||||||
|
* @return the index of a DAWG matching all the words with prefix iPrefix
|
||||||
*/
|
*/
|
||||||
unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile,
|
unsigned int makeNode(ostream &outFile, const Header &iHeader,
|
||||||
const Header &iHeader);
|
vector<wstring>::const_iterator &itCurrWord,
|
||||||
|
const vector<wstring>::const_iterator &itLastWord,
|
||||||
|
wstring::const_iterator &itPosInWord,
|
||||||
|
const wchar_t *iPrefix);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue