Sort the word list before processing it. This allows much better compression, and avoids problems when a word is a prefix of the word just before it.

Also, the word list is not a raw buffer anymore, which makes it a bit easier to understand the algorithm.
This commit is contained in:
Olivier Teulière 2010-05-16 10:06:01 +00:00
parent 32a252c96b
commit 01297ebdb4
2 changed files with 94 additions and 109 deletions

View file

@ -125,7 +125,7 @@ void CompDic::addLetter(wchar_t chr, int points, int frequency,
} }
const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDicSize) void CompDic::loadWordList(const string &iFileName, vector<wstring> &oWordList)
{ {
ifstream file(iFileName.c_str(), ios::in | ios::binary); ifstream file(iFileName.c_str(), ios::in | ios::binary);
if (!file.is_open()) if (!file.is_open())
@ -135,42 +135,28 @@ const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDi
struct stat stat_buf; struct stat stat_buf;
if (stat(iFileName.c_str(), &stat_buf) < 0) if (stat(iFileName.c_str(), &stat_buf) < 0)
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
oDicSize = (unsigned int)stat_buf.st_size; int dicSize = (unsigned int)stat_buf.st_size;
// Place the buffer in a vector to avoid worrying about memory handling // Reserve some space (heuristic: the average length of words is 11)
vector<char> buffer(oDicSize); oWordList.reserve(dicSize / 11);
// Load the file data, everything in one shot
file.read(&buffer.front(), oDicSize);
file.close();
// If there is a BOM in the file, use an offset to start reading after it string line;
size_t bomOffset = 0; while (getline(file, line))
if ((uint8_t)buffer[0] == 0xEF &&
(uint8_t)buffer[1] == 0xBB &&
(uint8_t)buffer[2] == 0xBF)
{ {
bomOffset = 3; // If there is a BOM in the file, remove it from the first word
if (oWordList.empty() && line.size() >= 3 &&
(uint8_t)line[0] == 0xEF &&
(uint8_t)line[1] == 0xBB &&
(uint8_t)line[2] == 0xBF)
{
line = line.substr(3);
}
oWordList.push_back(readFromUTF8(line.data(),
line.size(), "loadWordList"));
} }
// Buffer for the wide characters (it will use at most as many characters // Sort the word list, to perform a better compression
// as the utf-8 version) sort(oWordList.begin(), oWordList.end());
wchar_t *wideBuf = new wchar_t[oDicSize];
try
{
unsigned int number = readFromUTF8(wideBuf, oDicSize,
(&buffer.front()) + bomOffset,
oDicSize - bomOffset,
"loadWordList");
oDicSize = number;
return wideBuf;
}
catch (...)
{
// Avoid leaks, and propagate the exception
delete[] wideBuf;
throw;
}
} }
@ -229,8 +215,11 @@ class IncDec
#endif #endif
unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile, unsigned int CompDic::makeNode(ostream &outFile, const Header &iHeader,
const Header &iHeader) vector<wstring>::const_iterator &itCurrWord,
const vector<wstring>::const_iterator &itLastWord,
wstring::const_iterator &itPosInWord,
const wchar_t *iPrefix)
{ {
#ifdef CHECK_RECURSION #ifdef CHECK_RECURSION
IncDec inc(m_currentRec); IncDec inc(m_currentRec);
@ -256,7 +245,9 @@ unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
newEdge.last = 0; newEdge.last = 0;
try try
{ {
newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++); newEdge.chr = iHeader.getCodeFromChar(*m_endString = *itPosInWord);
++m_endString;
++itPosInWord;
} }
catch (DicException &e) catch (DicException &e)
{ {
@ -271,32 +262,31 @@ unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
edges.push_back(newEdge); edges.push_back(newEdge);
// End of a word? // End of a word?
if (*m_input == L'\n' || *m_input == L'\r') if (itPosInWord == itCurrWord->end())
{ {
m_headerInfo.nwords++; m_headerInfo.nwords++;
*m_endString = L'\0'; *m_endString = L'\0';
// Mark edge as word // Mark edge as word
edges.back().term = 1; edges.back().term = 1;
// Skip \r and/or \n // Next word
while (m_input != m_endOfInput && ++itCurrWord;
(*m_input == L'\n' || *m_input == L'\r'))
{
++m_input;
}
// At the end of input? // At the end of input?
if (m_input == m_endOfInput) if (itCurrWord == itLastWord)
break; break;
itPosInWord = itCurrWord->begin();
m_endString = m_stringBuf; m_endString = m_stringBuf;
while (*m_endString == *m_input) // This assumes that a word cannot be a prefix of the previous one
while (*m_endString == *itPosInWord)
{ {
m_endString++; ++m_endString;
m_input++; ++itPosInWord;
} }
} }
// Make dawg pointed to by this edge // Make dawg pointed to by this edge
edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader); edges.back().ptr = makeNode(outFile, iHeader, itCurrWord, itLastWord,
itPosInWord, iPrefix + 1);
} }
int numedges = edges.size(); int numedges = edges.size();
@ -348,56 +338,51 @@ Header CompDic::generateDawg(const string &iWordListFile,
throw DicException(oss.str()); throw DicException(oss.str());
} }
const wchar_t *wordList = NULL; const clock_t startLoadTime = clock();
try vector<wstring> wordList;
loadWordList(iWordListFile, wordList);
const clock_t endLoadTime = clock();
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
if (wordList.empty())
{ {
const clock_t startLoadTime = clock(); throw DicException(_("The word list is empty!"));
unsigned int dicSize;
wordList = loadWordList(iWordListFile, dicSize);
const clock_t endLoadTime = clock();
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
m_input = wordList;
m_endOfInput = m_input + dicSize;
// Write the header a first time, to reserve the space in the file
Header tempHeader = writeHeader(outFile);
DicEdge specialNode = {0, 0, 0, 0};
specialNode.last = 1;
// Temporary variable to avoid a warning when compiling with -O2
// (there is no warning with -O0... g++ bug?)
DicEdge *tmpPtr = &specialNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
// Call makeNode with null (relative to stringbuf) prefix;
// Initialize string to null; Put index of start node on output
DicEdge rootNode = {0, 0, 0, 0};
m_endString = m_stringBuf;
const clock_t startBuildTime = clock();
rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
// Reuse the temporary variable
tmpPtr = &rootNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
const clock_t endBuildTime = clock();
m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
// Write the header again, now that it is complete
m_headerInfo.root = m_headerInfo.edgesused;
const Header finalHeader = writeHeader(outFile);
// Clean up
delete[] wordList;
outFile.close();
return finalHeader;
}
catch (std::exception &e)
{
// Avoid memory leaks
if (wordList != NULL)
delete[] wordList;
throw;
} }
// Write the header a first time, to reserve the space in the file
Header tempHeader = writeHeader(outFile);
DicEdge specialNode = {0, 0, 0, 0};
specialNode.last = 1;
// Temporary variable to avoid a warning when compiling with -O2
// (there is no warning with -O0... g++ bug?)
DicEdge *tmpPtr = &specialNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
vector<wstring>::const_iterator firstWord = wordList.begin();
wstring::const_iterator initialPos = firstWord->begin();
// Call makeNode with null (relative to stringbuf) prefix;
// Initialize string to null; Put index of start node on output
DicEdge rootNode = {0, 0, 0, 0};
m_endString = m_stringBuf;
const clock_t startBuildTime = clock();
rootNode.ptr = makeNode(outFile, tempHeader,
firstWord, wordList.end(),
initialPos, m_endString);
// Reuse the temporary variable
tmpPtr = &rootNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
const clock_t endBuildTime = clock();
m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
// Write the header again, now that it is complete
m_headerInfo.root = m_headerInfo.edgesused;
const Header finalHeader = writeHeader(outFile);
// Clean up
outFile.close();
return finalHeader;
} }

View file

@ -94,10 +94,6 @@ private:
wchar_t m_stringBuf[MAX_STRING_LENGTH]; wchar_t m_stringBuf[MAX_STRING_LENGTH];
/// Point to the end of the string /// Point to the end of the string
wchar_t* m_endString; wchar_t* m_endString;
/// Current position in the word list
const wchar_t *m_input;
/// Mark the end of the input
const wchar_t *m_endOfInput;
#ifdef CHECK_RECURSION #ifdef CHECK_RECURSION
map<int, vector<DicEdge> > m_mapForDepth; map<int, vector<DicEdge> > m_mapForDepth;
int m_currentRec; int m_currentRec;
@ -110,15 +106,12 @@ private:
/** /**
* Read the word list stored in iFileName, convert it to wide chars, * Read the word list stored in iFileName, convert it to wide chars,
* and return it. The oDicSize parameter contains the size of the * and return it (in the oWordList argument).
* returned array.
* In case of problem, an exception is thrown. * In case of problem, an exception is thrown.
* @param iFileName: Name (and path) of the file containing the word list. * @param iFileName: Name (and path) of the file containing the word list.
* @param oDicSize: Size of the returned array * @param oWordList: Word list
* @return Word list as a wchar_t array
*/ */
const wchar_t * loadWordList(const string &iFileName, void loadWordList(const string &iFileName, vector<wstring> &oWordList);
unsigned int &oDicSize);
Header writeHeader(ostream &outFile) const; Header writeHeader(ostream &outFile) const;
@ -137,13 +130,20 @@ private:
* the words beginning with that prefix. String is a pointer (relative * the words beginning with that prefix. String is a pointer (relative
* to m_stringBuf) indicating how much of iPrefix is matched in the * to m_stringBuf) indicating how much of iPrefix is matched in the
* input. * input.
* @param iPrefix: prefix to work on
* @param outfile: stream where to write the nodes * @param outfile: stream where to write the nodes
* @param iHeader: temporary header, used only to do the conversion between * @param iHeader: temporary header, used only to do the conversion between
* the (wide) chars and their corresponding internal code * the (wide) chars and their corresponding internal code
* @param itCurrWord: iterator on the word list
* @param itLastWord: end of the word list
* @param itPosInWord: iterator on the letters of the current word
* @param iPrefix: prefix to work on
* @return the index of a DAWG matching all the words with prefix iPrefix
*/ */
unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile, unsigned int makeNode(ostream &outFile, const Header &iHeader,
const Header &iHeader); vector<wstring>::const_iterator &itCurrWord,
const vector<wstring>::const_iterator &itLastWord,
wstring::const_iterator &itPosInWord,
const wchar_t *iPrefix);
}; };