mirror of
git://git.savannah.nongnu.org/eliot.git
synced 2024-12-28 09:58:15 +01:00
The dictionary creation is now encapsulated into a dedicated class, to allow reusing it easily
This commit is contained in:
parent
7cf6e90303
commit
9d5c9fe1d0
4 changed files with 579 additions and 397 deletions
|
@ -32,7 +32,8 @@ libdic_a_SOURCES = \
|
||||||
encoding.cpp encoding.h \
|
encoding.cpp encoding.h \
|
||||||
automaton.cpp automaton.h \
|
automaton.cpp automaton.h \
|
||||||
regexp.cpp regexp.h \
|
regexp.cpp regexp.h \
|
||||||
grammar.cpp grammar.h
|
grammar.cpp grammar.h \
|
||||||
|
compdic.cpp compdic.h
|
||||||
|
|
||||||
#####################################
|
#####################################
|
||||||
if BUILD_DICTOOLS
|
if BUILD_DICTOOLS
|
||||||
|
@ -42,7 +43,7 @@ bin_PROGRAMS = \
|
||||||
listdic \
|
listdic \
|
||||||
regexp
|
regexp
|
||||||
|
|
||||||
compdic_SOURCES=compdic.cpp
|
compdic_SOURCES=compdicmain.cpp
|
||||||
compdic_CPPFLAGS=$(AM_CPPFLAGS) @BOOST_CPPFLAGS@
|
compdic_CPPFLAGS=$(AM_CPPFLAGS) @BOOST_CPPFLAGS@
|
||||||
compdic_LDADD=libdic.a @LIBINTL@
|
compdic_LDADD=libdic.a @LIBINTL@
|
||||||
|
|
||||||
|
|
560
dic/compdic.cpp
560
dic/compdic.cpp
|
@ -24,14 +24,10 @@
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <boost/format.hpp>
|
#include <boost/format.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <boost/tokenizer.hpp>
|
|
||||||
#include <boost/unordered_map.hpp>
|
|
||||||
#include <boost/functional/hash.hpp>
|
#include <boost/functional/hash.hpp>
|
||||||
#include <getopt.h>
|
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
@ -42,6 +38,10 @@
|
||||||
#include <cerrno>
|
#include <cerrno>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "compdic.h"
|
||||||
|
#include "encoding.h"
|
||||||
|
#include "dic_exception.h"
|
||||||
|
|
||||||
// For htonl & Co.
|
// For htonl & Co.
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
# include <winsock2.h>
|
# include <winsock2.h>
|
||||||
|
@ -60,42 +60,87 @@
|
||||||
#else
|
#else
|
||||||
# define _(String) String
|
# define _(String) String
|
||||||
#endif
|
#endif
|
||||||
#ifdef WIN32
|
|
||||||
# include <windows.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "encoding.h"
|
|
||||||
#include "header.h"
|
|
||||||
#include "dic_internals.h"
|
|
||||||
#include "dic_exception.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
// Useful shortcut
|
// Useful shortcut
|
||||||
#define fmt(a) boost::format(a)
|
#define fmt(a) boost::format(a)
|
||||||
|
|
||||||
//#define DEBUG_OUTPUT
|
|
||||||
#define CHECK_RECURSION
|
|
||||||
|
|
||||||
|
CompDic::CompDic()
|
||||||
unsigned int getFileSize(const string &iFileName)
|
: m_currentRec(0), m_maxRec(0), m_loadTime(0), m_buildTime(0)
|
||||||
{
|
{
|
||||||
struct stat stat_buf;
|
m_headerInfo.root = 0;
|
||||||
if (stat(iFileName.c_str(), &stat_buf) < 0)
|
m_headerInfo.nwords = 0;
|
||||||
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
m_headerInfo.nodesused = 1;
|
||||||
return (unsigned int)stat_buf.st_size;
|
m_headerInfo.edgesused = 1;
|
||||||
|
m_headerInfo.nodessaved = 0;
|
||||||
|
m_headerInfo.edgessaved = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
|
|
||||||
|
void CompDic::addLetter(wchar_t chr, int points, int frequency,
|
||||||
|
bool isVowel, bool isConsonant,
|
||||||
|
const vector<wstring> &iInputs)
|
||||||
|
{
|
||||||
|
// We don't support non-alphabetical characters in the dictionary
|
||||||
|
// apart from the joker '?'. For more explanations on the issue, see
|
||||||
|
// on the eliot-dev mailing-list the thread with the following title:
|
||||||
|
// re: Unable to show menus in Catalan, and some weird char "problem"
|
||||||
|
// (started on 2009/12/31)
|
||||||
|
if (!iswalpha(chr) && chr != L'?')
|
||||||
|
{
|
||||||
|
ostringstream ss;
|
||||||
|
ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(chr) << endl;
|
||||||
|
ss << fmt(_("For technical reasons, Eliot currently only supports "
|
||||||
|
"alphabetical characters as internal character "
|
||||||
|
"representation, even if the tile has a display string "
|
||||||
|
"defined. Please use another character and change your "
|
||||||
|
"word list accordingly."));
|
||||||
|
throw DicException(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
const wchar_t upChar = towupper(chr);
|
||||||
|
m_headerInfo.letters += upChar;
|
||||||
|
m_headerInfo.points.push_back(points);
|
||||||
|
m_headerInfo.frequency.push_back(frequency);
|
||||||
|
m_headerInfo.vowels.push_back(isVowel);
|
||||||
|
m_headerInfo.consonants.push_back(isConsonant);
|
||||||
|
|
||||||
|
// Ensure the input strings are in upper case
|
||||||
|
if (!iInputs.empty())
|
||||||
|
{
|
||||||
|
vector<wstring> upperInputs = iInputs;
|
||||||
|
BOOST_FOREACH(wstring &str, upperInputs)
|
||||||
|
{
|
||||||
|
std::transform(str.begin(), str.end(), str.begin(), towupper);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the display string is identical to the internal char and if
|
||||||
|
// there is no other input, no need to save this information, as
|
||||||
|
// it is already the default.
|
||||||
|
if (upperInputs.size() != 1 || upperInputs[0] != wstring(1, upChar))
|
||||||
|
{
|
||||||
|
m_headerInfo.displayInputData[upChar] = upperInputs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDicSize)
|
||||||
{
|
{
|
||||||
ifstream file(iFileName.c_str(), ios::in | ios::binary);
|
ifstream file(iFileName.c_str(), ios::in | ios::binary);
|
||||||
if (!file.is_open())
|
if (!file.is_open())
|
||||||
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
||||||
|
|
||||||
|
// Get the file size
|
||||||
|
struct stat stat_buf;
|
||||||
|
if (stat(iFileName.c_str(), &stat_buf) < 0)
|
||||||
|
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
||||||
|
oDicSize = (unsigned int)stat_buf.st_size;
|
||||||
|
|
||||||
// Place the buffer in a vector to avoid worrying about memory handling
|
// Place the buffer in a vector to avoid worrying about memory handling
|
||||||
vector<char> buffer(ioDicSize);
|
vector<char> buffer(oDicSize);
|
||||||
// Load the file data, everything in one shot
|
// Load the file data, everything in one shot
|
||||||
file.read(&buffer.front(), ioDicSize);
|
file.read(&buffer.front(), oDicSize);
|
||||||
file.close();
|
file.close();
|
||||||
|
|
||||||
// If there is a BOM in the file, use an offset to start reading after it
|
// If there is a BOM in the file, use an offset to start reading after it
|
||||||
|
@ -109,15 +154,15 @@ const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSiz
|
||||||
|
|
||||||
// Buffer for the wide characters (it will use at most as many characters
|
// Buffer for the wide characters (it will use at most as many characters
|
||||||
// as the utf-8 version)
|
// as the utf-8 version)
|
||||||
wchar_t *wideBuf = new wchar_t[ioDicSize];
|
wchar_t *wideBuf = new wchar_t[oDicSize];
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
unsigned int number = readFromUTF8(wideBuf, ioDicSize,
|
unsigned int number = readFromUTF8(wideBuf, oDicSize,
|
||||||
(&buffer.front()) + bomOffset,
|
(&buffer.front()) + bomOffset,
|
||||||
ioDicSize - bomOffset,
|
oDicSize - bomOffset,
|
||||||
"load_uncompressed");
|
"loadWordList");
|
||||||
ioDicSize = number;
|
oDicSize = number;
|
||||||
return wideBuf;
|
return wideBuf;
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
|
@ -129,133 +174,17 @@ const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSiz
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void readLetters(const string &iFileName, DictHeaderInfo &ioHeaderInfo)
|
Header CompDic::writeHeader(ostream &outFile) const
|
||||||
{
|
{
|
||||||
ifstream in(iFileName.c_str());
|
// Go back to the beginning of the stream before writing the header
|
||||||
if (!in.is_open())
|
outFile.seekp(0, ios::beg);
|
||||||
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
Header aHeader(m_headerInfo);
|
||||||
|
aHeader.write(outFile);
|
||||||
// Use a more friendly type name
|
|
||||||
typedef boost::tokenizer<boost::char_separator<wchar_t>,
|
|
||||||
std::wstring::const_iterator,
|
|
||||||
std::wstring> Tokenizer;
|
|
||||||
|
|
||||||
int lineNb = 1;
|
|
||||||
string line;
|
|
||||||
while (getline(in, line))
|
|
||||||
{
|
|
||||||
// Ignore empty lines
|
|
||||||
if (line == "" || line == "\r" || line == "\n")
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// Convert the line to a wstring
|
|
||||||
const wstring &wline = readFromUTF8(line.c_str(), line.size(), "readLetters (1)");
|
|
||||||
// Split the lines on space characters
|
|
||||||
boost::char_separator<wchar_t> sep(L" ");
|
|
||||||
Tokenizer tok(wline, sep);
|
|
||||||
Tokenizer::iterator it;
|
|
||||||
vector<wstring> tokens(tok.begin(), tok.end());
|
|
||||||
|
|
||||||
// We expect at least 5 fields on the line
|
|
||||||
if (tokens.size() < 5)
|
|
||||||
{
|
|
||||||
ostringstream ss;
|
|
||||||
ss << fmt(_("readLetters: Not enough fields "
|
|
||||||
"in %1% (line %2%)")) % iFileName % lineNb;
|
|
||||||
throw DicException(ss.str());
|
|
||||||
}
|
|
||||||
|
|
||||||
// The first field is a single character
|
|
||||||
wstring letter = tokens[0];
|
|
||||||
if (letter.size() != 1)
|
|
||||||
{
|
|
||||||
ostringstream ss;
|
|
||||||
ss << fmt(_("readLetters: Invalid letter at line %1% "
|
|
||||||
"(only one character allowed)")) % lineNb;
|
|
||||||
throw DicException(ss.str());
|
|
||||||
}
|
|
||||||
|
|
||||||
// We don't support non-alphabetical characters in the dictionary
|
|
||||||
// apart from the joker '?'. For more explanations on the issue, see
|
|
||||||
// on the eliot-dev mailing-list the thread with the following title:
|
|
||||||
// re: Unable to show menus in Catalan, and some weird char "problem"
|
|
||||||
// (started on 2009/12/31)
|
|
||||||
wchar_t chr = letter[0];
|
|
||||||
if (!iswalpha(chr) && chr != L'?')
|
|
||||||
{
|
|
||||||
ostringstream ss;
|
|
||||||
ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(letter) << endl;
|
|
||||||
ss << fmt(_("For technical reasons, Eliot currently only supports "
|
|
||||||
"alphabetical characters as internal character "
|
|
||||||
"representation, even if the tile has a display string "
|
|
||||||
"defined. Please use another character and change your "
|
|
||||||
"word list accordingly."));
|
|
||||||
throw DicException(ss.str());
|
|
||||||
}
|
|
||||||
wchar_t upChar = towupper(chr);
|
|
||||||
ioHeaderInfo.letters += upChar;
|
|
||||||
|
|
||||||
ioHeaderInfo.points.push_back(_wtoi(tokens[1].c_str()));
|
|
||||||
ioHeaderInfo.frequency.push_back(_wtoi(tokens[2].c_str()));
|
|
||||||
ioHeaderInfo.vowels.push_back(_wtoi(tokens[3].c_str()));
|
|
||||||
ioHeaderInfo.consonants.push_back(_wtoi(tokens[4].c_str()));
|
|
||||||
|
|
||||||
if (tokens.size() > 5)
|
|
||||||
{
|
|
||||||
vector<wstring> inputs(tokens.begin() + 5, tokens.end());
|
|
||||||
// Ensure the input strings are in upper case
|
|
||||||
BOOST_FOREACH(wstring &str, inputs)
|
|
||||||
{
|
|
||||||
std::transform(str.begin(), str.end(), str.begin(), towupper);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the display string is identical to the internal char and if
|
|
||||||
// there is no other input, no need to save this information, as
|
|
||||||
// it is already the default.
|
|
||||||
if (inputs.size() != 1 || inputs[0] != wstring(1, upChar))
|
|
||||||
{
|
|
||||||
ioHeaderInfo.displayInputData[upChar] = inputs;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
++lineNb;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Header skip_init_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
|
|
||||||
{
|
|
||||||
ioHeaderInfo.root = 0;
|
|
||||||
ioHeaderInfo.nwords = 0;
|
|
||||||
ioHeaderInfo.nodesused = 1;
|
|
||||||
ioHeaderInfo.edgesused = 1;
|
|
||||||
ioHeaderInfo.nodessaved = 0;
|
|
||||||
ioHeaderInfo.edgessaved = 0;
|
|
||||||
|
|
||||||
Header aHeader(ioHeaderInfo);
|
|
||||||
aHeader.write(outfile);
|
|
||||||
return aHeader;
|
return aHeader;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void fix_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
|
void CompDic::writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile)
|
||||||
{
|
|
||||||
ioHeaderInfo.root = ioHeaderInfo.edgesused;
|
|
||||||
// Go back to the beginning of the stream to overwrite the header
|
|
||||||
outfile.seekp(0, ios::beg);
|
|
||||||
#if defined(WORDS_BIGENDIAN)
|
|
||||||
#warning "**********************************************"
|
|
||||||
#warning "compdic does not run yet on bigendian machines"
|
|
||||||
#warning "**********************************************"
|
|
||||||
#else
|
|
||||||
Header aHeader(ioHeaderInfo);
|
|
||||||
aHeader.write(outfile);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Change endianness of the pointed edges, and write them to the given ostream
|
|
||||||
void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile)
|
|
||||||
{
|
{
|
||||||
// Handle endianness
|
// Handle endianness
|
||||||
for (unsigned int i = 0; i < num; ++i)
|
for (unsigned int i = 0; i < num; ++i)
|
||||||
|
@ -267,16 +196,13 @@ void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile)
|
||||||
cout << fmt(_("writing %1% edges")) % num << endl;
|
cout << fmt(_("writing %1% edges")) % num << endl;
|
||||||
for (int i = 0; i < num; i++)
|
for (int i = 0; i < num; i++)
|
||||||
{
|
{
|
||||||
outfile.write((char*)(ioEdges + i), sizeof(DicEdge));
|
outFile.write((char*)(ioEdges + i), sizeof(DicEdge));
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
outfile.write((char*)ioEdges, num * sizeof(DicEdge));
|
outFile.write((char*)ioEdges, num * sizeof(DicEdge));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_STRING_LENGTH 200
|
|
||||||
|
|
||||||
|
|
||||||
#define MAX_EDGES 2000
|
#define MAX_EDGES 2000
|
||||||
/* ods3: ?? */
|
/* ods3: ?? */
|
||||||
/* ods4: 1746 */
|
/* ods4: 1746 */
|
||||||
|
@ -295,62 +221,24 @@ size_t hash_value(const DicEdge &iEdge)
|
||||||
class IncDec
|
class IncDec
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
IncDec(int &ioCounter)
|
IncDec(int &ioCounter) : m_counter(ioCounter) { ++m_counter; }
|
||||||
: m_counter(ioCounter)
|
~IncDec() { --m_counter; }
|
||||||
{
|
|
||||||
m_counter++;
|
|
||||||
}
|
|
||||||
|
|
||||||
~IncDec()
|
|
||||||
{
|
|
||||||
m_counter--;
|
|
||||||
}
|
|
||||||
private:
|
private:
|
||||||
int &m_counter;
|
int &m_counter;
|
||||||
};
|
};
|
||||||
|
|
||||||
int current_rec = 0;
|
|
||||||
int max_rec = 0;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef boost::unordered_map<vector<DicEdge>, unsigned int> HashMap;
|
|
||||||
|
|
||||||
/* global variables */
|
unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
|
||||||
HashMap global_hashmap;
|
const Header &iHeader)
|
||||||
|
|
||||||
wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
|
|
||||||
wchar_t* global_endstring; /* Marks END of current string */
|
|
||||||
const wchar_t* global_input;
|
|
||||||
const wchar_t* global_endofinput;
|
|
||||||
#ifdef CHECK_RECURSION
|
|
||||||
map<int, vector<DicEdge> > global_mapfordepth;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Makenode takes a prefix (as position relative to stringbuf) and
|
|
||||||
* returns an index of the start node of a dawg that recognizes all
|
|
||||||
* words beginning with that prefix. String is a pointer (relative
|
|
||||||
* to stringbuf) indicating how much of iPrefix is matched in the
|
|
||||||
* input.
|
|
||||||
* @param iPrefix: prefix to work on
|
|
||||||
* @param outfile: stream where to write the nodes
|
|
||||||
* @param ioHeaderInfo: information needed to build the final header, updated
|
|
||||||
* during the processing
|
|
||||||
* @param iHeader: temporary header, used only to do the conversion between
|
|
||||||
* the (wide) chars and their corresponding internal code
|
|
||||||
*/
|
|
||||||
unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
|
|
||||||
DictHeaderInfo &ioHeaderInfo, const Header &iHeader)
|
|
||||||
{
|
{
|
||||||
#ifdef CHECK_RECURSION
|
#ifdef CHECK_RECURSION
|
||||||
IncDec inc(current_rec);
|
IncDec inc(m_currentRec);
|
||||||
if (current_rec > max_rec)
|
if (m_currentRec > m_maxRec)
|
||||||
max_rec = current_rec;
|
m_maxRec = m_currentRec;
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CHECK_RECURSION
|
|
||||||
// Instead of creating a vector, try to reuse an existing one
|
// Instead of creating a vector, try to reuse an existing one
|
||||||
vector<DicEdge> &edges = global_mapfordepth[current_rec];
|
vector<DicEdge> &edges = m_mapForDepth[m_currentRec];
|
||||||
edges.reserve(MAX_EDGES);
|
edges.reserve(MAX_EDGES);
|
||||||
edges.clear();
|
edges.clear();
|
||||||
#else
|
#else
|
||||||
|
@ -360,7 +248,7 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
|
||||||
#endif
|
#endif
|
||||||
DicEdge newEdge;
|
DicEdge newEdge;
|
||||||
|
|
||||||
while (iPrefix == global_endstring)
|
while (iPrefix == m_endString)
|
||||||
{
|
{
|
||||||
// More edges out of node
|
// More edges out of node
|
||||||
newEdge.ptr = 0;
|
newEdge.ptr = 0;
|
||||||
|
@ -368,48 +256,47 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
|
||||||
newEdge.last = 0;
|
newEdge.last = 0;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
newEdge.chr = iHeader.getCodeFromChar(*global_endstring++ = *global_input++);
|
newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++);
|
||||||
}
|
}
|
||||||
catch (DicException &e)
|
catch (DicException &e)
|
||||||
{
|
{
|
||||||
// If an invalid character is found, be specific about the problem
|
// If an invalid character is found, be specific about the problem
|
||||||
ostringstream oss;
|
ostringstream oss;
|
||||||
oss << fmt(_("Error on line %1%, col %2%: %3%"))
|
oss << fmt(_("Error on line %1%, col %2%: %3%"))
|
||||||
% (1 + ioHeaderInfo.nwords)
|
% (1 + m_headerInfo.nwords)
|
||||||
% (global_endstring - global_stringbuf)
|
% (m_endString - m_stringBuf)
|
||||||
% e.what() << endl;
|
% e.what() << endl;
|
||||||
throw DicException(oss.str());
|
throw DicException(oss.str());
|
||||||
}
|
}
|
||||||
edges.push_back(newEdge);
|
edges.push_back(newEdge);
|
||||||
|
|
||||||
// End of a word?
|
// End of a word?
|
||||||
if (*global_input == L'\n' || *global_input == L'\r')
|
if (*m_input == L'\n' || *m_input == L'\r')
|
||||||
{
|
{
|
||||||
ioHeaderInfo.nwords++;
|
m_headerInfo.nwords++;
|
||||||
*global_endstring = L'\0';
|
*m_endString = L'\0';
|
||||||
// Mark edge as word
|
// Mark edge as word
|
||||||
edges.back().term = 1;
|
edges.back().term = 1;
|
||||||
|
|
||||||
// Skip \r and/or \n
|
// Skip \r and/or \n
|
||||||
while (global_input != global_endofinput &&
|
while (m_input != m_endOfInput &&
|
||||||
(*global_input == L'\n' || *global_input == L'\r'))
|
(*m_input == L'\n' || *m_input == L'\r'))
|
||||||
{
|
{
|
||||||
++global_input;
|
++m_input;
|
||||||
}
|
}
|
||||||
// At the end of input?
|
// At the end of input?
|
||||||
if (global_input == global_endofinput)
|
if (m_input == m_endOfInput)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
global_endstring = global_stringbuf;
|
m_endString = m_stringBuf;
|
||||||
while (*global_endstring == *global_input)
|
while (*m_endString == *m_input)
|
||||||
{
|
{
|
||||||
global_endstring++;
|
m_endString++;
|
||||||
global_input++;
|
m_input++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Make dawg pointed to by this edge
|
// Make dawg pointed to by this edge
|
||||||
edges.back().ptr =
|
edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader);
|
||||||
makenode(iPrefix + 1, outfile, ioHeaderInfo, iHeader);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int numedges = edges.size();
|
int numedges = edges.size();
|
||||||
|
@ -422,212 +309,95 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
|
||||||
// Mark the last edge
|
// Mark the last edge
|
||||||
edges.back().last = 1;
|
edges.back().last = 1;
|
||||||
|
|
||||||
HashMap::const_iterator itMap = global_hashmap.find(edges);
|
HashMap::const_iterator itMap = m_hashMap.find(edges);
|
||||||
if (itMap != global_hashmap.end())
|
if (itMap != m_hashMap.end())
|
||||||
{
|
{
|
||||||
ioHeaderInfo.edgessaved += numedges;
|
m_headerInfo.edgessaved += numedges;
|
||||||
ioHeaderInfo.nodessaved++;
|
m_headerInfo.nodessaved++;
|
||||||
|
|
||||||
return itMap->second;
|
return itMap->second;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
unsigned int node_pos = ioHeaderInfo.edgesused;
|
unsigned int node_pos = m_headerInfo.edgesused;
|
||||||
global_hashmap[edges] = ioHeaderInfo.edgesused;
|
m_hashMap[edges] = m_headerInfo.edgesused;
|
||||||
ioHeaderInfo.edgesused += numedges;
|
m_headerInfo.edgesused += numedges;
|
||||||
ioHeaderInfo.nodesused++;
|
m_headerInfo.nodesused++;
|
||||||
write_node(reinterpret_cast<uint32_t*>(&edges.front()),
|
writeNode(reinterpret_cast<uint32_t*>(&edges.front()),
|
||||||
numedges, outfile);
|
numedges, outFile);
|
||||||
|
|
||||||
return node_pos;
|
return node_pos;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void printUsage(const string &iBinaryName)
|
Header CompDic::generateDawg(const string &iWordListFile,
|
||||||
|
const string &iDawgFile,
|
||||||
|
const string &iDicName)
|
||||||
{
|
{
|
||||||
cout << "Usage: " << iBinaryName << " [options]" << endl
|
m_headerInfo.dicName = convertToWc(iDicName);
|
||||||
<< _("Mandatory options:") << endl
|
// We are not (yet) able to build the GADDAG format
|
||||||
<< _(" -d, --dicname <string> Set the dictionary name and version") << endl
|
m_headerInfo.dawg = true;
|
||||||
<< _(" -l, --letters <string> Path to the file containing the letters (see below)") << endl
|
|
||||||
<< _(" -i, --input <string> Path to the uncompressed dictionary file (encoded in UTF-8)") << endl
|
|
||||||
<< _(" The words must be in alphabetical order, without duplicates") << endl
|
|
||||||
<< _(" -o, --output <string Path to the generated compressed dictionary file") << endl
|
|
||||||
<< _("Other options:") << endl
|
|
||||||
<< _(" -h, --help Print this help and exit") << endl
|
|
||||||
<< _("Example:") << endl
|
|
||||||
<< " " << iBinaryName << _(" -d 'ODS 5.0' -l letters.txt -i ods5.txt -o ods5.dawg") << endl
|
|
||||||
<< endl
|
|
||||||
<< _("The file containing the letters (--letters switch) must be UTF-8 encoded.") << endl
|
|
||||||
<< _("Each line corresponds to one letter, and must contain at least 5 fields separated with "
|
|
||||||
"one or more space(s).") << endl
|
|
||||||
<< _(" - 1st field: the letter itself, as stored in the input file (single character)") << endl
|
|
||||||
<< _(" - 2nd field: the points of the letter") << endl
|
|
||||||
<< _(" - 3rd field: the frequency of the letter (how many letters of this kind in the game)") << endl
|
|
||||||
<< _(" - 4th field: 1 if the letter is considered as a vowel in Scrabble game, 0 otherwise") << endl
|
|
||||||
<< _(" - 5th field: 1 if the letter is considered as a consonant in Scrabble game, 0 otherwise") << endl
|
|
||||||
<< _(" - 6th field (optional): display string for the letter (default: the letter itself)") << endl
|
|
||||||
<< _(" - other fields (optional): input strings for the letter, in addition to the display string") << endl
|
|
||||||
<< endl
|
|
||||||
<< _("Example for french:") << endl
|
|
||||||
<< "A 1 9 1 0" << endl
|
|
||||||
<< "[...]" << endl
|
|
||||||
<< "Z 10 1 0 1" << endl
|
|
||||||
<< "? 0 2 1 1" << endl
|
|
||||||
<< endl
|
|
||||||
<< _("Example for catalan:") << endl
|
|
||||||
<< "A 1 12 1 0" << endl
|
|
||||||
<< "[...]" << endl
|
|
||||||
// TRANSLATORS: the first "L.L" must be translated "L·L",
|
|
||||||
// and the last one translated "ĿL"
|
|
||||||
<< _("W 10 1 0 1 L.L L.L L-L L.L") << endl
|
|
||||||
<< "X 10 1 0 1" << endl
|
|
||||||
<< "Y 10 1 0 1 NY" << endl
|
|
||||||
<< "[...]" << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Open the output file
|
||||||
int main(int argc, char* argv[])
|
ofstream outFile(iDawgFile.c_str(), ios::out | ios::binary | ios::trunc);
|
||||||
{
|
if (!outFile.is_open())
|
||||||
#if HAVE_SETLOCALE
|
|
||||||
// Set locale via LC_ALL
|
|
||||||
setlocale(LC_ALL, "");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if ENABLE_NLS
|
|
||||||
// Set the message domain
|
|
||||||
#ifdef WIN32
|
|
||||||
// Get the absolute path, as returned by GetFullPathName()
|
|
||||||
char baseDir[MAX_PATH];
|
|
||||||
GetFullPathName(argv[0], MAX_PATH, baseDir, NULL);
|
|
||||||
char *pos = strrchr(baseDir, L'\\');
|
|
||||||
if (pos)
|
|
||||||
*pos = '\0';
|
|
||||||
const string localeDir = baseDir + string("\\locale");
|
|
||||||
#else
|
|
||||||
static const string localeDir = LOCALEDIR;
|
|
||||||
#endif
|
|
||||||
bindtextdomain(PACKAGE, localeDir.c_str());
|
|
||||||
textdomain(PACKAGE);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static const struct option long_options[] =
|
|
||||||
{
|
{
|
||||||
{"help", no_argument, NULL, 'h'},
|
ostringstream oss;
|
||||||
{"dicname", required_argument, NULL, 'd'},
|
oss << fmt(_("Cannot open output file '%1%'")) % iDawgFile;
|
||||||
{"letters", required_argument, NULL, 'l'},
|
throw DicException(oss.str());
|
||||||
{"input", required_argument, NULL, 'i'},
|
}
|
||||||
{"output", required_argument, NULL, 'o'},
|
|
||||||
{0, 0, 0, 0}
|
|
||||||
};
|
|
||||||
static const char short_options[] = "hd:l:i:o:";
|
|
||||||
|
|
||||||
bool found_d = false;
|
const wchar_t *wordList = NULL;
|
||||||
bool found_l = false;
|
|
||||||
bool found_i = false;
|
|
||||||
bool found_o = false;
|
|
||||||
string inFileName;
|
|
||||||
string outFileName;
|
|
||||||
DictHeaderInfo headerInfo;
|
|
||||||
|
|
||||||
int res;
|
|
||||||
int option_index = 1;
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
while ((res = getopt_long(argc, argv, short_options,
|
const clock_t startLoadTime = clock();
|
||||||
long_options, &option_index)) != -1)
|
unsigned int dicSize;
|
||||||
{
|
wordList = loadWordList(iWordListFile, dicSize);
|
||||||
switch (res)
|
const clock_t endLoadTime = clock();
|
||||||
{
|
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
|
||||||
case 'h':
|
|
||||||
printUsage(argv[0]);
|
|
||||||
exit(0);
|
|
||||||
case 'd':
|
|
||||||
found_d = true;
|
|
||||||
headerInfo.dicName = convertToWc(optarg);
|
|
||||||
break;
|
|
||||||
case 'l':
|
|
||||||
found_l = true;
|
|
||||||
readLetters(optarg, headerInfo);
|
|
||||||
break;
|
|
||||||
case 'i':
|
|
||||||
found_i = true;
|
|
||||||
inFileName = optarg;
|
|
||||||
break;
|
|
||||||
case 'o':
|
|
||||||
found_o = true;
|
|
||||||
outFileName = optarg;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check mandatory options
|
m_input = wordList;
|
||||||
if (!found_d || !found_l || !found_i || !found_o)
|
m_endOfInput = m_input + dicSize;
|
||||||
{
|
|
||||||
cerr << _("A mandatory option is missing") << endl;
|
|
||||||
printUsage(argv[0]);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int dicSize = getFileSize(inFileName);
|
// Write the header a first time, to reserve the space in the file
|
||||||
|
Header tempHeader = writeHeader(outFile);
|
||||||
|
|
||||||
ofstream outfile(outFileName.c_str(), ios::out | ios::binary | ios::trunc);
|
DicEdge specialNode = {0, 0, 0, 0};
|
||||||
if (!outfile.is_open())
|
specialNode.last = 1;
|
||||||
{
|
|
||||||
cerr << fmt(_("Cannot open output file '%1%'")) % outFileName << endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
clock_t startLoadTime = clock();
|
|
||||||
// FIXME: not exception safe
|
|
||||||
const wchar_t *uncompressed = load_uncompressed(inFileName, dicSize);
|
|
||||||
clock_t endLoadTime = clock();
|
|
||||||
|
|
||||||
global_input = uncompressed;
|
|
||||||
global_endofinput = global_input + dicSize;
|
|
||||||
|
|
||||||
headerInfo.dawg = true;
|
|
||||||
Header tempHeader = skip_init_header(outfile, headerInfo);
|
|
||||||
|
|
||||||
DicEdge specialnode = {0, 0, 0, 0};
|
|
||||||
specialnode.last = 1;
|
|
||||||
// Temporary variable to avoid a warning when compiling with -O2
|
// Temporary variable to avoid a warning when compiling with -O2
|
||||||
// (there is no warning with -O0... g++ bug?)
|
// (there is no warning with -O0... g++ bug?)
|
||||||
DicEdge *tmpPtr = &specialnode;
|
DicEdge *tmpPtr = &specialNode;
|
||||||
write_node(reinterpret_cast<uint32_t*>(tmpPtr), 1, outfile);
|
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
||||||
|
|
||||||
/*
|
// Call makeNode with null (relative to stringbuf) prefix;
|
||||||
* Call makenode with null (relative to stringbuf) prefix;
|
// Initialize string to null; Put index of start node on output
|
||||||
* Initialize string to null; Put index of start node on output
|
DicEdge rootNode = {0, 0, 0, 0};
|
||||||
*/
|
m_endString = m_stringBuf;
|
||||||
DicEdge rootnode = {0, 0, 0, 0};
|
const clock_t startBuildTime = clock();
|
||||||
global_endstring = global_stringbuf;
|
rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
|
||||||
clock_t startBuildTime = clock();
|
|
||||||
rootnode.ptr = makenode(global_endstring, outfile, headerInfo, tempHeader);
|
|
||||||
clock_t endBuildTime = clock();
|
|
||||||
// Reuse the temporary variable
|
// Reuse the temporary variable
|
||||||
tmpPtr = &rootnode;
|
tmpPtr = &rootNode;
|
||||||
write_node(reinterpret_cast<uint32_t*>(tmpPtr), 1, outfile);
|
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
|
||||||
|
const clock_t endBuildTime = clock();
|
||||||
|
m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
|
||||||
|
|
||||||
fix_header(outfile, headerInfo);
|
// Write the header again, now that it is complete
|
||||||
|
m_headerInfo.root = m_headerInfo.edgesused;
|
||||||
|
const Header finalHeader = writeHeader(outFile);
|
||||||
|
|
||||||
Header aHeader(headerInfo);
|
// Clean up
|
||||||
aHeader.print();
|
delete[] wordList;
|
||||||
|
outFile.close();
|
||||||
|
|
||||||
delete[] uncompressed;
|
return finalHeader;
|
||||||
outfile.close();
|
|
||||||
|
|
||||||
printf(_(" Load time: %.3f s\n"), 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC);
|
|
||||||
printf(_(" Compression time: %.3f s\n"), 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC);
|
|
||||||
#ifdef CHECK_RECURSION
|
|
||||||
cout << fmt(_(" Maximum recursion level reached: %1%")) % max_rec << endl;
|
|
||||||
#endif
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
catch (std::exception &e)
|
catch (std::exception &e)
|
||||||
{
|
{
|
||||||
cerr << fmt(_("Exception caught: %1%")) % e.what() << endl;
|
// Avoid memory leaks
|
||||||
return 1;
|
if (wordList != NULL)
|
||||||
|
delete[] wordList;
|
||||||
|
throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
150
dic/compdic.h
Normal file
150
dic/compdic.h
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
* Eliot
|
||||||
|
* Copyright (C) 2005-2007 Antoine Fraboulet
|
||||||
|
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef DIC_COMPDIC_H_
|
||||||
|
#define DIC_COMPDIC_H_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <iosfwd>
|
||||||
|
#include <boost/unordered_map.hpp>
|
||||||
|
|
||||||
|
#include "header.h"
|
||||||
|
#include "dic_internals.h"
|
||||||
|
|
||||||
|
class DicEdge;
|
||||||
|
class DictHeaderInfo;
|
||||||
|
class Header;
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
//#define DEBUG_OUTPUT
|
||||||
|
#define CHECK_RECURSION
|
||||||
|
|
||||||
|
|
||||||
|
class CompDic
|
||||||
|
{
|
||||||
|
typedef boost::unordered_map<vector<DicEdge>, unsigned int> HashMap;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CompDic();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Define a new letter. The letter must be alphabetic (i.e. iswalpha()
|
||||||
|
* returns true for it).
|
||||||
|
* @param letter: Letter to addLetter
|
||||||
|
* @param points: Points of the letter
|
||||||
|
* @param frequency: Number of occurrences of the letter in the game
|
||||||
|
* @param isVowel: True if the letter can be considered as a vowel,
|
||||||
|
* false otherwise
|
||||||
|
* @param isConsonant: True if the letter can be considered as a consonant,
|
||||||
|
* false otherwise
|
||||||
|
* @param iInputs: Vector containing the various ways to input the letter.
|
||||||
|
* If not empty, the first value corresponds to the display string.
|
||||||
|
*/
|
||||||
|
void addLetter(wchar_t letter, int points, int frequency,
|
||||||
|
bool isVowel, bool isConsonant,
|
||||||
|
const vector<wstring> &iInputs);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate the dictionary. You must have called addLetter() before
|
||||||
|
* (once for each letter of the word list, and possible once for the
|
||||||
|
* joker).
|
||||||
|
* @param iWordListFile: Name (and path) of the word list file
|
||||||
|
* @param iDawgFile: Name (and path) of the generated dawg file
|
||||||
|
* @param iDicName: Internal name of the dictionary
|
||||||
|
* @return The header of the generated dawg
|
||||||
|
*/
|
||||||
|
Header generateDawg(const string &iWordListFile,
|
||||||
|
const string &iDawgFile,
|
||||||
|
const string &iDicName);
|
||||||
|
|
||||||
|
// Statistics
|
||||||
|
double getLoadTime() const { return m_loadTime; }
|
||||||
|
double getBuildTime() const { return m_buildTime; }
|
||||||
|
#ifdef CHECK_RECURSION
|
||||||
|
double getMaxRecursion() const { return m_maxRec; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
private:
|
||||||
|
DictHeaderInfo m_headerInfo;
|
||||||
|
|
||||||
|
HashMap m_hashMap;
|
||||||
|
|
||||||
|
#define MAX_STRING_LENGTH 200
|
||||||
|
|
||||||
|
/// Space for the current string
|
||||||
|
wchar_t m_stringBuf[MAX_STRING_LENGTH];
|
||||||
|
/// Point to the end of the string
|
||||||
|
wchar_t* m_endString;
|
||||||
|
/// Current position in the word list
|
||||||
|
const wchar_t *m_input;
|
||||||
|
/// Mark the end of the input
|
||||||
|
const wchar_t *m_endOfInput;
|
||||||
|
#ifdef CHECK_RECURSION
|
||||||
|
map<int, vector<DicEdge> > m_mapForDepth;
|
||||||
|
int m_currentRec;
|
||||||
|
int m_maxRec;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
double m_loadTime;
|
||||||
|
double m_buildTime;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the word list stored in iFileName, convert it to wide chars,
|
||||||
|
* and return it. The oDicSize parameter contains the size of the
|
||||||
|
* returned array.
|
||||||
|
* In case of problem, an exception is thrown.
|
||||||
|
* @param iFileName: Name (and path) of the file containing the word list.
|
||||||
|
* @param oDicSize: Size of the returned array
|
||||||
|
* @return Word list as a wchar_t array
|
||||||
|
*/
|
||||||
|
const wchar_t * loadWordList(const string &iFileName,
|
||||||
|
unsigned int &oDicSize);
|
||||||
|
|
||||||
|
Header writeHeader(ostream &outFile) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Change the endianness of the pointed edges (if needed),
|
||||||
|
* and write them to the given ostream.
|
||||||
|
* @param ioEdges: array of edges
|
||||||
|
* @param num: number of edges in the array
|
||||||
|
* @param outFile: stream where to write the edges
|
||||||
|
*/
|
||||||
|
void writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MakeNode takes a prefix (as position relative to m_stringBuf) and
|
||||||
|
* returns the index of the start node of a dawg that recognizes all
|
||||||
|
* the words beginning with that prefix. String is a pointer (relative
|
||||||
|
* to m_stringBuf) indicating how much of iPrefix is matched in the
|
||||||
|
* input.
|
||||||
|
* @param iPrefix: prefix to work on
|
||||||
|
* @param outfile: stream where to write the nodes
|
||||||
|
* @param iHeader: temporary header, used only to do the conversion between
|
||||||
|
* the (wide) chars and their corresponding internal code
|
||||||
|
*/
|
||||||
|
unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile,
|
||||||
|
const Header &iHeader);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* DIC_COMPDIC_H_ */
|
261
dic/compdicmain.cpp
Normal file
261
dic/compdicmain.cpp
Normal file
|
@ -0,0 +1,261 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
* Eliot
|
||||||
|
* Copyright (C) 1999-2007 Antoine Fraboulet & Olivier Teulière
|
||||||
|
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
|
||||||
|
* Olivier Teulière <ipkiss @@ gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <boost/format.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
#include <boost/tokenizer.hpp>
|
||||||
|
#include <getopt.h>
|
||||||
|
|
||||||
|
#if ENABLE_NLS
|
||||||
|
# include <libintl.h>
|
||||||
|
# define _(String) gettext(String)
|
||||||
|
#else
|
||||||
|
# define _(String) String
|
||||||
|
#endif
|
||||||
|
#ifdef WIN32
|
||||||
|
# include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "compdic.h"
|
||||||
|
#include "dic_exception.h"
|
||||||
|
#include "encoding.h"
|
||||||
|
#include "header.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Useful shortcut
|
||||||
|
#define fmt(a) boost::format(a)
|
||||||
|
|
||||||
|
|
||||||
|
void readLetters(const string &iFileName, CompDic &ioBuilder)
|
||||||
|
{
|
||||||
|
ifstream in(iFileName.c_str());
|
||||||
|
if (!in.is_open())
|
||||||
|
throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
|
||||||
|
|
||||||
|
// Use a more friendly type name
|
||||||
|
typedef boost::tokenizer<boost::char_separator<wchar_t>,
|
||||||
|
std::wstring::const_iterator,
|
||||||
|
std::wstring> Tokenizer;
|
||||||
|
|
||||||
|
int lineNb = 1;
|
||||||
|
string line;
|
||||||
|
while (getline(in, line))
|
||||||
|
{
|
||||||
|
// Ignore empty lines
|
||||||
|
if (line == "" || line == "\r" || line == "\n")
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Convert the line to a wstring
|
||||||
|
const wstring &wline =
|
||||||
|
readFromUTF8(line.c_str(), line.size(), "readLetters (1)");
|
||||||
|
// Split the lines on space characters
|
||||||
|
boost::char_separator<wchar_t> sep(L" ");
|
||||||
|
Tokenizer tok(wline, sep);
|
||||||
|
Tokenizer::iterator it;
|
||||||
|
vector<wstring> tokens(tok.begin(), tok.end());
|
||||||
|
|
||||||
|
// We expect at least 5 fields on the line
|
||||||
|
if (tokens.size() < 5)
|
||||||
|
{
|
||||||
|
ostringstream ss;
|
||||||
|
ss << fmt(_("readLetters: Not enough fields "
|
||||||
|
"in %1% (line %2%)")) % iFileName % lineNb;
|
||||||
|
throw DicException(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// The first field is a single character
|
||||||
|
wstring letter = tokens[0];
|
||||||
|
if (letter.size() != 1)
|
||||||
|
{
|
||||||
|
ostringstream ss;
|
||||||
|
ss << fmt(_("readLetters: Invalid letter at line %1% "
|
||||||
|
"(only one character allowed)")) % lineNb;
|
||||||
|
throw DicException(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<wstring> inputs;
|
||||||
|
if (tokens.size() > 5)
|
||||||
|
{
|
||||||
|
inputs = vector<wstring>(tokens.begin() + 5, tokens.end());
|
||||||
|
}
|
||||||
|
ioBuilder.addLetter(letter[0], _wtoi(tokens[1].c_str()),
|
||||||
|
_wtoi(tokens[2].c_str()), _wtoi(tokens[3].c_str()),
|
||||||
|
_wtoi(tokens[4].c_str()), inputs);
|
||||||
|
|
||||||
|
++lineNb;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void printUsage(const string &iBinaryName)
|
||||||
|
{
|
||||||
|
cout << "Usage: " << iBinaryName << " [options]" << endl
|
||||||
|
<< _("Mandatory options:") << endl
|
||||||
|
<< _(" -d, --dicname <string> Set the dictionary name and version") << endl
|
||||||
|
<< _(" -l, --letters <string> Path to the file containing the letters (see below)") << endl
|
||||||
|
<< _(" -i, --input <string> Path to the uncompressed dictionary file (encoded in UTF-8)") << endl
|
||||||
|
<< _(" The words must be in alphabetical order, without duplicates") << endl
|
||||||
|
<< _(" -o, --output <string Path to the generated compressed dictionary file") << endl
|
||||||
|
<< _("Other options:") << endl
|
||||||
|
<< _(" -h, --help Print this help and exit") << endl
|
||||||
|
<< _("Example:") << endl
|
||||||
|
<< " " << iBinaryName << _(" -d 'ODS 5.0' -l letters.txt -i ods5.txt -o ods5.dawg") << endl
|
||||||
|
<< endl
|
||||||
|
<< _("The file containing the letters (--letters switch) must be UTF-8 encoded.") << endl
|
||||||
|
<< _("Each line corresponds to one letter, and must contain at least 5 fields separated with "
|
||||||
|
"one or more space(s).") << endl
|
||||||
|
<< _(" - 1st field: the letter itself, as stored in the input file (single character)") << endl
|
||||||
|
<< _(" - 2nd field: the points of the letter") << endl
|
||||||
|
<< _(" - 3rd field: the frequency of the letter (how many letters of this kind in the game)") << endl
|
||||||
|
<< _(" - 4th field: 1 if the letter is considered as a vowel in Scrabble game, 0 otherwise") << endl
|
||||||
|
<< _(" - 5th field: 1 if the letter is considered as a consonant in Scrabble game, 0 otherwise") << endl
|
||||||
|
<< _(" - 6th field (optional): display string for the letter (default: the letter itself)") << endl
|
||||||
|
<< _(" - other fields (optional): input strings for the letter, in addition to the display string") << endl
|
||||||
|
<< endl
|
||||||
|
<< _("Example for french:") << endl
|
||||||
|
<< "A 1 9 1 0" << endl
|
||||||
|
<< "[...]" << endl
|
||||||
|
<< "Z 10 1 0 1" << endl
|
||||||
|
<< "? 0 2 1 1" << endl
|
||||||
|
<< endl
|
||||||
|
<< _("Example for catalan:") << endl
|
||||||
|
<< "A 1 12 1 0" << endl
|
||||||
|
<< "[...]" << endl
|
||||||
|
// TRANSLATORS: the first "L.L" must be translated "L·L",
|
||||||
|
// and the last one translated "ĿL"
|
||||||
|
<< _("W 10 1 0 1 L.L L.L L-L L.L") << endl
|
||||||
|
<< "X 10 1 0 1" << endl
|
||||||
|
<< "Y 10 1 0 1 NY" << endl
|
||||||
|
<< "[...]" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
#if HAVE_SETLOCALE
|
||||||
|
// Set locale via LC_ALL
|
||||||
|
setlocale(LC_ALL, "");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if ENABLE_NLS
|
||||||
|
// Set the message domain
|
||||||
|
#ifdef WIN32
|
||||||
|
// Get the absolute path, as returned by GetFullPathName()
|
||||||
|
char baseDir[MAX_PATH];
|
||||||
|
GetFullPathName(argv[0], MAX_PATH, baseDir, NULL);
|
||||||
|
char *pos = strrchr(baseDir, L'\\');
|
||||||
|
if (pos)
|
||||||
|
*pos = '\0';
|
||||||
|
const string localeDir = baseDir + string("\\locale");
|
||||||
|
#else
|
||||||
|
static const string localeDir = LOCALEDIR;
|
||||||
|
#endif
|
||||||
|
bindtextdomain(PACKAGE, localeDir.c_str());
|
||||||
|
textdomain(PACKAGE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static const struct option long_options[] =
|
||||||
|
{
|
||||||
|
{"help", no_argument, NULL, 'h'},
|
||||||
|
{"dicname", required_argument, NULL, 'd'},
|
||||||
|
{"letters", required_argument, NULL, 'l'},
|
||||||
|
{"input", required_argument, NULL, 'i'},
|
||||||
|
{"output", required_argument, NULL, 'o'},
|
||||||
|
{0, 0, 0, 0}
|
||||||
|
};
|
||||||
|
static const char short_options[] = "hd:l:i:o:";
|
||||||
|
|
||||||
|
bool found_d = false;
|
||||||
|
bool found_l = false;
|
||||||
|
bool found_i = false;
|
||||||
|
bool found_o = false;
|
||||||
|
string dicName;
|
||||||
|
string inFileName;
|
||||||
|
string outFileName;
|
||||||
|
CompDic builder;
|
||||||
|
|
||||||
|
int res;
|
||||||
|
int option_index = 1;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
while ((res = getopt_long(argc, argv, short_options,
|
||||||
|
long_options, &option_index)) != -1)
|
||||||
|
{
|
||||||
|
switch (res)
|
||||||
|
{
|
||||||
|
case 'h':
|
||||||
|
printUsage(argv[0]);
|
||||||
|
exit(0);
|
||||||
|
case 'd':
|
||||||
|
found_d = true;
|
||||||
|
dicName = optarg;
|
||||||
|
break;
|
||||||
|
case 'l':
|
||||||
|
found_l = true;
|
||||||
|
readLetters(optarg, builder);
|
||||||
|
break;
|
||||||
|
case 'i':
|
||||||
|
found_i = true;
|
||||||
|
inFileName = optarg;
|
||||||
|
break;
|
||||||
|
case 'o':
|
||||||
|
found_o = true;
|
||||||
|
outFileName = optarg;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check mandatory options
|
||||||
|
if (!found_d || !found_l || !found_i || !found_o)
|
||||||
|
{
|
||||||
|
cerr << _("A mandatory option is missing") << endl;
|
||||||
|
printUsage(argv[0]);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate the dictionary
|
||||||
|
const Header &header =
|
||||||
|
builder.generateDawg(inFileName, outFileName, dicName);
|
||||||
|
|
||||||
|
// Print the header
|
||||||
|
header.print();
|
||||||
|
|
||||||
|
cout << fmt(_(" Load time: %1% s")) % builder.getLoadTime() << endl;
|
||||||
|
cout << fmt(_(" Compression time: %1% s")) % builder.getBuildTime() << endl;
|
||||||
|
#ifdef CHECK_RECURSION
|
||||||
|
cout << fmt(_(" Maximum recursion level reached: %1%")) % builder.getMaxRecursion() << endl;
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
catch (std::exception &e)
|
||||||
|
{
|
||||||
|
cerr << fmt(_("Exception caught: %1%")) % e.what() << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue