diff --git a/dic/Makefile.am b/dic/Makefile.am index dfe0d18..bb1aa35 100644 --- a/dic/Makefile.am +++ b/dic/Makefile.am @@ -32,7 +32,8 @@ libdic_a_SOURCES = \ encoding.cpp encoding.h \ automaton.cpp automaton.h \ regexp.cpp regexp.h \ - grammar.cpp grammar.h + grammar.cpp grammar.h \ + compdic.cpp compdic.h ##################################### if BUILD_DICTOOLS @@ -42,7 +43,7 @@ bin_PROGRAMS = \ listdic \ regexp -compdic_SOURCES=compdic.cpp +compdic_SOURCES=compdicmain.cpp compdic_CPPFLAGS=$(AM_CPPFLAGS) @BOOST_CPPFLAGS@ compdic_LDADD=libdic.a @LIBINTL@ diff --git a/dic/compdic.cpp b/dic/compdic.cpp index 365340b..0f05e24 100644 --- a/dic/compdic.cpp +++ b/dic/compdic.cpp @@ -24,14 +24,10 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include #include #include #include @@ -42,6 +38,10 @@ #include #include +#include "compdic.h" +#include "encoding.h" +#include "dic_exception.h" + // For htonl & Co. #ifdef WIN32 # include @@ -60,42 +60,87 @@ #else # define _(String) String #endif -#ifdef WIN32 -# include -#endif - -#include "encoding.h" -#include "header.h" -#include "dic_internals.h" -#include "dic_exception.h" - -using namespace std; // Useful shortcut #define fmt(a) boost::format(a) -//#define DEBUG_OUTPUT -#define CHECK_RECURSION - -unsigned int getFileSize(const string &iFileName) +CompDic::CompDic() + : m_currentRec(0), m_maxRec(0), m_loadTime(0), m_buildTime(0) { - struct stat stat_buf; - if (stat(iFileName.c_str(), &stat_buf) < 0) - throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); - return (unsigned int)stat_buf.st_size; + m_headerInfo.root = 0; + m_headerInfo.nwords = 0; + m_headerInfo.nodesused = 1; + m_headerInfo.edgesused = 1; + m_headerInfo.nodessaved = 0; + m_headerInfo.edgessaved = 0; } -const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize) + +void CompDic::addLetter(wchar_t chr, int points, int frequency, + bool isVowel, bool isConsonant, + const vector &iInputs) +{ + // We don't support non-alphabetical characters in the dictionary + // apart from the joker '?'. For more explanations on the issue, see + // on the eliot-dev mailing-list the thread with the following title: + // re: Unable to show menus in Catalan, and some weird char "problem" + // (started on 2009/12/31) + if (!iswalpha(chr) && chr != L'?') + { + ostringstream ss; + ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(chr) << endl; + ss << fmt(_("For technical reasons, Eliot currently only supports " + "alphabetical characters as internal character " + "representation, even if the tile has a display string " + "defined. Please use another character and change your " + "word list accordingly.")); + throw DicException(ss.str()); + } + + const wchar_t upChar = towupper(chr); + m_headerInfo.letters += upChar; + m_headerInfo.points.push_back(points); + m_headerInfo.frequency.push_back(frequency); + m_headerInfo.vowels.push_back(isVowel); + m_headerInfo.consonants.push_back(isConsonant); + + // Ensure the input strings are in upper case + if (!iInputs.empty()) + { + vector upperInputs = iInputs; + BOOST_FOREACH(wstring &str, upperInputs) + { + std::transform(str.begin(), str.end(), str.begin(), towupper); + } + + // If the display string is identical to the internal char and if + // there is no other input, no need to save this information, as + // it is already the default. 
+ if (upperInputs.size() != 1 || upperInputs[0] != wstring(1, upChar)) + { + m_headerInfo.displayInputData[upChar] = upperInputs; + } + } +} + + +const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int &oDicSize) { ifstream file(iFileName.c_str(), ios::in | ios::binary); if (!file.is_open()) throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); + // Get the file size + struct stat stat_buf; + if (stat(iFileName.c_str(), &stat_buf) < 0) + throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); + oDicSize = (unsigned int)stat_buf.st_size; + // Place the buffer in a vector to avoid worrying about memory handling - vector buffer(ioDicSize); + vector buffer(oDicSize); // Load the file data, everything in one shot - file.read(&buffer.front(), ioDicSize); + file.read(&buffer.front(), oDicSize); file.close(); // If there is a BOM in the file, use an offset to start reading after it @@ -109,15 +154,15 @@ const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSiz // Buffer for the wide characters (it will use at most as many characters // as the utf-8 version) - wchar_t *wideBuf = new wchar_t[ioDicSize]; + wchar_t *wideBuf = new wchar_t[oDicSize]; try { - unsigned int number = readFromUTF8(wideBuf, ioDicSize, + unsigned int number = readFromUTF8(wideBuf, oDicSize, (&buffer.front()) + bomOffset, - ioDicSize - bomOffset, - "load_uncompressed"); - ioDicSize = number; + oDicSize - bomOffset, + "loadWordList"); + oDicSize = number; return wideBuf; } catch (...) @@ -129,133 +174,17 @@ const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSiz } -void readLetters(const string &iFileName, DictHeaderInfo &ioHeaderInfo) +Header CompDic::writeHeader(ostream &outFile) const { - ifstream in(iFileName.c_str()); - if (!in.is_open()) - throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); - - // Use a more friendly type name - typedef boost::tokenizer, - std::wstring::const_iterator, - std::wstring> Tokenizer; - - int lineNb = 1; - string line; - while (getline(in, line)) - { - // Ignore empty lines - if (line == "" || line == "\r" || line == "\n") - continue; - - // Convert the line to a wstring - const wstring &wline = readFromUTF8(line.c_str(), line.size(), "readLetters (1)"); - // Split the lines on space characters - boost::char_separator sep(L" "); - Tokenizer tok(wline, sep); - Tokenizer::iterator it; - vector tokens(tok.begin(), tok.end()); - - // We expect at least 5 fields on the line - if (tokens.size() < 5) - { - ostringstream ss; - ss << fmt(_("readLetters: Not enough fields " - "in %1% (line %2%)")) % iFileName % lineNb; - throw DicException(ss.str()); - } - - // The first field is a single character - wstring letter = tokens[0]; - if (letter.size() != 1) - { - ostringstream ss; - ss << fmt(_("readLetters: Invalid letter at line %1% " - "(only one character allowed)")) % lineNb; - throw DicException(ss.str()); - } - - // We don't support non-alphabetical characters in the dictionary - // apart from the joker '?'. 
For more explanations on the issue, see - // on the eliot-dev mailing-list the thread with the following title: - // re: Unable to show menus in Catalan, and some weird char "problem" - // (started on 2009/12/31) - wchar_t chr = letter[0]; - if (!iswalpha(chr) && chr != L'?') - { - ostringstream ss; - ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(letter) << endl; - ss << fmt(_("For technical reasons, Eliot currently only supports " - "alphabetical characters as internal character " - "representation, even if the tile has a display string " - "defined. Please use another character and change your " - "word list accordingly.")); - throw DicException(ss.str()); - } - wchar_t upChar = towupper(chr); - ioHeaderInfo.letters += upChar; - - ioHeaderInfo.points.push_back(_wtoi(tokens[1].c_str())); - ioHeaderInfo.frequency.push_back(_wtoi(tokens[2].c_str())); - ioHeaderInfo.vowels.push_back(_wtoi(tokens[3].c_str())); - ioHeaderInfo.consonants.push_back(_wtoi(tokens[4].c_str())); - - if (tokens.size() > 5) - { - vector inputs(tokens.begin() + 5, tokens.end()); - // Ensure the input strings are in upper case - BOOST_FOREACH(wstring &str, inputs) - { - std::transform(str.begin(), str.end(), str.begin(), towupper); - } - - // If the display string is identical to the internal char and if - // there is no other input, no need to save this information, as - // it is already the default. - if (inputs.size() != 1 || inputs[0] != wstring(1, upChar)) - { - ioHeaderInfo.displayInputData[upChar] = inputs; - } - } - - ++lineNb; - } -} - - -Header skip_init_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo) -{ - ioHeaderInfo.root = 0; - ioHeaderInfo.nwords = 0; - ioHeaderInfo.nodesused = 1; - ioHeaderInfo.edgesused = 1; - ioHeaderInfo.nodessaved = 0; - ioHeaderInfo.edgessaved = 0; - - Header aHeader(ioHeaderInfo); - aHeader.write(outfile); + // Go back to the beginning of the stream before writing the header + outFile.seekp(0, ios::beg); + Header aHeader(m_headerInfo); + aHeader.write(outFile); return aHeader; } -void fix_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo) -{ - ioHeaderInfo.root = ioHeaderInfo.edgesused; - // Go back to the beginning of the stream to overwrite the header - outfile.seekp(0, ios::beg); -#if defined(WORDS_BIGENDIAN) -#warning "**********************************************" -#warning "compdic does not run yet on bigendian machines" -#warning "**********************************************" -#else - Header aHeader(ioHeaderInfo); - aHeader.write(outfile); -#endif -} - - -// Change endianness of the pointed edges, and write them to the given ostream -void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile) +void CompDic::writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile) { // Handle endianness for (unsigned int i = 0; i < num; ++i) @@ -267,16 +196,13 @@ void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile) cout << fmt(_("writing %1% edges")) % num << endl; for (int i = 0; i < num; i++) { - outfile.write((char*)(ioEdges + i), sizeof(DicEdge)); + outFile.write((char*)(ioEdges + i), sizeof(DicEdge)); } #else - outfile.write((char*)ioEdges, num * sizeof(DicEdge)); + outFile.write((char*)ioEdges, num * sizeof(DicEdge)); #endif } -#define MAX_STRING_LENGTH 200 - - #define MAX_EDGES 2000 /* ods3: ?? 
*/ /* ods4: 1746 */ @@ -295,62 +221,24 @@ size_t hash_value(const DicEdge &iEdge) class IncDec { public: - IncDec(int &ioCounter) - : m_counter(ioCounter) - { - m_counter++; - } - - ~IncDec() - { - m_counter--; - } + IncDec(int &ioCounter) : m_counter(ioCounter) { ++m_counter; } + ~IncDec() { --m_counter; } private: int &m_counter; }; - -int current_rec = 0; -int max_rec = 0; #endif -typedef boost::unordered_map, unsigned int> HashMap; -/* global variables */ -HashMap global_hashmap; - -wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */ -wchar_t* global_endstring; /* Marks END of current string */ -const wchar_t* global_input; -const wchar_t* global_endofinput; -#ifdef CHECK_RECURSION -map > global_mapfordepth; -#endif - -/** - * Makenode takes a prefix (as position relative to stringbuf) and - * returns an index of the start node of a dawg that recognizes all - * words beginning with that prefix. String is a pointer (relative - * to stringbuf) indicating how much of iPrefix is matched in the - * input. - * @param iPrefix: prefix to work on - * @param outfile: stream where to write the nodes - * @param ioHeaderInfo: information needed to build the final header, updated - * during the processing - * @param iHeader: temporary header, used only to do the conversion between - * the (wide) chars and their corresponding internal code - */ -unsigned int makenode(const wchar_t *iPrefix, ostream &outfile, - DictHeaderInfo &ioHeaderInfo, const Header &iHeader) +unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile, + const Header &iHeader) { #ifdef CHECK_RECURSION - IncDec inc(current_rec); - if (current_rec > max_rec) - max_rec = current_rec; -#endif + IncDec inc(m_currentRec); + if (m_currentRec > m_maxRec) + m_maxRec = m_currentRec; -#ifdef CHECK_RECURSION // Instead of creating a vector, try to reuse an existing one - vector &edges = global_mapfordepth[current_rec]; + vector &edges = m_mapForDepth[m_currentRec]; edges.reserve(MAX_EDGES); edges.clear(); #else @@ -360,7 +248,7 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile, #endif DicEdge newEdge; - while (iPrefix == global_endstring) + while (iPrefix == m_endString) { // More edges out of node newEdge.ptr = 0; @@ -368,48 +256,47 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile, newEdge.last = 0; try { - newEdge.chr = iHeader.getCodeFromChar(*global_endstring++ = *global_input++); + newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++); } catch (DicException &e) { // If an invalid character is found, be specific about the problem ostringstream oss; oss << fmt(_("Error on line %1%, col %2%: %3%")) - % (1 + ioHeaderInfo.nwords) - % (global_endstring - global_stringbuf) + % (1 + m_headerInfo.nwords) + % (m_endString - m_stringBuf) % e.what() << endl; throw DicException(oss.str()); } edges.push_back(newEdge); // End of a word? - if (*global_input == L'\n' || *global_input == L'\r') + if (*m_input == L'\n' || *m_input == L'\r') { - ioHeaderInfo.nwords++; - *global_endstring = L'\0'; + m_headerInfo.nwords++; + *m_endString = L'\0'; // Mark edge as word edges.back().term = 1; // Skip \r and/or \n - while (global_input != global_endofinput && - (*global_input == L'\n' || *global_input == L'\r')) + while (m_input != m_endOfInput && + (*m_input == L'\n' || *m_input == L'\r')) { - ++global_input; + ++m_input; } // At the end of input? 
- if (global_input == global_endofinput) + if (m_input == m_endOfInput) break; - global_endstring = global_stringbuf; - while (*global_endstring == *global_input) + m_endString = m_stringBuf; + while (*m_endString == *m_input) { - global_endstring++; - global_input++; + m_endString++; + m_input++; } } // Make dawg pointed to by this edge - edges.back().ptr = - makenode(iPrefix + 1, outfile, ioHeaderInfo, iHeader); + edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader); } int numedges = edges.size(); @@ -422,212 +309,95 @@ unsigned int makenode(const wchar_t *iPrefix, ostream &outfile, // Mark the last edge edges.back().last = 1; - HashMap::const_iterator itMap = global_hashmap.find(edges); - if (itMap != global_hashmap.end()) + HashMap::const_iterator itMap = m_hashMap.find(edges); + if (itMap != m_hashMap.end()) { - ioHeaderInfo.edgessaved += numedges; - ioHeaderInfo.nodessaved++; + m_headerInfo.edgessaved += numedges; + m_headerInfo.nodessaved++; return itMap->second; } else { - unsigned int node_pos = ioHeaderInfo.edgesused; - global_hashmap[edges] = ioHeaderInfo.edgesused; - ioHeaderInfo.edgesused += numedges; - ioHeaderInfo.nodesused++; - write_node(reinterpret_cast(&edges.front()), - numedges, outfile); + unsigned int node_pos = m_headerInfo.edgesused; + m_hashMap[edges] = m_headerInfo.edgesused; + m_headerInfo.edgesused += numedges; + m_headerInfo.nodesused++; + writeNode(reinterpret_cast(&edges.front()), + numedges, outFile); return node_pos; } } -void printUsage(const string &iBinaryName) +Header CompDic::generateDawg(const string &iWordListFile, + const string &iDawgFile, + const string &iDicName) { - cout << "Usage: " << iBinaryName << " [options]" << endl - << _("Mandatory options:") << endl - << _(" -d, --dicname Set the dictionary name and version") << endl - << _(" -l, --letters Path to the file containing the letters (see below)") << endl - << _(" -i, --input Path to the uncompressed dictionary file (encoded in UTF-8)") << endl - << _(" The words must be in alphabetical order, without duplicates") << endl - << _(" -o, --output (tmpPtr), 1, outfile); + DicEdge *tmpPtr = &specialNode; + writeNode(reinterpret_cast(tmpPtr), 1, outFile); - /* - * Call makenode with null (relative to stringbuf) prefix; - * Initialize string to null; Put index of start node on output - */ - DicEdge rootnode = {0, 0, 0, 0}; - global_endstring = global_stringbuf; - clock_t startBuildTime = clock(); - rootnode.ptr = makenode(global_endstring, outfile, headerInfo, tempHeader); - clock_t endBuildTime = clock(); + // Call makeNode with null (relative to stringbuf) prefix; + // Initialize string to null; Put index of start node on output + DicEdge rootNode = {0, 0, 0, 0}; + m_endString = m_stringBuf; + const clock_t startBuildTime = clock(); + rootNode.ptr = makeNode(m_endString, outFile, tempHeader); // Reuse the temporary variable - tmpPtr = &rootnode; - write_node(reinterpret_cast(tmpPtr), 1, outfile); + tmpPtr = &rootNode; + writeNode(reinterpret_cast(tmpPtr), 1, outFile); + const clock_t endBuildTime = clock(); + m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC; - fix_header(outfile, headerInfo); + // Write the header again, now that it is complete + m_headerInfo.root = m_headerInfo.edgesused; + const Header finalHeader = writeHeader(outFile); - Header aHeader(headerInfo); - aHeader.print(); + // Clean up + delete[] wordList; + outFile.close(); - delete[] uncompressed; - outfile.close(); - - printf(_(" Load time: %.3f s\n"), 1.0 * (endLoadTime - startLoadTime) / 
CLOCKS_PER_SEC); - printf(_(" Compression time: %.3f s\n"), 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC); -#ifdef CHECK_RECURSION - cout << fmt(_(" Maximum recursion level reached: %1%")) % max_rec << endl; -#endif - return 0; + return finalHeader; } catch (std::exception &e) { - cerr << fmt(_("Exception caught: %1%")) % e.what() << endl; - return 1; + // Avoid memory leaks + if (wordList != NULL) + delete[] wordList; + throw; } } diff --git a/dic/compdic.h b/dic/compdic.h new file mode 100644 index 0000000..3bf2655 --- /dev/null +++ b/dic/compdic.h @@ -0,0 +1,150 @@ +/***************************************************************************** + * Eliot + * Copyright (C) 2005-2007 Antoine Fraboulet + * Authors: Antoine Fraboulet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + *****************************************************************************/ + +#ifndef DIC_COMPDIC_H_ +#define DIC_COMPDIC_H_ + +#include +#include +#include +#include + +#include "header.h" +#include "dic_internals.h" + +class DicEdge; +class DictHeaderInfo; +class Header; + +using namespace std; + +//#define DEBUG_OUTPUT +#define CHECK_RECURSION + + +class CompDic +{ + typedef boost::unordered_map, unsigned int> HashMap; + +public: + CompDic(); + + /** + * Define a new letter. The letter must be alphabetic (i.e. iswalpha() + * returns true for it). + * @param letter: Letter to addLetter + * @param points: Points of the letter + * @param frequency: Number of occurrences of the letter in the game + * @param isVowel: True if the letter can be considered as a vowel, + * false otherwise + * @param isConsonant: True if the letter can be considered as a consonant, + * false otherwise + * @param iInputs: Vector containing the various ways to input the letter. + * If not empty, the first value corresponds to the display string. + */ + void addLetter(wchar_t letter, int points, int frequency, + bool isVowel, bool isConsonant, + const vector &iInputs); + + /** + * Generate the dictionary. You must have called addLetter() before + * (once for each letter of the word list, and possible once for the + * joker). 
+ * @param iWordListFile: Name (and path) of the word list file + * @param iDawgFile: Name (and path) of the generated dawg file + * @param iDicName: Internal name of the dictionary + * @return The header of the generated dawg + */ + Header generateDawg(const string &iWordListFile, + const string &iDawgFile, + const string &iDicName); + + // Statistics + double getLoadTime() const { return m_loadTime; } + double getBuildTime() const { return m_buildTime; } +#ifdef CHECK_RECURSION + double getMaxRecursion() const { return m_maxRec; } +#endif + +private: + DictHeaderInfo m_headerInfo; + + HashMap m_hashMap; + +#define MAX_STRING_LENGTH 200 + + /// Space for the current string + wchar_t m_stringBuf[MAX_STRING_LENGTH]; + /// Point to the end of the string + wchar_t* m_endString; + /// Current position in the word list + const wchar_t *m_input; + /// Mark the end of the input + const wchar_t *m_endOfInput; +#ifdef CHECK_RECURSION + map > m_mapForDepth; + int m_currentRec; + int m_maxRec; +#endif + + double m_loadTime; + double m_buildTime; + + + /** + * Read the word list stored in iFileName, convert it to wide chars, + * and return it. The oDicSize parameter contains the size of the + * returned array. + * In case of problem, an exception is thrown. + * @param iFileName: Name (and path) of the file containing the word list. + * @param oDicSize: Size of the returned array + * @return Word list as a wchar_t array + */ + const wchar_t * loadWordList(const string &iFileName, + unsigned int &oDicSize); + + Header writeHeader(ostream &outFile) const; + + /** + * Change the endianness of the pointed edges (if needed), + * and write them to the given ostream. + * @param ioEdges: array of edges + * @param num: number of edges in the array + * @param outFile: stream where to write the edges + */ + void writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile); + + /** + * MakeNode takes a prefix (as position relative to m_stringBuf) and + * returns the index of the start node of a dawg that recognizes all + * the words beginning with that prefix. String is a pointer (relative + * to m_stringBuf) indicating how much of iPrefix is matched in the + * input. + * @param iPrefix: prefix to work on + * @param outfile: stream where to write the nodes + * @param iHeader: temporary header, used only to do the conversion between + * the (wide) chars and their corresponding internal code + */ + unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile, + const Header &iHeader); + +}; + +#endif /* DIC_COMPDIC_H_ */ diff --git a/dic/compdicmain.cpp b/dic/compdicmain.cpp new file mode 100644 index 0000000..1722d23 --- /dev/null +++ b/dic/compdicmain.cpp @@ -0,0 +1,261 @@ +/***************************************************************************** + * Eliot + * Copyright (C) 1999-2007 Antoine Fraboulet & Olivier Teulière + * Authors: Antoine Fraboulet + * Olivier Teulière + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + *****************************************************************************/ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if ENABLE_NLS +# include +# define _(String) gettext(String) +#else +# define _(String) String +#endif +#ifdef WIN32 +# include +#endif + +#include "compdic.h" +#include "dic_exception.h" +#include "encoding.h" +#include "header.h" + +using namespace std; + +// Useful shortcut +#define fmt(a) boost::format(a) + + +void readLetters(const string &iFileName, CompDic &ioBuilder) +{ + ifstream in(iFileName.c_str()); + if (!in.is_open()) + throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str()); + + // Use a more friendly type name + typedef boost::tokenizer, + std::wstring::const_iterator, + std::wstring> Tokenizer; + + int lineNb = 1; + string line; + while (getline(in, line)) + { + // Ignore empty lines + if (line == "" || line == "\r" || line == "\n") + continue; + + // Convert the line to a wstring + const wstring &wline = + readFromUTF8(line.c_str(), line.size(), "readLetters (1)"); + // Split the lines on space characters + boost::char_separator sep(L" "); + Tokenizer tok(wline, sep); + Tokenizer::iterator it; + vector tokens(tok.begin(), tok.end()); + + // We expect at least 5 fields on the line + if (tokens.size() < 5) + { + ostringstream ss; + ss << fmt(_("readLetters: Not enough fields " + "in %1% (line %2%)")) % iFileName % lineNb; + throw DicException(ss.str()); + } + + // The first field is a single character + wstring letter = tokens[0]; + if (letter.size() != 1) + { + ostringstream ss; + ss << fmt(_("readLetters: Invalid letter at line %1% " + "(only one character allowed)")) % lineNb; + throw DicException(ss.str()); + } + + vector inputs; + if (tokens.size() > 5) + { + inputs = vector(tokens.begin() + 5, tokens.end()); + } + ioBuilder.addLetter(letter[0], _wtoi(tokens[1].c_str()), + _wtoi(tokens[2].c_str()), _wtoi(tokens[3].c_str()), + _wtoi(tokens[4].c_str()), inputs); + + ++lineNb; + } +} + + +void printUsage(const string &iBinaryName) +{ + cout << "Usage: " << iBinaryName << " [options]" << endl + << _("Mandatory options:") << endl + << _(" -d, --dicname Set the dictionary name and version") << endl + << _(" -l, --letters Path to the file containing the letters (see below)") << endl + << _(" -i, --input Path to the uncompressed dictionary file (encoded in UTF-8)") << endl + << _(" The words must be in alphabetical order, without duplicates") << endl + << _(" -o, --output
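
As a rough illustration of how the pieces of this refactoring fit together, here is a minimal driver for the new CompDic class, sketched only from the interface declared in compdic.h and from the old code paths moved into generateDawg(). It is not part of the patch: the file names, dictionary name, and tile values are invented, the iInputs parameter is taken to be a vector of wstring, and the real compdicmain.cpp obtains all of this from command-line options and from the letters file via readLetters() instead of hard-coding it.

    // Illustrative sketch only -- not part of the patch.
    #include <iostream>
    #include <string>
    #include <vector>

    #include "compdic.h"
    #include "dic_exception.h"

    int main()
    {
        try
        {
            CompDic builder;

            // Define the tiles. In the real tool these values come from the
            // letters file (parsed by readLetters()); the ones below are made up.
            // addLetter(char, points, frequency, isVowel, isConsonant, inputs)
            builder.addLetter(L'A', 1, 9, true, false, std::vector<std::wstring>());
            builder.addLetter(L'B', 3, 2, false, true, std::vector<std::wstring>());
            builder.addLetter(L'?', 0, 2, true, true, std::vector<std::wstring>());

            // Compress the word list into a DAWG and write it to disk.
            const Header header = builder.generateDawg("words.txt", "output.dawg",
                                                       "Example dictionary 1.0");
            header.print();

            std::cout << "Load time: " << builder.getLoadTime() << " s" << std::endl;
            std::cout << "Build time: " << builder.getBuildTime() << " s" << std::endl;
            return 0;
        }
        catch (std::exception &e)
        {
            std::cerr << "Exception caught: " << e.what() << std::endl;
            return 1;
        }
    }

Compared with the old monolithic compdic.cpp, the state that used to live in globals (the hash map of shared nodes, the string buffer, the header counters) is now carried by the CompDic instance, so a single process can build several dictionaries by using one builder per dictionary.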