/***************************************************************************** * Eliot * Copyright (C) 1999-2007 Antoine Fraboulet & Olivier Teulière * Authors: Antoine Fraboulet * Olivier Teulière * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // For htonl & Co. 
#ifdef WIN32 # include #else # if HAVE_NETINET_IN_H # include # endif # if HAVE_ARPA_INET_H # include # endif #endif #if ENABLE_NLS # include # define _(String) gettext(String) #else # define _(String) String #endif #ifdef WIN32 # include #endif #include "encoding.h" #include "header.h" #include "dic_internals.h" #include "dic_exception.h" using namespace std; //#define DEBUG_OUTPUT #define CHECK_RECURSION unsigned int getFileSize(const string &iFileName) { struct stat stat_buf; if (stat(iFileName.c_str(), &stat_buf) < 0) throw DicException(_("Cannot stat file ") + iFileName); return (unsigned int)stat_buf.st_size; } const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize) { ifstream file(iFileName.c_str(), ios::in | ios::binary); if (!file.is_open()) throw DicException("Could not open file " + iFileName); // Place the buffer in a vector to avoid worrying about memory handling vector buffer(ioDicSize); // Load the file data, everything in one shot file.read(&buffer.front(), ioDicSize); file.close(); // Buffer for the wide characters (it will use at most as many characters // as the utf-8 version) wchar_t *wideBuf = new wchar_t[ioDicSize]; try { unsigned int number = readFromUTF8(wideBuf, ioDicSize, &buffer.front(), ioDicSize, "load_uncompressed"); ioDicSize = number; return wideBuf; } catch (...) 
{ // Avoid leaks, and propagate the exception delete[] wideBuf; throw; } } void readLetters(const string &iFileName, DictHeaderInfo &ioHeaderInfo) { ifstream in(iFileName.c_str()); if (!in.is_open()) throw DicException("Could not open file " + string(iFileName)); // Use a more friendly type name typedef boost::tokenizer, std::wstring::const_iterator, std::wstring> Tokenizer; int lineNb = 1; string line; while (getline(in, line)) { // Ignore empty lines if (line == "" || line == "\r" || line == "\n") continue; // Convert the line to a wstring const wstring &wline = readFromUTF8(line.c_str(), line.size(), "readLetters (1)"); // Split the lines on space characters boost::char_separator sep(L" "); Tokenizer tok(wline, sep); Tokenizer::iterator it; vector tokens(tok.begin(), tok.end()); // We expect at least 5 fields on the line if (tokens.size() < 5) { ostringstream ss; ss << "readLetters: Not enough fields in " << iFileName; ss << " (line " << lineNb << ")"; throw DicException(ss.str()); } // The first field is a single character wstring letter = tokens[0]; if (letter.size() != 1) { // On the first line, there could be the BOM... 
if (lineNb == 1 && tokens[0].size() > 3 && (uint8_t)tokens[0][0] == 0xEF && (uint8_t)tokens[0][1] == 0xBB && (uint8_t)tokens[0][2] == 0xBF) { // BOM detected, remove the first char in the wide string letter.erase(0, 1); } else { ostringstream ss; ss << "readLetters: Invalid letter at line " << lineNb; ss << " (only one character allowed)"; throw DicException(ss.str()); } } wchar_t upChar = towupper(letter[0]); ioHeaderInfo.letters += upChar; ioHeaderInfo.points.push_back(_wtoi(tokens[1].c_str())); ioHeaderInfo.frequency.push_back(_wtoi(tokens[2].c_str())); ioHeaderInfo.vowels.push_back(_wtoi(tokens[3].c_str())); ioHeaderInfo.consonants.push_back(_wtoi(tokens[4].c_str())); if (tokens.size() > 5) { vector inputs(tokens.begin() + 5, tokens.end()); // Ensure the input strings are in upper case BOOST_FOREACH(wstring &str, inputs) { std::transform(str.begin(), str.end(), str.begin(), towupper); } // If the display stirng is identical to the internal char and if // there is no other input, no need to save this information, as // it is already the default. 
if (inputs.size() != 1 || inputs[0] != wstring(1, upChar)) { ioHeaderInfo.displayInputData[upChar] = inputs; } } ++lineNb; } } Header skip_init_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo) { ioHeaderInfo.root = 0; ioHeaderInfo.nwords = 0; ioHeaderInfo.nodesused = 1; ioHeaderInfo.edgesused = 1; ioHeaderInfo.nodessaved = 0; ioHeaderInfo.edgessaved = 0; Header aHeader(ioHeaderInfo); aHeader.write(outfile); return aHeader; } void fix_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo) { ioHeaderInfo.root = ioHeaderInfo.edgesused; // Go back to the beginning of the stream to overwrite the header outfile.seekp(0, ios::beg); #if defined(WORDS_BIGENDIAN) #warning "**********************************************" #warning "compdic does not run yet on bigendian machines" #warning "**********************************************" #else Header aHeader(ioHeaderInfo); aHeader.write(outfile); #endif } // Change endianness of the pointed edges, and write them to the given ostream void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile) { // Handle endianness for (unsigned int i = 0; i < num; ++i) { ioEdges[i] = htonl(ioEdges[i]); } #ifdef DEBUG_OUTPUT printf("writing %d edges\n", num); for (int i = 0; i < num; i++) { outfile.write((char*)(ioEdges + i), sizeof(DicEdge)); } #else outfile.write((char*)ioEdges, num * sizeof(DicEdge)); #endif } #define MAX_STRING_LENGTH 200 #define MAX_EDGES 2000 /* ods3: ?? 
*/ /* ods4: 1746 */

// NOTE(review): in the text under review, the contents of angle brackets
// appear to have been stripped from this part of the file (template
// arguments of reinterpret_cast, vector, map and boost::unordered_map, and a
// span of code between printUsage() and the end of main()). The code tokens
// below are kept exactly as found; restore the missing parts from the
// original file.

// Hashing function for a vector of DicEdge, based on the hashing function
// of the HashTable.
// A single edge is hashed by reading it through a uint32_t pointer and
// combining that value into a zero seed with boost::hash_combine.
size_t hash_value(const DicEdge &iEdge)
{
    const uint32_t *num = reinterpret_cast(&iEdge);
    size_t seed = 0;
    boost::hash_combine(seed, *num);
    return seed;
}

#ifdef CHECK_RECURSION
// Helper incrementing the referenced counter when constructed and
// decrementing it when destroyed. makenode() creates one per call to keep
// current_rec equal to the current recursion depth.
class IncDec
{
    public:
        IncDec(int &ioCounter)
            : m_counter(ioCounter)
        {
            m_counter++;
        }

        ~IncDec()
        {
            m_counter--;
        }
    private:
        // Counter owned by the caller
        int &m_counter;
};

// Current recursion depth of makenode()
int current_rec = 0;
// Highest value reached by current_rec (printed at the end of main)
int max_rec = 0;
#endif

// Maps an edge list to the position at which it was written, so that
// makenode() can reuse a node instead of writing an identical one again
typedef boost::unordered_map, unsigned int> HashMap;

/* global variables */
HashMap global_hashmap;

wchar_t  global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
wchar_t* global_endstring;                    /* Marks END of current string */
const wchar_t* global_input;                  /* Next character to read */
const wchar_t* global_endofinput;             /* End of the input data */
#ifdef CHECK_RECURSION
// One reusable edge list per recursion depth (indexed by current_rec)
map > global_mapfordepth;
#endif

/**
 * Makenode takes a prefix (as position relative to stringbuf) and
 * returns an index of the start node of a dawg that recognizes all
 * words beginning with that prefix. String is a pointer (relative
 * to stringbuf) indicating how much of iPrefix is matched in the
 * input.
 *
 * The function works on the global cursors (global_input,
 * global_endstring) and calls itself for each edge it creates. A node
 * whose edge list was already seen is not written again: the position
 * recorded in global_hashmap is returned instead.
 *
 * @param iPrefix: prefix to work on
 * @param outfile: stream where to write the nodes
 * @param ioHeaderInfo: information needed to build the final header, updated
 *     during the processing
 * @param iHeader: temporary header, used only to do the conversion between
 *     the (wide) chars and their corresponding internal code
 * @return 0 for a node without edges, otherwise the position of the node
 *     (value of ioHeaderInfo.edgesused when it was written)
 * @throw DicException when a character of the input cannot be converted;
 *     the message holds the line and column of the error
 */
unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
                      DictHeaderInfo &ioHeaderInfo, const Header &iHeader)
{
#ifdef CHECK_RECURSION
    // Track the recursion depth, and remember the deepest level reached
    IncDec inc(current_rec);
    if (current_rec > max_rec)
        max_rec = current_rec;
#endif

#ifdef CHECK_RECURSION
    // Instead of creating a vector, try to reuse an existing one
    vector &edges = global_mapfordepth[current_rec];
    edges.reserve(MAX_EDGES);
    edges.clear();
#else
    vector edges;
    // Optimize allocation
    edges.reserve(MAX_EDGES);
#endif
    DicEdge newEdge;

    // Loop as long as the part of the current string matched in the input
    // ends exactly at this node's prefix position
    while (iPrefix == global_endstring)
    {
        // More edges out of node
        newEdge.ptr  = 0;
        newEdge.term = 0;
        newEdge.last = 0;
        try
        {
            // Copy the next input character at the end of the current
            // string, advance both cursors, and convert the character to
            // its internal code.
            // NOTE(review): nothing here checks global_endstring against
            // the end of global_stringbuf (MAX_STRING_LENGTH entries).
            newEdge.chr = iHeader.getCodeFromChar(*global_endstring++ = *global_input++);
        }
        catch (DicException &e)
        {
            // If an invalid character is found, be specific about the problem
            ostringstream oss;
            oss << "Error on line " << 1 + ioHeaderInfo.nwords
                << ", col " << global_endstring - global_stringbuf
                << ": " << e.what() << endl;
            throw DicException(oss.str());
        }
        edges.push_back(newEdge);

        // End of a word?
        // NOTE(review): *global_input is read here before being compared
        // with global_endofinput - presumably the input always ends with a
        // line break; confirm.
        if (*global_input == L'\n' || *global_input == L'\r')
        {
            ioHeaderInfo.nwords++;
            *global_endstring = L'\0';
            // Mark edge as word
            edges.back().term = 1;

            // Skip \r and/or \n
            while (global_input != global_endofinput &&
                   (*global_input == L'\n' || *global_input == L'\r'))
            {
                ++global_input;
            }
            // At the end of input?
            // (the last edge then keeps a null pointer)
            if (global_input == global_endofinput)
                break;

            // Rewind to the start of the previous word, still stored in
            // the buffer, and advance both cursors over the characters it
            // shares with the next word
            global_endstring = global_stringbuf;
            while (*global_endstring == *global_input)
            {
                global_endstring++;
                global_input++;
            }
        }
        // Make dawg pointed to by this edge
        edges.back().ptr =
            makenode(iPrefix + 1, outfile, ioHeaderInfo, iHeader);
    }

    int numedges = edges.size();
    if (numedges == 0)
    {
        // Special node zero - no edges
        return 0;
    }

    // Mark the last edge
    edges.back().last = 1;

    // Look for an identical edge list among the nodes already written
    HashMap::const_iterator itMap = global_hashmap.find(edges);
    if (itMap != global_hashmap.end())
    {
        // Found: nothing is written, only the statistics are updated
        ioHeaderInfo.edgessaved += numedges;
        ioHeaderInfo.nodessaved++;

        return itMap->second;
    }
    else
    {
        // New node: remember its position, then write it. The position is
        // saved first because write_node() converts the edges in place.
        unsigned int node_pos = ioHeaderInfo.edgesused;
        global_hashmap[edges] = ioHeaderInfo.edgesused;
        ioHeaderInfo.edgesused += numedges;
        ioHeaderInfo.nodesused++;
        write_node(reinterpret_cast(&edges.front()),
                   numedges, outfile);

        return node_pos;
    }
}


// Print the command line help on the standard output.
// NOTE(review): the text under review is truncated in the middle of the
// "--output" line below; the end of this function and the beginning of
// main() (option parsing, loading of the input, creation of outfile,
// headerInfo, tempHeader, tmpPtr, start of the try block) are missing and
// must be restored from the original file.
void printUsage(const string &iBinaryName)
{
    cout << "Usage: " << iBinaryName << " [options]" << endl
         << _("Mandatory options:") << endl
         << _(" -d, --dicname Set the dictionary name and version") << endl
         << _(" -l, --letters Path to the file containing the letters (see below)") << endl
         << _(" -i, --input Path to the uncompressed dictionary file (encoded in UTF-8)") << endl
         << _(" The words must be in alphabetical order, without duplicates") << endl
         << _(" -o, --output (tmpPtr), 1, outfile);

        /*
         * Call makenode with null (relative to stringbuf) prefix;
         * Initialize string to null; Put index of start node on output
         */
        DicEdge rootnode = {0, 0, 0, 0};
        global_endstring = global_stringbuf;
        // Time the construction of the graph
        clock_t startBuildTime = clock();
        rootnode.ptr = makenode(global_endstring, outfile, headerInfo, tempHeader);
        clock_t endBuildTime = clock();
        // Reuse the temporary variable
        tmpPtr = &rootnode;
        // The root edge is written last, after all the other nodes
        write_node(reinterpret_cast(tmpPtr), 1, outfile);

        // Overwrite the header written earlier with the final values
        fix_header(outfile, headerInfo);

        Header aHeader(headerInfo);
        aHeader.print();

        delete[] uncompressed;
        outfile.close();

        printf(_(" Load time: %.3f s\n"),
               1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC);
        printf(_(" Compression time: %.3f s\n"),
               1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC);
#ifdef CHECK_RECURSION
        printf(_(" Maximum recursion level reached: %d\n"), max_rec);
#endif
        return 0;
    }
    catch (std::exception &e)
    {
        // Print the error message and exit with a failure code
        cerr << e.what() << endl;
        return 1;
    }
}