/***************************************************************************** * Eliot * Copyright (C) 1999-2007 Antoine Fraboulet & Olivier Teulière * Authors: Antoine Fraboulet * Olivier Teulière * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // For htonl & Co. 
// NOTE(review): this text appears to have lost everything that was written
// between '<' and '>' (header names after "# include", template arguments
// such as the element type of "vector", the target type of
// "reinterpret_cast", and a whole section between printUsage() and the middle
// of main()). The code below is kept exactly as found; restore the missing
// parts from the upstream file before trying to compile it.

// Platform headers providing htonl() (header names missing, see note above)
#ifdef WIN32
# include
#else
# if HAVE_NETINET_IN_H
# include
# endif
# if HAVE_ARPA_INET_H
# include
# endif
#endif

// _() marks a user-visible string for translation; without NLS support it
// expands to the string itself
#if ENABLE_NLS
# include
# define _(String) gettext(String)
#else
# define _(String) String
#endif

#ifdef WIN32
# include
#endif

#include "hashtable.h"
#include "encoding.h"
#include "header.h"
#include "dic_internals.h"
#include "dic_exception.h"

using namespace std;

// Compile-time switches: the three debug ones are disabled, recursion
// tracking (see IncDec, current_rec and max_rec below) is enabled
//#define DEBUG_LIST
//#define DEBUG_OUTPUT
//#define DEBUG_OUTPUT_L2
#define CHECK_RECURSION


/**
 * Load the word list file in memory and convert it to wide characters.
 * @param iFileName: path of the file to read
 * @param ioDicSize: in input, number of bytes to read from the file;
 *      in output, the value returned by readFromUTF8() (presumably the
 *      number of wide characters written -- confirm in encoding.h)
 * @return a buffer allocated with new[]; it is not released here, so the
 *      caller has to delete[] it
 * @throw DicException if the file cannot be opened. Any exception raised by
 *      readFromUTF8() is propagated after the wide buffer has been freed
 */
const wchar_t* load_uncompressed(const string &iFileName, unsigned int &ioDicSize)
{
    // NOTE(review): the stream is opened in text mode (no ios::binary), and
    // the result of read() is not checked -- confirm this is intended
    ifstream file(iFileName.c_str());
    if (!file.is_open())
        throw DicException("Could not open file " + iFileName);

    // Place the buffer in a vector to avoid worrying about memory handling
    vector buffer(ioDicSize);
    // Load the file data, everything in one shot
    // NOTE(review): front() is called even when ioDicSize is 0
    file.read(&buffer.front(), ioDicSize);
    file.close();

    // Buffer for the wide characters (it will use at most as many characters
    // as the utf-8 version)
    wchar_t *wideBuf = new wchar_t[ioDicSize];
    unsigned int number;

    try
    {
        number = readFromUTF8(wideBuf, ioDicSize, &buffer.front(), ioDicSize, "load_uncompressed");
        ioDicSize = number;
        return wideBuf;
    }
    catch (...)
    {
        // Avoid leaks, and propagate the exception
        delete[] wideBuf;
        throw;
    }
}


/**
 * Read the letters description file and append its contents to the header
 * information.
 * Every non-empty line must hold exactly 5 space-separated fields: a letter
 * (at most 4 bytes of UTF-8) followed by 4 integers, which are appended to
 * the points, frequency, vowels and consonants vectors, in this order.
 * The letter itself is appended in upper case to ioHeaderInfo.letters.
 * @param iFileName: path of the letters file
 * @param ioHeaderInfo: structure to fill
 * @throw DicException if the file cannot be opened, if a line does not have
 *      the expected layout, or if the first field is not a single letter
 */
void readLetters(const char *iFileName, DictHeaderInfo &ioHeaderInfo)
{
    ifstream in(iFileName);
    if (!in.is_open())
        throw DicException("Could not open file " + string(iFileName));

    // Use a more friendly type name
    typedef boost::tokenizer > Tokenizer;

    // Line number used in the error messages.
    // NOTE(review): it is only incremented at the end of the loop body, so
    // the empty lines skipped with "continue" are not counted and the
    // reported number can be lower than the real line number
    int lineNb = 1;
    string line;
    while (getline(in, line))
    {
        // Ignore empty lines
        if (line == "" || line == "\r" || line == "\n")
            continue;

        // Split the lines on space characters
        vector tokens;
        boost::char_separator sep(" ");
        Tokenizer tok(line, sep);
        Tokenizer::iterator it;
        for (it = tok.begin(); it != tok.end(); ++it)
        {
            tokens.push_back(*it);
        }

        // We expect 5 fields on the line, and the first one is a letter, so
        // it cannot exceed 4 bytes
        if (tokens.size() != 5 || tokens[0].size() > 4)
        {
            ostringstream ss;
            ss << "readLetters: Invalid line in " << iFileName;
            ss << " (line " << lineNb << ")";
            throw DicException(ss.str());
        }

#define MAX_SIZE 4
        // strncpy() does not add a terminating null character when the token
        // is MAX_SIZE bytes long; this is why the real length is given
        // explicitly to readFromUTF8()
        char buff[MAX_SIZE];
        strncpy(buff, tokens[0].c_str(), MAX_SIZE);

        wstring letter = readFromUTF8(buff, tokens[0].size(), "readLetters");

        if (letter.size() != 1)
        {
            // On the first line, there could be the BOM...
            // (0xEF 0xBB 0xBF is the UTF-8 encoding of the byte order mark)
            if (lineNb == 1 && tokens[0].size() > 3 && (uint8_t)tokens[0][0] == 0xEF && (uint8_t)tokens[0][1] == 0xBB && (uint8_t)tokens[0][2] == 0xBF)
            {
                // BOM detected, remove the first char in the wide string
                // NOTE(review): the size of the string is not checked again
                // after this removal
                letter.erase(0, 1);
            }
            else
            {
                ostringstream ss;
                ss << "readLetters: Invalid letter at line " << lineNb;
                throw DicException(ss.str());
            }
        }
#undef MAX_SIZE

        ioHeaderInfo.letters += towupper(letter[0]);
        ioHeaderInfo.points.push_back(atoi(tokens[1].c_str()));
        ioHeaderInfo.frequency.push_back(atoi(tokens[2].c_str()));
        ioHeaderInfo.vowels.push_back(atoi(tokens[3].c_str()));
        ioHeaderInfo.consonants.push_back(atoi(tokens[4].c_str()));

        ++lineNb;
    }
}


/**
 * Reset the counters of the header information and write a first version of
 * the header to the stream. This header is overwritten by fix_header() once
 * the final values are known.
 * @param outfile: stream where to write the header
 * @param ioHeaderInfo: header information; the root, nwords, nodesused,
 *      edgesused, nodessaved and edgessaved fields are reset
 * @return the header which was written
 */
Header skip_init_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
{
    ioHeaderInfo.root = 0;
    ioHeaderInfo.nwords = 0;
    // Start the counts at 1 (presumably to account for the special node
    // written by main() before the first call to makenode())
    ioHeaderInfo.nodesused = 1;
    ioHeaderInfo.edgesused = 1;
    ioHeaderInfo.nodessaved = 0;
    ioHeaderInfo.edgessaved = 0;

    Header aHeader(ioHeaderInfo);
    aHeader.write(outfile);
    return aHeader;
}


/**
 * Write the final header at the beginning of the stream.
 * The root is set to the current number of used edges (main() writes the
 * root edge just before calling this function, after all the other edges).
 * On big endian machines, nothing is written and warnings are emitted at
 * compile time.
 * @param outfile: stream where to write the header
 * @param ioHeaderInfo: header information; the root field is updated
 */
void fix_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
{
    ioHeaderInfo.root = ioHeaderInfo.edgesused;
    // Go back to the beginning of the stream to overwrite the header
    outfile.seekp(0, ios::beg);
#if defined(WORDS_BIGENDIAN)
#warning "**********************************************"
#warning "compdic does not run yet on bigendian machines"
#warning "**********************************************"
#else
    Header aHeader(ioHeaderInfo);
    aHeader.write(outfile);
#endif
}


// Change endianness of the pointed edges, and write them to the given ostream.
// The conversion is done in place: the array given by the caller is modified.
// Each edge is handled as one uint32_t but written as sizeof(DicEdge) bytes,
// which assumes that both sizes are equal (TODO confirm in dic_internals.h)
void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile)
{
    // Handle endianness
    for (unsigned int i = 0; i < num; ++i)
    {
        ioEdges[i] = htonl(ioEdges[i]);
    }

#ifdef DEBUG_OUTPUT
    // NOTE(review): this branch is disabled by default. It reads DicEdge
    // fields through a uint32_t pointer (and after the endianness
    // conversion), so it presumably does not build anymore when
    // DEBUG_OUTPUT_L2 is defined -- confirm
    printf("writing %d edges\n", num);
    for (int i = 0; i < num; i++)
    {
#ifdef DEBUG_OUTPUT_L2
        printf("ptr=%2d t=%d l=%d chr=%2d (%c)\n", ioEdges[i].ptr, ioEdges[i].term, ioEdges[i].last, ioEdges[i].chr, ioEdges[i].chr -1 +'a');
#endif
        outfile.write((char*)(ioEdges + i), sizeof(DicEdge));
    }
#else
    // Write all the edges of the node in one shot
    outfile.write((char*)ioEdges, num * sizeof(DicEdge));
#endif
}

// Size of the buffer holding the word being processed (global_stringbuf)
#define MAX_STRING_LENGTH 200
// Capacity reserved for the edges vector of each makenode() call
#define MAX_EDGES 2000 /* ods3: ?? */ /* ods4: 1746 */

// Hashing function for a vector of DicEdge, based on the hashing function
// of the HashTable
struct HashVector
{
    unsigned int operator()(const vector &iKey) const
    {
        // front() cannot be used on an empty vector
        if (iKey.empty())
            return 0;
        // Hash the raw bytes of the edges
        return HashPtr(&iKey.front(), iKey.size() * sizeof(DicEdge));
    }
};

#ifdef CHECK_RECURSION
// Helper incrementing the given counter when it is created and decrementing
// it when it is destroyed. makenode() uses it to keep track of the current
// recursion depth, whatever the way the function is left.
class IncDec
{
public:
    IncDec(int &ioCounter) : m_counter(ioCounter) { m_counter++; }
    ~IncDec() { m_counter--; }
private:
    int &m_counter;
};

// Current recursion depth of makenode(), and maximum depth reached so far
int current_rec = 0;
int max_rec = 0;
#endif

/* global variables */
// Nodes (vectors of edges) already written, with the value of
// ioHeaderInfo.edgesused at the time they were added
HashTable, unsigned int, HashVector> *global_hashtable;

wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
wchar_t* global_endstring; /* Marks END of current string */
// Current reading position in the input, and end of the input
// (neither of them is assigned in the code visible here)
const wchar_t* global_input;
const wchar_t* global_endofinput;
#ifdef CHECK_RECURSION
// Vector of edges to use for each recursion depth of makenode(), so that
// the allocated memory can be reused from one call to the other
map > global_mapfordepth;
#endif

/**
 * Makenode takes a prefix (as position relative to stringbuf) and
 * returns an index of the start node of a dawg that recognizes all
 * words beginning with that prefix. String is a pointer (relative
 * to stringbuf) indicating how much of iPrefix is matched in the
 * input.
 * @param iPrefix: prefix to work on
 * @param outfile: stream where to write the nodes
 * @param ioHeaderInfo: information needed to build the final header, updated
 *      during the processing
 * @param iHeader: temporary header, used only to do the conversion between
 *      the (wide) chars and their corresponding internal code
 * @return 0 if the node has no edge, the position of an identical node found
 *      in the hash table if there is one, or else the value of
 *      ioHeaderInfo.edgesused before the edges of the new node are counted
 */
unsigned int makenode(const wchar_t *iPrefix, ostream &outfile, DictHeaderInfo &ioHeaderInfo, const Header &iHeader)
{
#ifdef CHECK_RECURSION
    // Keep track of the recursion depth (decremented automatically when
    // leaving the function)
    IncDec inc(current_rec);
    if (current_rec > max_rec)
        max_rec = current_rec;
#endif

#ifdef CHECK_RECURSION
    // Instead of creating a vector, try to reuse an existing one
    vector &edges = global_mapfordepth[current_rec];
    edges.reserve(MAX_EDGES);
    edges.clear();
#else
    vector edges;
    // Optimize allocation
    edges.reserve(MAX_EDGES);
#endif
    DicEdge newEdge;

    // Loop as long as the part of the current word matched in the input
    // stops exactly at the prefix handled by this call
    while (iPrefix == global_endstring)
    {
        // More edges out of node
        newEdge.ptr = 0;
        newEdge.term = 0;
        newEdge.last = 0;
        // Copy the next input character at the end of the current string
        // (advancing both pointers), and use its internal code for the edge.
        // NOTE(review): nothing here checks that the current string stays
        // within the MAX_STRING_LENGTH characters of global_stringbuf
        newEdge.chr = iHeader.getCodeFromChar(*global_endstring++ = *global_input++);
        edges.push_back(newEdge);

        // End of a word?
        // NOTE(review): global_input is dereferenced here (and in the
        // comparison loop below) without being compared to
        // global_endofinput; presumably the input always ends with a
        // newline -- confirm
        if (*global_input == L'\n' || *global_input == L'\r')
        {
            ioHeaderInfo.nwords++;
            *global_endstring = L'\0';
            // Mark edge as word
            edges.back().term = 1;

            // Skip \r and/or \n
            while (global_input != global_endofinput && (*global_input == L'\n' || *global_input == L'\r'))
            {
                ++global_input;
            }
            // At the end of input?
            if (global_input == global_endofinput)
                break;

            // Find how many characters the next word has in common with the
            // one which was just completed
            global_endstring = global_stringbuf;
            while (*global_endstring == *global_input)
            {
                global_endstring++;
                global_input++;
            }
        }
        // Make dawg pointed to by this edge
        edges.back().ptr = makenode(iPrefix + 1, outfile, ioHeaderInfo, iHeader);
    }

    int numedges = edges.size();
    if (numedges == 0)
    {
        // Special node zero - no edges
        return 0;
    }

    // Mark the last edge
    edges.back().last = 1;

    // Look for an identical node among the ones already written
    const unsigned int *saved_position = global_hashtable->find(edges);
    if (saved_position)
    {
        // Nothing to write: share the existing node and update the statistics
        ioHeaderInfo.edgessaved += numedges;
        ioHeaderInfo.nodessaved++;

        return *saved_position;
    }
    else
    {
        // New node: remember its position, then write it.
        // The node must be added to the hash table before the call to
        // write_node(), which changes the edges in place.
        unsigned int node_pos = ioHeaderInfo.edgesused;
        global_hashtable->add(edges, ioHeaderInfo.edgesused);
        ioHeaderInfo.edgesused += numedges;
        ioHeaderInfo.nodesused++;
        write_node(reinterpret_cast(&edges.front()), numedges, outfile);

        return node_pos;
    }
}


// Print the usage of the program on the standard output.
// NOTE(review): from the "--output" line onwards a part of the text is
// missing. The end of this function and the beginning of main() (which
// presumably parses the options, loads the input, opens outfile and creates
// global_hashtable) cannot be recovered from what is left, and the
// incomplete line is kept untouched.
void printUsage(const string &iBinaryName)
{
    cout << "Usage: " << iBinaryName << " [options]" << endl
         << _("Mandatory options:") << endl
         << _(" -d, --dicname Set the dictionary name and version") << endl
         << _(" -l, --letters Path to the file containing the letters (see below)") << endl
         << _(" -i, --input Path to the uncompressed dictionary file (encoded in UTF-8)") << endl
         << _(" The words must be in alphabetical order, without duplicates") << endl
         << _(" -o, --output , unsigned int, HashVector>((unsigned int)(dicsize * SCALE));
#undef SCALE

        // What follows is the end of main(), inside a try block whose
        // beginning is not visible (see the note above)

        headerInfo.dawg = true;
        // Write a first version of the header; it is only overwritten with
        // the final values by fix_header() at the end
        Header tempHeader = skip_init_header(outfile, headerInfo);

        // The first edge written to the file is a special one, with all the
        // fields set to 0 except the "last" flag
        DicEdge specialnode = {0, 0, 0, 0};
        specialnode.last = 1;
        // Temporary variable to avoid a warning when compiling with -O2
        // (there is no warning with -O0... g++ bug?)
        DicEdge *tmpPtr = &specialnode;
        write_node(reinterpret_cast(tmpPtr), 1, outfile);

        /*
         * Call makenode with null (relative to stringbuf) prefix;
         * Initialize string to null; Put index of start node on output
         */
        DicEdge rootnode = {0, 0, 0, 0};
        global_endstring = global_stringbuf;
        clock_t startBuildTime = clock();
        rootnode.ptr = makenode(global_endstring, outfile, headerInfo, tempHeader);
        clock_t endBuildTime = clock();
        // Reuse the temporary variable
        tmpPtr = &rootnode;
        // The root edge is written last, after all the nodes
        write_node(reinterpret_cast(tmpPtr), 1, outfile);

        // Overwrite the header, now that the final values are known
        fix_header(outfile, headerInfo);

        Header aHeader(headerInfo);
        aHeader.print();

        // Release the resources (not reached if an exception was thrown
        // earlier in the try block)
        delete global_hashtable;
        delete[] uncompressed;
        outfile.close();

        // Print some statistics. clock() is used for the measures, so these
        // are processor times rather than elapsed times
        printf(_(" Load time: %.3f s\n"), 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC);
        printf(_(" Compression time: %.3f s\n"), 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC);
#ifdef CHECK_RECURSION
        printf(_(" Maximum recursion level reached: %d\n"), max_rec);
#endif
        return 0;
    }
    catch (std::exception &e)
    {
        // Print the error message and give a failure exit code
        cerr << e.what() << endl;
        return 1;
    }
}