2008-01-08 13:52:32 +00:00
|
|
|
/*****************************************************************************
|
|
|
|
* Eliot
|
2012-10-07 16:25:41 +02:00
|
|
|
* Copyright (C) 2007-2012 Olivier Teulière
|
2008-01-08 13:52:32 +00:00
|
|
|
* Authors: Olivier Teulière <ipkiss @@ gmail.com>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
#include "config.h"
|
|
|
|
|
2008-07-20 12:15:51 +00:00
|
|
|
#include <cstring> // for strcpy
|
2008-01-08 13:52:32 +00:00
|
|
|
#include <string>
|
|
|
|
#include <sstream>
|
|
|
|
#include <iostream>
|
2009-06-28 10:55:24 +00:00
|
|
|
#include <boost/foreach.hpp>
|
|
|
|
#include <boost/format.hpp>
|
|
|
|
#include <boost/tokenizer.hpp>
|
2008-01-08 13:52:32 +00:00
|
|
|
|
|
|
|
// For ntohl & Co.
|
|
|
|
#ifdef WIN32
|
|
|
|
# include <winsock2.h>
|
|
|
|
#else
|
|
|
|
# if HAVE_NETINET_IN_H
|
|
|
|
# include <netinet/in.h>
|
|
|
|
# endif
|
|
|
|
# if HAVE_ARPA_INET_H
|
|
|
|
# include <arpa/inet.h>
|
|
|
|
# endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if ENABLE_NLS
|
|
|
|
# include <libintl.h>
|
|
|
|
# define _(String) gettext(String)
|
|
|
|
#else
|
|
|
|
# define _(String) String
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "header.h"
|
|
|
|
#include "encoding.h"
|
|
|
|
#include "dic_exception.h"
|
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
using boost::format;
|
|
|
|
using boost::wformat;
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
|
2012-02-18 22:26:52 +01:00
|
|
|
INIT_LOGGER(dic, Header);
|
|
|
|
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
#if defined(WORDS_BIGENDIAN)
// Host order is already network order: both conversions are no-ops
#   define ntohll(x) (x)
#   define htonll(x) (x)
#else
/**
 * Convert a 64-bit value from host byte order to network byte order.
 * Each 32-bit half is converted with htonl(), and the two halves
 * are then exchanged.
 */
static inline uint64_t htonll(uint64_t host64)
{
    const uint64_t lowHalf = (host64 << 32) >> 32;
    const uint64_t highHalf = host64 >> 32;
    const uint64_t convertedLow = htonl(lowHalf);
    return (convertedLow << 32) | htonl(highHalf);
}

/**
 * Convert a 64-bit value from network byte order to host byte order.
 * This simply delegates to htonll().
 */
static inline uint64_t ntohll(uint64_t net64)
{
    return htonll(net64);
}
#endif
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Keyword included in dictionary headers
|
|
|
|
* Implies little endian storage on words
|
|
|
|
*/
|
|
|
|
#define _COMPIL_KEYWORD_ "_COMPILED_DICTIONARY_"
|
|
|
|
|
|
|
|
/** Old format of the header (i.e. version 0) */
|
|
|
|
struct Dict_header_old
|
|
|
|
{
|
|
|
|
/// Identification string
|
|
|
|
char ident[sizeof(_COMPIL_KEYWORD_)];
|
|
|
|
/// Version of the serialization format
|
|
|
|
uint8_t version;
|
2009-06-28 10:55:24 +00:00
|
|
|
/// Unused at the moment, reserved for future use
|
2008-01-08 13:52:32 +00:00
|
|
|
char unused;
|
|
|
|
uint32_t root;
|
|
|
|
uint32_t nwords;
|
|
|
|
/// Information about the compression
|
|
|
|
//@{
|
|
|
|
uint32_t edgesused;
|
|
|
|
uint32_t nodesused;
|
|
|
|
uint32_t nodessaved;
|
|
|
|
uint32_t edgessaved;
|
|
|
|
//@}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Do not change these values, as they impact the size of the structure!
|
|
|
|
// Note: they are chosen carefully to avoid alignment issues
|
|
|
|
#define _MAX_USER_HOST_ 32
|
|
|
|
#define _MAX_DIC_NAME_SIZE_ 30
|
|
|
|
#define _MAX_LETTERS_NB_ 63
|
|
|
|
#define _MAX_LETTERS_SIZE_ 80
|
2009-06-28 10:55:24 +00:00
|
|
|
#define _MAX_DISPLAY_INPUT_SIZE_ 112
|
2008-01-08 13:52:32 +00:00
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
/** Extension of the old format (used in version 1) */
|
2008-01-08 13:52:32 +00:00
|
|
|
struct Dict_header_ext
|
|
|
|
{
|
2012-10-14 14:31:44 +02:00
|
|
|
Dict_header_ext()
|
|
|
|
{
|
|
|
|
// Make sure we won't write uninitialized bytes
|
|
|
|
memset(this, 0, sizeof(*this));
|
|
|
|
}
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
// Time when the dictionary was compressed
|
|
|
|
// (number of seconds since the Epoch)
|
|
|
|
uint64_t compressDate;
|
|
|
|
// Build information
|
|
|
|
char userHost[_MAX_USER_HOST_];
|
|
|
|
// Size taken by the build information
|
|
|
|
uint32_t userHostSize;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
|
|
|
|
// Compression algorithm (1 = DAWG, 2 = GADDAG)
|
|
|
|
uint8_t algorithm;
|
|
|
|
// Variant used in the rules (XXX: currently unused)
|
|
|
|
uint8_t variant;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
|
|
|
|
// Dictionary official name and version (e.g.: ODS 5.0)
|
|
|
|
char dicName[_MAX_DIC_NAME_SIZE_];
|
|
|
|
// Size taken by the dictionary name
|
|
|
|
uint32_t dicNameSize;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
|
|
|
|
// Letters used in the dictionary
|
|
|
|
// We should have: nbLetters <= lettersSize <= _MAX_LETTERS_SIZE_
|
|
|
|
// and: nbLetters <= _MAX_LETTERS_NB_
|
|
|
|
// The letters themselves, in UTF-8
|
|
|
|
char letters[_MAX_LETTERS_SIZE_];
|
|
|
|
// Size taken by the letters
|
|
|
|
uint32_t lettersSize;
|
|
|
|
// Number of letters (XXX: in theory useless, but allows a sanity check)
|
|
|
|
uint32_t nbLetters;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
|
|
|
|
// Points of the letters (indexed by their code)
|
|
|
|
// The "+ 1" is there for struct alignment
|
|
|
|
uint8_t points[_MAX_LETTERS_NB_ + 1];
|
|
|
|
// Frequency of the letters (indexed by their code)
|
|
|
|
// The "+ 1" is there for struct alignment
|
|
|
|
uint8_t frequency[_MAX_LETTERS_NB_ + 1];
|
|
|
|
// Bitfield indicating whether letters are vowels
|
|
|
|
uint64_t vowels;
|
|
|
|
// Bitfield indicating whether letters are consonants
|
|
|
|
uint64_t consonants;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
};
|
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
/** Extension of the extension :-) (used in version 2) */
|
|
|
|
struct Dict_header_ext_2
|
|
|
|
{
|
2012-10-14 14:31:44 +02:00
|
|
|
Dict_header_ext_2()
|
|
|
|
{
|
|
|
|
// Make sure we won't write uninitialized bytes
|
|
|
|
memset(this, 0, sizeof(*this));
|
|
|
|
}
|
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
// Additional information concerning the display strings and the
|
|
|
|
// alternative input strings of the letters
|
|
|
|
char displayAndInput[_MAX_DISPLAY_INPUT_SIZE_];
|
|
|
|
// Size taken by the display/input data
|
|
|
|
uint16_t displayAndInputSize;
|
|
|
|
|
|
|
|
// --- we have a multiple of 64 bytes here
|
|
|
|
};
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
|
|
|
|
/**
 * Build a header by reading it from the given stream.
 * Any exception thrown by read() is propagated to the caller.
 */
Header::Header(istream &iStream)
    : m_root(0), m_nbWords(0), m_nodesUsed(0), m_edgesUsed(0),
    m_nodesSaved(0), m_edgesSaved(0), m_type(kDAWG)
{
    // The parsing itself lives in read(), to keep it symmetrical
    // with the write() method
    read(iStream);

    // The caches depend on the letters and display data set by read()
    buildCaches();
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Build a header from the information gathered while compiling a
 * dictionary.
 * @param iInfo Description of the dictionary
 * @throw DicException if there are too many letters for the format, or
 *      if the per-letter vectors do not have one entry per letter
 */
Header::Header(const DictHeaderInfo &iInfo)
{
    // A header built this way always uses the latest serialization format
    m_version = 2;

    // Sanity checks: the format limits the number of letters, and every
    // per-letter vector must have exactly one entry per letter
    const size_t nbLetters = iInfo.letters.size();
    if (nbLetters > _MAX_LETTERS_NB_)
    {
        throw DicException((format(_("Header::Header: Too many different letters for "
                                     "the current format; only %1% are supported"))
                            % _MAX_LETTERS_NB_).str());
    }
    if (iInfo.points.size() != nbLetters)
    {
        throw DicException("Header::Header: Different sizes for "
                           "iInfo.points and iInfo.letters");
    }
    if (iInfo.frequency.size() != nbLetters)
    {
        throw DicException("Header::Header: Different sizes for "
                           "iInfo.frequency and iInfo.letters");
    }
    if (iInfo.vowels.size() != nbLetters)
    {
        throw DicException("Header::Header: Different sizes for "
                           "iInfo.vowels and iInfo.letters");
    }
    if (iInfo.consonants.size() != nbLetters)
    {
        throw DicException("Header::Header: Different sizes for "
                           "iInfo.consonants and iInfo.letters");
    }

    // Build information: current time, and who compiled the binary
    m_compressDate = time(NULL);
    m_userHost = wfl(ELIOT_COMPILE_BY + string("@") + ELIOT_COMPILE_HOST);

    // Statistics about the compressed structure
    m_root = iInfo.root;
    m_nbWords = iInfo.nwords;
    m_nodesUsed = iInfo.nodesused;
    m_edgesUsed = iInfo.edgesused;
    m_nodesSaved = iInfo.nodessaved;
    m_edgesSaved = iInfo.edgessaved;
    m_type = iInfo.dawg ? kDAWG : kGADDAG;

    // Dictionary name and letters information
    m_dicName = iInfo.dicName;
    m_letters = iInfo.letters;
    m_points = iInfo.points;
    m_frequency = iInfo.frequency;
    m_vowels = iInfo.vowels;
    m_consonants = iInfo.consonants;
    m_displayAndInputData = iInfo.displayInputData;

    buildCaches();
}
|
|
|
|
|
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
void Header::buildCaches()
|
2008-01-08 13:52:32 +00:00
|
|
|
{
|
2009-06-28 10:55:24 +00:00
|
|
|
// Build the char --> code mapping
|
2008-01-08 13:52:32 +00:00
|
|
|
for (unsigned int i = 0; i < m_letters.size(); ++i)
|
|
|
|
{
|
|
|
|
// We don't differentiate uppercase and lowercase letters
|
|
|
|
m_mapCodeFromChar[towlower(m_letters[i])] = i + 1;
|
|
|
|
m_mapCodeFromChar[towupper(m_letters[i])] = i + 1;
|
|
|
|
}
|
2009-06-28 10:55:24 +00:00
|
|
|
|
|
|
|
// Build the display strings cache
|
|
|
|
m_displayCache.assign(m_letters.size() + 1, L"");
|
|
|
|
for (unsigned int i = 0; i < m_letters.size(); ++i)
|
|
|
|
{
|
|
|
|
map<wchar_t, vector<wstring> >::const_iterator it =
|
|
|
|
m_displayAndInputData.find(m_letters[i]);
|
|
|
|
if (it == m_displayAndInputData.end())
|
|
|
|
m_displayCache[i + 1].append(1, m_letters[i]);
|
|
|
|
else
|
|
|
|
m_displayCache[i + 1] = it->second[0];
|
|
|
|
}
|
|
|
|
|
2012-05-15 22:01:14 +02:00
|
|
|
// Create a string with all the characters possibly used,
|
|
|
|
// and another one with the characters used in multi-char input strings
|
2009-06-28 10:55:24 +00:00
|
|
|
m_inputChars.reserve(m_letters.size());
|
|
|
|
BOOST_FOREACH(wchar_t wch, m_letters)
|
|
|
|
{
|
|
|
|
map<wchar_t, vector<wstring> >::const_iterator it =
|
|
|
|
m_displayAndInputData.find(wch);
|
|
|
|
if (it == m_displayAndInputData.end())
|
|
|
|
m_inputChars.append(1, wch);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
BOOST_FOREACH(const wstring &str, it->second)
|
|
|
|
{
|
|
|
|
BOOST_FOREACH(wchar_t chr, str)
|
|
|
|
{
|
2012-05-15 22:01:14 +02:00
|
|
|
wchar_t upChr = towupper(chr);
|
|
|
|
if (m_inputChars.find(upChr) == string::npos)
|
|
|
|
m_inputChars.append(1, upChr);
|
|
|
|
if (str.size() > 1 &&
|
|
|
|
m_multiCharInputChars.find(upChr) == wstring::npos)
|
|
|
|
{
|
|
|
|
m_multiCharInputChars.append(1, upChr);
|
|
|
|
}
|
2009-06-28 10:55:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-01-08 13:52:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-05-15 22:01:14 +02:00
|
|
|
bool Header::isMultiCharPart(wchar_t iChar) const
|
|
|
|
{
|
|
|
|
return m_multiCharInputChars.find(towupper(iChar)) != wstring::npos;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
wchar_t Header::getCharFromCode(unsigned int iCode) const
|
|
|
|
{
|
|
|
|
// Safety check
|
|
|
|
if (iCode == 0 || iCode > m_letters.size())
|
|
|
|
{
|
2010-01-02 12:44:56 +00:00
|
|
|
format fmt(_("Header::getCharFromCode: No letter for code '%1%'"));
|
|
|
|
fmt % iCode;
|
|
|
|
throw DicException(fmt.str());
|
2008-01-08 13:52:32 +00:00
|
|
|
}
|
|
|
|
return m_letters[iCode - 1];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
unsigned int Header::getCodeFromChar(wchar_t iChar) const
|
|
|
|
{
|
|
|
|
map<wchar_t, unsigned int>::const_iterator pair =
|
|
|
|
m_mapCodeFromChar.find(iChar);
|
|
|
|
if (pair == m_mapCodeFromChar.end())
|
|
|
|
{
|
2010-01-02 12:44:56 +00:00
|
|
|
format fmt(_("Header::getCodeFromChar: No code for letter '%1%' (val=%2%)"));
|
2012-11-29 19:47:19 +01:00
|
|
|
fmt % lfw(iChar) % (unsigned) iChar;
|
2010-01-02 12:44:56 +00:00
|
|
|
throw DicException(fmt.str());
|
2008-01-08 13:52:32 +00:00
|
|
|
}
|
|
|
|
return pair->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-06-28 10:55:24 +00:00
|
|
|
/**
 * Return the display string corresponding to the given code.
 * @param iCode Code of the letter, between 1 and the number of letters
 * @return A reference to the cached display string
 * @throw DicException if the code is out of range
 */
const wdstring & Header::getDisplayStr(unsigned int iCode) const
{
    // Safety check: codes are 1-based indices in m_letters
    if (iCode == 0 || iCode > m_letters.size())
    {
        ostringstream oss;
        oss << iCode;
        throw DicException("Header::getDisplayStr: No display string for code '" +
                           oss.str() + "'");
    }
    return m_displayCache[iCode];
}
|
|
|
|
|
|
|
|
|
2009-06-28 11:47:50 +00:00
|
|
|
/**
 * Return the input strings accepted for the given code.
 * When the letter has no specific display/input data, the only input
 * string is its display string.
 * @param iCode Code of the letter, between 1 and the number of letters
 * @throw DicException if the code is out of range
 */
vector<wistring> Header::getInputStr(unsigned int iCode) const
{
    // Safety check: codes are 1-based indices in m_letters
    if (iCode == 0 || iCode > m_letters.size())
    {
        ostringstream oss;
        oss << iCode;
        throw DicException("Header::getInputStr: No input string for code '" +
                           oss.str() + "'");
    }

    map<wchar_t, vector<wstring> >::const_iterator it =
        m_displayAndInputData.find(m_letters[iCode - 1]);
    if (it != m_displayAndInputData.end())
        return it->second;

    // No specific data for this letter: fall back to the display string
    vector<wistring> vect;
    vect.push_back(m_displayCache[iCode]);
    return vect;
}
|
|
|
|
|
|
|
|
|
2008-01-08 13:52:32 +00:00
|
|
|
/**
 * Read the header from the given stream and fill the members accordingly.
 * The old part of the header is read first, then the extension and,
 * for version 2 and above, the second extension.
 * @param iStream Stream positioned at the beginning of the header
 * @throw DicException if the stream is too short, if the format is not
 *      supported, or if the header is inconsistent
 */
void Header::read(istream &iStream)
{
    Dict_header_old aHeader;
    iStream.read((char*)&aHeader, sizeof(Dict_header_old));
    if (iStream.gcount() != sizeof(Dict_header_old))
        throw DicException("Header::read: expected to read more bytes");

    // Check the identification string. The comparison is bounded by the
    // size of the field, which may not be NUL-terminated in a corrupted file
    if (strncmp(aHeader.ident, _COMPIL_KEYWORD_, sizeof(aHeader.ident)) != 0)
        throw DicException("Header::read: incorrect header keyword; is it a dictionary file?");

    m_version = aHeader.version;

    // Version 0 corresponds to the dictionary format in the first Eliot
    // versions, supported until Eliot 1.8 (excluded).
    // The new version (version 1) was introduced in Eliot 1.6.
    if (m_version == 0)
    {
        throw DicException(_("Too old dictionary format. This format is not "
                             "supported anymore since Eliot 1.8. You can "
                             "create dictionaries in the new format with the "
                             "'compdic' tool provided with Eliot (since "
                             "version 1.6)."));
    }

    // Handle endianness
    m_root = ntohl(aHeader.root);
    m_nbWords = ntohl(aHeader.nwords);
    m_nodesUsed = ntohl(aHeader.nodesused);
    m_edgesUsed = ntohl(aHeader.edgesused);
    m_nodesSaved = ntohl(aHeader.nodessaved);
    m_edgesSaved = ntohl(aHeader.edgessaved);

    // After reading the old header, we now read the extension
    Dict_header_ext aHeaderExt;
    iStream.read((char*)&aHeaderExt, sizeof(Dict_header_ext));
    if (iStream.gcount() != sizeof(Dict_header_ext))
        throw DicException("Header::read: expected to read more bytes");

    // Handle endianness in the extension
    aHeaderExt.compressDate = ntohll(aHeaderExt.compressDate);
    aHeaderExt.userHostSize = ntohl(aHeaderExt.userHostSize);
    aHeaderExt.dicNameSize = ntohl(aHeaderExt.dicNameSize);
    aHeaderExt.lettersSize = ntohl(aHeaderExt.lettersSize);
    aHeaderExt.nbLetters = ntohl(aHeaderExt.nbLetters);
    aHeaderExt.vowels = ntohll(aHeaderExt.vowels);
    aHeaderExt.consonants = ntohll(aHeaderExt.consonants);

    m_compressDate = aHeaderExt.compressDate;

    if (aHeaderExt.algorithm == kDAWG)
        m_type = kDAWG;
    else if (aHeaderExt.algorithm == kGADDAG)
        m_type = kGADDAG;
    else
        throw DicException("Header::read: unrecognized algorithm type");

    // The sizes come from the file: make sure they fit in the buffers
    // before using them, to avoid reading out of bounds
    if (aHeaderExt.userHostSize > _MAX_USER_HOST_ ||
        aHeaderExt.dicNameSize > _MAX_DIC_NAME_SIZE_ ||
        aHeaderExt.lettersSize > _MAX_LETTERS_SIZE_)
    {
        throw DicException("Header::read: invalid string size in the header. "
                           "Corrupted dictionary?");
    }

    // Convert the build information from UTF-8 to a wide string
    m_userHost = readFromUTF8(string(aHeaderExt.userHost,
                                     aHeaderExt.userHostSize),
                              "user and host information");

    // Convert the dictionary name from UTF-8 to a wide string
    m_dicName = readFromUTF8(string(aHeaderExt.dicName,
                                    aHeaderExt.dicNameSize),
                             "dictionary name");

    // Convert the dictionary letters from UTF-8 to a wide string
    m_letters = readFromUTF8(string(aHeaderExt.letters,
                                    aHeaderExt.lettersSize),
                             "dictionary letters");
    // Safety check: correct number of letters? The number of letters must
    // also fit in the points/frequency arrays and in the 64-bit bitfields
    if (m_letters.size() != aHeaderExt.nbLetters ||
        m_letters.size() > _MAX_LETTERS_NB_)
    {
        throw DicException("Header::read: inconsistent header");
    }

    // Letters points and frequency
    for (unsigned int i = 0; i < m_letters.size(); ++i)
    {
        m_points.push_back(aHeaderExt.points[i]);
        m_frequency.push_back(aHeaderExt.frequency[i]);
    }

    // Vowels and consonants.
    // The mask must be a 64-bit value: shifting a plain int by 31 bits
    // or more is undefined behaviour
    for (unsigned int i = 0; i < m_letters.size(); ++i)
    {
        const uint64_t mask = uint64_t(1) << i;
        m_vowels.push_back((aHeaderExt.vowels & mask) != 0);
        m_consonants.push_back((aHeaderExt.consonants & mask) != 0);
    }

    // Read the additional display/input data
    if (m_version >= 2)
    {
        // Read the extension of the extension...
        Dict_header_ext_2 aHeaderExt2;
        iStream.read((char*)&aHeaderExt2, sizeof(Dict_header_ext_2));
        if (iStream.gcount() != sizeof(Dict_header_ext_2))
            throw DicException("Header::read: expected to read more bytes (ext2)");

        // Handle endianness
        aHeaderExt2.displayAndInputSize = ntohs(aHeaderExt2.displayAndInputSize);

        // Again, do not trust the size found in the file
        if (aHeaderExt2.displayAndInputSize > _MAX_DISPLAY_INPUT_SIZE_)
        {
            throw DicException("Header::read: invalid display and input data "
                               "size. Corrupted dictionary?");
        }

        // Convert the display and input data from UTF-8 to a wide string
        wstring serialized = readFromUTF8(string(aHeaderExt2.displayAndInput,
                                                 aHeaderExt2.displayAndInputSize),
                                          "display and input data");
        // Parse this string and structure the data
        readDisplayAndInput(serialized);
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Write the header to the given stream: the old part of the header,
 * then the extension and the second extension.
 * Multi-byte integers are converted to network byte order.
 * @param oStream Stream to write to
 * @throw DicException if the letters do not fit in the header, or if
 *      writing to the stream fails
 */
void Header::write(ostream &oStream) const
{
    // The points/frequency arrays and the 64-bit bitfields cannot hold
    // more letters than this. Check before writing anything
    if (m_letters.size() > _MAX_LETTERS_NB_)
        throw DicException("Header::write: too many letters for the header format");

    Dict_header_old aHeader;
    strcpy(aHeader.ident, _COMPIL_KEYWORD_);
    aHeader.version = m_version;
    aHeader.unused = 0;
    aHeader.root = htonl(m_root);
    aHeader.nwords = htonl(m_nbWords);
    aHeader.nodesused = htonl(m_nodesUsed);
    aHeader.edgesused = htonl(m_edgesUsed);
    aHeader.nodessaved = htonl(m_nodesSaved);
    aHeader.edgessaved = htonl(m_edgesSaved);

    oStream.write((char*)&aHeader, sizeof(Dict_header_old));
    if (!oStream.good())
        throw DicException("Header::write: error when writing to file");

    Dict_header_ext aHeaderExt;
    aHeaderExt.compressDate = m_compressDate;
    // Convert the build information to UTF-8
    aHeaderExt.userHostSize =
        writeInUTF8(m_userHost, aHeaderExt.userHost,
                    _MAX_USER_HOST_, "user and host information");
    aHeaderExt.algorithm = m_type;

    // Convert the dictionary name to UTF-8
    aHeaderExt.dicNameSize =
        writeInUTF8(m_dicName, aHeaderExt.dicName,
                    _MAX_DIC_NAME_SIZE_, "dictionary name");

    // Convert the dictionary letters to UTF-8
    aHeaderExt.lettersSize =
        writeInUTF8(m_letters, aHeaderExt.letters,
                    _MAX_LETTERS_SIZE_, "dictionary letters");
    aHeaderExt.nbLetters = (uint32_t)m_letters.size();

    // Letters points and frequency
    for (unsigned int i = 0; i < m_letters.size(); ++i)
    {
        aHeaderExt.points[i] = m_points[i];
        aHeaderExt.frequency[i] = m_frequency[i];
    }

    // Vowels and consonants.
    // The mask must be a 64-bit value: shifting a plain int by 31 bits
    // or more is undefined behaviour
    aHeaderExt.vowels = 0;
    aHeaderExt.consonants = 0;
    for (unsigned int i = 0; i < m_letters.size(); ++i)
    {
        const uint64_t mask = uint64_t(1) << i;
        if (m_vowels[i])
            aHeaderExt.vowels |= mask;
        if (m_consonants[i])
            aHeaderExt.consonants |= mask;
    }

    // Handle endianness in the extension
    aHeaderExt.userHostSize = htonl(aHeaderExt.userHostSize);
    aHeaderExt.compressDate = htonll(aHeaderExt.compressDate);
    aHeaderExt.dicNameSize = htonl(aHeaderExt.dicNameSize);
    aHeaderExt.lettersSize = htonl(aHeaderExt.lettersSize);
    aHeaderExt.nbLetters = htonl(aHeaderExt.nbLetters);
    aHeaderExt.vowels = htonll(aHeaderExt.vowels);
    aHeaderExt.consonants = htonll(aHeaderExt.consonants);

    // Write the extension
    oStream.write((char*)&aHeaderExt, sizeof(Dict_header_ext));
    if (!oStream.good())
        throw DicException("Header::write: error when writing to file");

    // Write the second extension
    Dict_header_ext_2 aHeaderExt2;
    const wstring &serialized = writeDisplayAndInput();

    // Convert the serialized data to UTF-8
    aHeaderExt2.displayAndInputSize =
        writeInUTF8(serialized, aHeaderExt2.displayAndInput,
                    _MAX_DISPLAY_INPUT_SIZE_, "display and input data");

    // Handle endianness
    aHeaderExt2.displayAndInputSize = htons(aHeaderExt2.displayAndInputSize);

    // Write the extension
    oStream.write((char*)&aHeaderExt2, sizeof(Dict_header_ext_2));
    if (!oStream.good())
        throw DicException("Header::write: error when writing to file (ext2)");
}
|
|
|
|
|
|
|
|
|
|
|
|
void Header::readDisplayAndInput(const wstring &serialized)
|
|
|
|
{
|
|
|
|
// The format is the following:
|
|
|
|
// "X|DISPX|INP1|INP2|INP3 Y|DISPY Z|DISPZ|INP4|INP5"
|
|
|
|
// where X, Y and Z are internal chars (i.e. chars of m_letters),
|
|
|
|
// DISPX, DISPY and DISPZ are the corresponding display strings,
|
|
|
|
// and the INP* are input strings (in addition to the display string,
|
|
|
|
// which is always considered as an input string)
|
|
|
|
|
|
|
|
// Use a more friendly type name
|
|
|
|
typedef boost::tokenizer<boost::char_separator<wchar_t>,
|
|
|
|
std::wstring::const_iterator,
|
|
|
|
std::wstring> Tokenizer;
|
|
|
|
|
|
|
|
// Split the string on double spaces
|
|
|
|
static const boost::char_separator<wchar_t> sep1(L" ");
|
|
|
|
static const boost::char_separator<wchar_t> sep2(L"|");
|
|
|
|
Tokenizer tok(serialized, sep1);
|
|
|
|
Tokenizer::iterator it;
|
|
|
|
for (it = tok.begin(); it != tok.end(); ++it)
|
|
|
|
{
|
|
|
|
// Split the token on single space
|
|
|
|
Tokenizer tok2(*it, sep2);
|
|
|
|
vector<wstring> pieces(tok2.begin(), tok2.end());
|
|
|
|
// Some sanity checks...
|
|
|
|
if (pieces.size() < 2)
|
|
|
|
throw DicException("Header::readDisplayAndInput: no display "
|
|
|
|
"string. Corrupted dictionary?");
|
|
|
|
// The first piece must be a single char, present in m_letters
|
|
|
|
if (pieces[0].size() != 1 ||
|
|
|
|
m_letters.find(pieces[0][0]) == wstring::npos)
|
|
|
|
{
|
|
|
|
throw DicException("Header::readDisplayAndInput: invalid internal"
|
|
|
|
" letter. Corrupted dictionary?");
|
|
|
|
}
|
|
|
|
wchar_t chr = pieces[0][0];
|
|
|
|
if (m_displayAndInputData.find(chr) != m_displayAndInputData.end())
|
|
|
|
{
|
|
|
|
throw DicException("Header::readDisplayAndInput: found 2 display"
|
|
|
|
" data sections for the same letter. Corrupted"
|
|
|
|
" dictionary?");
|
|
|
|
}
|
|
|
|
// OK, save the data
|
|
|
|
pieces.erase(pieces.begin());
|
|
|
|
m_displayAndInputData[chr] = pieces;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Serialize m_displayAndInputData into a single string.
 * Each letter gets a section made of the letter itself followed by its
 * strings (converted to uppercase), each of them preceded by '|'.
 * The sections are separated from each other by L" ".
 */
wstring Header::writeDisplayAndInput() const
{
    typedef map<wchar_t, vector<wstring> >::const_iterator DataIterator;

    wstring result;
    for (DataIterator entry = m_displayAndInputData.begin();
         entry != m_displayAndInputData.end(); ++entry)
    {
        // Separate this section from the previous one, if any
        if (entry != m_displayAndInputData.begin())
            result += L" ";

        result.append(1, entry->first);
        BOOST_FOREACH(const wstring &str, entry->second)
        {
            // Make sure the string is uppercase
            result += L"|" + toUpper(str);
        }
    }
    return result;
}
|
|
|
|
|
|
|
|
|
2012-11-29 19:43:08 +01:00
|
|
|
/**
 * Print a human-readable description of the header to the given stream:
 * general information first, then a table with one row per letter.
 * @param out Stream to print to
 */
void Header::print(ostream &out) const
{
#define fmt(x) boost::format(x)
    out << fmt(_("Dictionary name: %1%")) % lfw(m_dicName) << endl;

    // Format the compression date. gmtime() can fail and strftime() can
    // leave the buffer in an indeterminate state: print an empty date then
    char buf[150];
    const struct tm *utcDate = gmtime(&m_compressDate);
    if (utcDate == NULL || strftime(buf, sizeof(buf), "%c", utcDate) == 0)
        buf[0] = '\0';
    out << fmt(_("Compressed on: %1%")) % buf << endl;
    out << fmt(_("Compressed using a binary compiled by: %1%")) % lfw(m_userHost) << endl;
    out << fmt(_("Dictionary type: %1%")) % (m_type == kDAWG ? "DAWG" : "GADDAG") << endl;
    out << fmt(_("Letters: %1%")) % lfw(m_letters) << endl;
    out << fmt(_("Number of letters: %1%")) % m_letters.size() << endl;
    out << fmt(_("Number of words: %1%")) % m_nbWords << endl;
    // The printed size always includes the 3 parts of the header
    long unsigned int size = sizeof(Dict_header_old) +
        sizeof(Dict_header_ext) + sizeof(Dict_header_ext_2);
    out << fmt(_("Header size: %1% bytes")) % size << endl;
    out << fmt(_("Root: %1% (edge)")) % m_root << endl;
    out << fmt(_("Nodes: %1% used + %2% saved")) % m_nodesUsed % m_nodesSaved << endl;
    out << fmt(_("Edges: %1% used + %2% saved")) % m_edgesUsed % m_edgesSaved << endl;
#undef fmt

    // Table header
    out << "====================================================================" << endl;
    out << format("%1% | %2% | %3% | %4% | %5% | %6% | %7%")
        % _("Letter") % _("Points") % _("Frequency") % _("Vowel")
        % _("Consonant") % _("Display") % _("Alt. input") << endl;
    out << "-------+--------+-----------+-------+-----------+-------+------" << endl;

    // Each cell is centered on the width of its (translated) column title
#define sz(x) strlen(_(x))
    for (unsigned int i = 0; i < m_letters.size(); ++i)
    {
        format fmter("%1% | %2% | %3% | %4% | %5% | %6% | %7%");
        fmter % centerAndConvert(wstring(1, m_letters[i]), sz("Letter"));
        fmter % centerAndConvert(str(wformat(L"%1%") % m_points[i]), sz("Points"));
        fmter % centerAndConvert(str(wformat(L"%1%") % m_frequency[i]), sz("Frequency"));
        fmter % centerAndConvert(str(wformat(L"%1%") % m_vowels[i]), sz("Vowel"));
        fmter % centerAndConvert(str(wformat(L"%1%") % m_consonants[i]), sz("Consonant"));

        map<wchar_t, vector<wstring> >::const_iterator it =
            m_displayAndInputData.find(m_letters[i]);
        if (it != m_displayAndInputData.end())
        {
            // The first string is the display string, the other ones
            // are the alternative input strings
            const vector<wstring> &inputs = it->second;
            fmter % centerAndConvert(str(wformat(L"%1%") % inputs[0]), sz("Display"));

            string s;
            for (vector<wstring>::size_type j = 1; j < inputs.size(); ++j)
            {
                // Separate the alternative inputs with a space
                if (j > 1)
                    s += " ";
                s += lfw(inputs[j]);
            }
            fmter % s;
        }
        else
        {
            // No specific data for this letter: leave the cells blank
            fmter % string(sz("Display"), ' ') % string(sz("Alt. input"), ' ');
        }
        out << fmter.str() << endl;
    }
#undef sz
    out << "====================================================================" << endl;
}
|
|
|
|
|