eliot/dic/encoding.cpp
2010-05-19 19:28:26 +00:00

484 lines
14 KiB
C++

/*****************************************************************************
* Eliot
* Copyright (C) 2005-2007 Olivier Teulière
* Authors: Olivier Teulière <ipkiss @@ gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
#include "config.h"
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <cerrno>
#include <iconv.h>
#ifdef WIN32
#include <windows.h>
#endif
#include "encoding.h"
#include "dic_exception.h"
using namespace std;
#ifdef WIN32
// Utility function to get the last system error as a string
static string GetWin32Error()
{
char *lpMsgBuf;
DWORD dw = GetLastError();
FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, dw,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPTSTR) &lpMsgBuf,
0, NULL);
string msg = lpMsgBuf;
LocalFree(lpMsgBuf);
return msg;
}
#endif
#if !HAVE_WCWIDTH
// wcwidth replacement (for win32 in particular)
// Inspired from the gnulib package, without some of the refinements
static inline int wcwidth(wchar_t c)
{
// Assume all the printable characters have width 1
return c == 0 ? 0 : (iswprint(c) ? 1 : -1);
}
#endif
int _wtoi(const wchar_t *iWStr)
{
return wcstol(iWStr, NULL, 10);
}
int _swprintf(wchar_t *wcs, size_t maxlen, const wchar_t *format, ...)
{
int res;
va_list argp;
va_start(argp, format);
#ifdef WIN32
// Mingw32 does not take the maxlen argument
(void)maxlen;
res = vswprintf(wcs, format, argp);
#else
res = vswprintf(wcs, maxlen, format, argp);
#endif
va_end(argp);
return res;
}
wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr)
{
#ifdef WIN32
// Mingw32 does not take the third argument
(void)ptr;
return wcstok(wcs, delim);
#else
return wcstok(wcs, delim, ptr);
#endif
}
#define _MAX_SIZE_FOR_STACK_ 30
wstring convertToWc(const string& iStr)
{
#ifdef WIN32
if (iStr.empty())
return L"";
const unsigned int bufSize = iStr.size();
// Temporary buffer for output
// We will have at most as many characters as in the UTF-8 string
wchar_t *wideBuf = new wchar_t[bufSize];
int number = MultiByteToWideChar(CP_OEMCP, MB_ERR_INVALID_CHARS,
iStr.c_str(), bufSize, wideBuf, bufSize);
wstring res(wideBuf, number);
delete[] wideBuf;
if (number == 0)
{
// Retrieve the system error message for the last-error code
throw DicException("convertToWc: MultiByteToWideChar failed:" +
GetWin32Error());
}
return res;
#else
// Get the needed length (we _can't_ use string::size())
size_t len = mbstowcs(NULL, iStr.c_str(), 0);
if (len == (size_t)-1)
return L"";
// Change the allocation method depending on the length of the string
if (len < _MAX_SIZE_FOR_STACK_)
{
// Without multi-thread, we can use static storage
static wchar_t tmp[_MAX_SIZE_FOR_STACK_];
len = mbstowcs(tmp, iStr.c_str(), len + 1);
return tmp;
}
else
{
wchar_t *tmp = new wchar_t[len + 1];
len = mbstowcs(tmp, iStr.c_str(), len + 1);
wstring res = tmp;
delete[] tmp;
return res;
}
#endif
}
string convertToMb(const wstring& iWStr)
{
#ifdef WIN32
const unsigned int size = iWStr.size() * 4;
if (size == 0)
return "";
char buf[size];
int res = WideCharToMultiByte(CP_OEMCP, 0, iWStr.c_str(), iWStr.size(),
buf, size, NULL, NULL);
if (res == 0)
{
// Retrieve the system error message for the last-error code
throw DicException("convertToMb: WideCharToMultiByte failed: " +
GetWin32Error());
}
return string(buf, res);
#else
// Get the needed length (we _can't_ use wstring::size())
size_t len = wcstombs(NULL, iWStr.c_str(), 0);
if (len == (size_t)-1)
return "";
// Change the allocation method depending on the length of the string
if (len < _MAX_SIZE_FOR_STACK_)
{
// Without multi-thread, we can use static storage
static char tmp[_MAX_SIZE_FOR_STACK_];
len = wcstombs(tmp, iWStr.c_str(), len + 1);
return tmp;
}
else
{
char *tmp = new char[len + 1];
len = wcstombs(tmp, iWStr.c_str(), len + 1);
string res = tmp;
delete[] tmp;
return res;
}
#endif
}
#undef _MAX_SIZE_FOR_STACK_
string convertToMb(wchar_t iWChar)
{
return convertToMb(wstring(1, iWChar));
}
string truncString(const string &iStr, unsigned int iMaxWidth)
{
// Heuristic: the width of a character cannot exceed the number of
// bytes used to represent it (even in UTF-8)
if (iStr.size() <= iMaxWidth)
return iStr;
return truncAndConvert(convertToWc(iStr), iMaxWidth);
}
string truncAndConvert(const wstring &iWstr, unsigned int iMaxWidth)
{
unsigned int width = 0;
unsigned int pos;
for (pos = 0; pos < iWstr.size(); ++pos)
{
int n = wcwidth(iWstr[pos]);
if (n == -1)
{
ostringstream ss;
// XXX: Should we throw an exception instead? Just ignore the problem?
#if 0
ss << "truncAndConvert: non printable character: " << iWstr[pos];
cerr << ss.str() << endl;;
//throw DicException(ss.str());
#endif
return convertToMb(iWstr);
}
if (width + n > iMaxWidth)
break;
width += n;
}
return convertToMb(iWstr.substr(0, pos));
}
string truncOrPad(const string &iStr, unsigned int iWidth, char iChar)
{
wstring wstr = convertToWc(iStr);
unsigned int width = 0;
unsigned int pos;
for (pos = 0; pos < wstr.size(); ++pos)
{
int n = wcwidth(wstr[pos]);
if (n == -1)
{
ostringstream ss;
// XXX: Should we throw an exception instead? Just ignore the problem?
#if 0
ss << "truncAndConvert: non printable character: " << wstr[pos];
cerr << ss.str() << endl;;
//throw DicException(ss.str());
#endif
return convertToMb(wstr);
}
if (width + n > iWidth)
break;
width += n;
}
if (iWidth > width)
return convertToMb(wstr.substr(0, pos)) + string(iWidth - width, iChar);
else
return convertToMb(wstr.substr(0, pos));
}
string padAndConvert(const wstring &iWstr, unsigned int iLength,
bool iLeftPad, char c)
{
int width = 0;
for (unsigned int i = 0; i < iWstr.size(); ++i)
{
int n = wcwidth(iWstr[i]);
if (n == -1)
{
ostringstream ss;
// XXX: Should we throw an exception instead? Just ignore the problem?
#if 0
ss << "padAndConvert: non printable character: " << iWstr[i];
cerr << ss.str() << endl;;
//throw DicException(ss.str());
#endif
return convertToMb(iWstr);
}
width += n;
}
if ((unsigned int)width >= iLength)
return convertToMb(iWstr);
else
{
// Padding is needed
string s(iLength - width, c);
if (iLeftPad)
return s + convertToMb(iWstr);
else
return convertToMb(iWstr) + s;
}
}
string centerAndConvert(const wstring &iWstr, unsigned int iLength, char c)
{
int width = 0;
for (unsigned int i = 0; i < iWstr.size(); ++i)
{
int n = wcwidth(iWstr[i]);
if (n == -1)
{
ostringstream ss;
// XXX: Should we throw an exception instead? Just ignore the problem?
#if 0
ss << "padAndConvert: non printable character: " << iWstr[i];
cerr << ss.str() << endl;;
//throw DicException(ss.str());
#endif
return convertToMb(iWstr);
}
width += n;
}
if ((unsigned int)width >= iLength)
return convertToMb(iWstr);
else
{
// Padding is needed
string s((iLength - width) / 2, c);
string res = s + convertToMb(iWstr) + s;
// If the string cannot be centered perfectly, pad again
// (on the left if iLength is even, on the right otherwise:
// this tends to align numbers of 1 or 2 digits in a nice way)
// Note: if needed, we could add the iLeftPad argument
if ((iLength - width) % 2)
{
if (iLength % 2)
res.append(1, c);
else
res.insert(res.begin(), c);
}
return res;
}
}
unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize,
const char *iBuffer, unsigned int iBufSize,
const string &iContext)
{
#ifdef WIN32
if (iBufSize == 0 || iBuffer == NULL || *iBuffer == '\0')
{
return 0;
}
int res = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, iBuffer,
iBufSize, oString, iWideSize);
if (res == 0)
{
// Retrieve the system error message for the last-error code
throw DicException("readFromUTF8: MultiByteToWideChar failed (" +
iContext + "): " + GetWin32Error());
}
return res;
#else
iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
if (handle == (iconv_t)(-1))
throw DicException("readFromUTF8: iconv_open failed");
size_t inChars = iBufSize;
size_t outChars = iWideSize * sizeof(wchar_t);
// Use the ICONV_CONST trick because the declaration of iconv()
// differs depending on the implementations...
ICONV_CONST char *in = const_cast<ICONV_CONST char*>(iBuffer);
char *out = (char*)oString;
size_t res = iconv(handle, &in, &inChars, &out, &outChars);
iconv_close(handle);
// Problem during encoding conversion?
if (res == (size_t)(-1))
{
throw DicException("readFromUTF8: iconv failed (" +
iContext + "): " + string(strerror(errno)));
}
return iWideSize - outChars / sizeof(wchar_t);
#endif
}
wstring readFromUTF8(const char *iBuffer, unsigned int iBufSize,
const string &iContext)
{
// Temporary buffer for output
// We will have at most as many characters as in the UTF-8 string
wchar_t *wideBuf = new wchar_t[iBufSize];
unsigned int number;
try
{
number = readFromUTF8(wideBuf, iBufSize, iBuffer, iBufSize, iContext);
}
catch (...)
{
// Make sure not to leak
delete[] wideBuf;
throw;
}
// Copy the string
wstring res(wideBuf, number);
delete[] wideBuf;
return res;
}
unsigned int writeInUTF8(const wstring &iWString, char *oBuffer,
unsigned int iBufSize, const string &iContext)
{
#ifdef WIN32
if (iWString.empty())
{
*oBuffer = '\0';
return 0;
}
int res = WideCharToMultiByte(CP_UTF8, 0, iWString.c_str(), iWString.size(),
oBuffer, iBufSize, NULL, NULL);
if (res == 0)
{
// Retrieve the system error message for the last-error code
throw DicException("writeInUTF8: WideCharToMultiByte failed (" +
iContext + "): " + GetWin32Error());
}
return res;
#else
iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
if (handle == (iconv_t)(-1))
throw DicException("writeInUTF8: iconv_open failed");
size_t length = iWString.size();
size_t inChars = sizeof(wchar_t) * length;
size_t outChars = iBufSize;
// Use the ICONV_CONST trick because the declaration of iconv()
// differs depending on the implementations...
// FIXME: bonus ugliness for doing 2 casts at once, and accessing string
// internals...
ICONV_CONST char *in = (ICONV_CONST char*)(&iWString[0]);
char *out = oBuffer;
size_t res = iconv(handle, &in, &inChars, &out, &outChars);
iconv_close(handle);
// Problem during encoding conversion?
if (res == (size_t)(-1))
{
throw DicException("writeInUTF8: iconv failed (" +
iContext + "): " + string(strerror(errno)));
}
// Return the number of written bytes
return iBufSize - outChars;
#endif
}
string writeInUTF8(const wstring &iWString, const string &iContext)
{
// Temporary buffer for output
// Each character will take at most 4 bytes in the UTF-8 string
unsigned int bufSize = iWString.size() * 4;
char *buf = new char[bufSize];
unsigned int number;
try
{
number = writeInUTF8(iWString, buf, bufSize, iContext);
}
catch (...)
{
// Make sure not to leak
delete[] buf;
throw;
}
// Copy the string
string res(buf, number);
delete[] buf;
return res;
}