/***************************************************************************** * Eliot * Copyright (C) 2005-2007 Olivier Teulière * Authors: Olivier Teulière * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *****************************************************************************/ #include "config.h" #include #include #include #include #include #include #include #include #include #ifdef WIN32 #include #endif #include "encoding.h" #include "dic_exception.h" using namespace std; #ifdef WIN32 // Utility function to get the last system error as a string static string GetWin32Error() { char *lpMsgBuf; DWORD dw = GetLastError(); FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR) &lpMsgBuf, 0, NULL); string msg = lpMsgBuf; LocalFree(lpMsgBuf); return msg; } #endif #if !HAVE_WCWIDTH // wcwidth replacement (for win32 in particular) // Inspired from the gnulib package, without some of the refinements static inline int wcwidth(wchar_t c) { // Assume all the printable characters have width 1 return c == 0 ? 0 : (iswprint(c) ? 1 : -1); } #endif int _wtoi(const wchar_t *iWStr) { return wcstol(iWStr, NULL, 10); } int _swprintf(wchar_t *wcs, size_t maxlen, const wchar_t *format, ...) { int res; va_list argp; va_start(argp, format); #ifdef WIN32 // Mingw32 does not take the maxlen argument (void)maxlen; res = vswprintf(wcs, format, argp); #else res = vswprintf(wcs, maxlen, format, argp); #endif va_end(argp); return res; } wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { #ifdef WIN32 // Mingw32 does not take the third argument (void)ptr; return wcstok(wcs, delim); #else return wcstok(wcs, delim, ptr); #endif } #define _MAX_SIZE_FOR_STACK_ 30 wstring convertToWc(const string& iStr) { #ifdef WIN32 if (iStr.empty()) return L""; const unsigned int bufSize = iStr.size(); // Temporary buffer for output // We will have at most as many characters as in the UTF-8 string wchar_t *wideBuf = new wchar_t[bufSize]; int number = MultiByteToWideChar(CP_OEMCP, MB_ERR_INVALID_CHARS, iStr.c_str(), bufSize, wideBuf, bufSize); wstring res(wideBuf, number); delete[] wideBuf; if (number == 0) { // Retrieve the system error message for the last-error code throw DicException("convertToWc: MultiByteToWideChar failed:" + GetWin32Error()); } return res; #else // Get the needed length (we _can't_ use string::size()) size_t len = mbstowcs(NULL, iStr.c_str(), 0); if (len == (size_t)-1) return L""; // Change the allocation method depending on the length of the string if (len < _MAX_SIZE_FOR_STACK_) { // Without multi-thread, we can use static storage static wchar_t tmp[_MAX_SIZE_FOR_STACK_]; len = mbstowcs(tmp, iStr.c_str(), len + 1); return tmp; } else { wchar_t *tmp = new wchar_t[len + 1]; len = mbstowcs(tmp, iStr.c_str(), len + 1); wstring res = tmp; delete[] tmp; return res; } #endif } string convertToMb(const wstring& iWStr) { #ifdef WIN32 const unsigned int size = iWStr.size() * 4; if (size == 0) return ""; char buf[size]; int res = WideCharToMultiByte(CP_OEMCP, 0, iWStr.c_str(), iWStr.size(), buf, size, NULL, NULL); if (res == 0) { // Retrieve the system error message for the last-error code throw DicException("convertToMb: WideCharToMultiByte failed: " + GetWin32Error()); } return string(buf, res); #else // Get the needed length (we _can't_ use wstring::size()) size_t len = wcstombs(NULL, iWStr.c_str(), 0); if (len == (size_t)-1) return ""; // Change the allocation method depending on the length of the string if (len < _MAX_SIZE_FOR_STACK_) { // Without multi-thread, we can use static storage static char tmp[_MAX_SIZE_FOR_STACK_]; len = wcstombs(tmp, iWStr.c_str(), len + 1); return tmp; } else { char *tmp = new char[len + 1]; len = wcstombs(tmp, iWStr.c_str(), len + 1); string res = tmp; delete[] tmp; return res; } #endif } #undef _MAX_SIZE_FOR_STACK_ string convertToMb(wchar_t iWChar) { return convertToMb(wstring(1, iWChar)); } string truncString(const string &iStr, unsigned int iMaxWidth) { // Heuristic: the width of a character cannot exceed the number of // bytes used to represent it (even in UTF-8) if (iStr.size() <= iMaxWidth) return iStr; return truncAndConvert(convertToWc(iStr), iMaxWidth); } string truncAndConvert(const wstring &iWstr, unsigned int iMaxWidth) { unsigned int width = 0; unsigned int pos; for (pos = 0; pos < iWstr.size(); ++pos) { int n = wcwidth(iWstr[pos]); if (n == -1) { ostringstream ss; // XXX: Should we throw an exception instead? Just ignore the problem? #if 0 ss << "truncAndConvert: non printable character: " << iWstr[pos]; cerr << ss.str() << endl;; //throw DicException(ss.str()); #endif return convertToMb(iWstr); } if (width + n > iMaxWidth) break; width += n; } return convertToMb(iWstr.substr(0, pos)); } string truncOrPad(const string &iStr, unsigned int iWidth, char iChar) { wstring wstr = convertToWc(iStr); unsigned int width = 0; unsigned int pos; for (pos = 0; pos < wstr.size(); ++pos) { int n = wcwidth(wstr[pos]); if (n == -1) { ostringstream ss; // XXX: Should we throw an exception instead? Just ignore the problem? #if 0 ss << "truncAndConvert: non printable character: " << wstr[pos]; cerr << ss.str() << endl;; //throw DicException(ss.str()); #endif return convertToMb(wstr); } if (width + n > iWidth) break; width += n; } if (iWidth > width) return convertToMb(wstr.substr(0, pos)) + string(iWidth - width, iChar); else return convertToMb(wstr.substr(0, pos)); } string padAndConvert(const wstring &iWstr, unsigned int iLength, bool iLeftPad, char c) { int width = 0; for (unsigned int i = 0; i < iWstr.size(); ++i) { int n = wcwidth(iWstr[i]); if (n == -1) { ostringstream ss; // XXX: Should we throw an exception instead? Just ignore the problem? #if 0 ss << "padAndConvert: non printable character: " << iWstr[i]; cerr << ss.str() << endl;; //throw DicException(ss.str()); #endif return convertToMb(iWstr); } width += n; } if ((unsigned int)width >= iLength) return convertToMb(iWstr); else { // Padding is needed string s(iLength - width, c); if (iLeftPad) return s + convertToMb(iWstr); else return convertToMb(iWstr) + s; } } string centerAndConvert(const wstring &iWstr, unsigned int iLength, char c) { int width = 0; for (unsigned int i = 0; i < iWstr.size(); ++i) { int n = wcwidth(iWstr[i]); if (n == -1) { ostringstream ss; // XXX: Should we throw an exception instead? Just ignore the problem? #if 0 ss << "padAndConvert: non printable character: " << iWstr[i]; cerr << ss.str() << endl;; //throw DicException(ss.str()); #endif return convertToMb(iWstr); } width += n; } if ((unsigned int)width >= iLength) return convertToMb(iWstr); else { // Padding is needed string s((iLength - width) / 2, c); string res = s + convertToMb(iWstr) + s; // If the string cannot be centered perfectly, pad again // (on the left if iLength is even, on the right otherwise: // this tends to align numbers of 1 or 2 digits in a nice way) // Note: if needed, we could add the iLeftPad argument if ((iLength - width) % 2) { if (iLength % 2) res.append(1, c); else res.insert(res.begin(), c); } return res; } } unsigned int readFromUTF8(wchar_t *oString, unsigned int iWideSize, const char *iBuffer, unsigned int iBufSize, const string &iContext) { #ifdef WIN32 if (iBuffer == NULL || *iBuffer == '\0') { return 0; } int res = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, iBuffer, iBufSize, oString, iWideSize); if (res == 0) { // Retrieve the system error message for the last-error code throw DicException("readFromUTF8: MultiByteToWideChar failed (" + iContext + "): " + GetWin32Error()); } return res; #else iconv_t handle = iconv_open("WCHAR_T", "UTF-8"); if (handle == (iconv_t)(-1)) throw DicException("readFromUTF8: iconv_open failed"); size_t inChars = iBufSize; size_t outChars = iWideSize * sizeof(wchar_t); // Use the ICONV_CONST trick because the declaration of iconv() // differs depending on the implementations... ICONV_CONST char *in = const_cast(iBuffer); char *out = (char*)oString; size_t res = iconv(handle, &in, &inChars, &out, &outChars); iconv_close(handle); // Problem during encoding conversion? if (res == (size_t)(-1)) { throw DicException("readFromUTF8: iconv failed (" + iContext + "): " + string(strerror(errno))); } return iWideSize - outChars / sizeof(wchar_t); #endif } wstring readFromUTF8(const char *iBuffer, unsigned int iBufSize, const string &iContext) { // Temporary buffer for output // We will have at most as many characters as in the UTF-8 string wchar_t *wideBuf = new wchar_t[iBufSize]; unsigned int number; try { number = readFromUTF8(wideBuf, iBufSize, iBuffer, iBufSize, iContext); } catch (...) { // Make sure not to leak delete[] wideBuf; throw; } // Copy the string wstring res(wideBuf, number); delete[] wideBuf; return res; } unsigned int writeInUTF8(const wstring &iWString, char *oBuffer, unsigned int iBufSize, const string &iContext) { #ifdef WIN32 if (iWString.empty()) { *oBuffer = '\0'; return 0; } int res = WideCharToMultiByte(CP_UTF8, 0, iWString.c_str(), iWString.size(), oBuffer, iBufSize, NULL, NULL); if (res == 0) { // Retrieve the system error message for the last-error code throw DicException("writeInUTF8: WideCharToMultiByte failed (" + iContext + "): " + GetWin32Error()); } return res; #else iconv_t handle = iconv_open("UTF-8", "WCHAR_T"); if (handle == (iconv_t)(-1)) throw DicException("writeInUTF8: iconv_open failed"); size_t length = iWString.size(); size_t inChars = sizeof(wchar_t) * length; size_t outChars = iBufSize; // Use the ICONV_CONST trick because the declaration of iconv() // differs depending on the implementations... // FIXME: bonus ugliness for doing 2 casts at once, and accessing string // internals... ICONV_CONST char *in = (ICONV_CONST char*)(&iWString[0]); char *out = oBuffer; size_t res = iconv(handle, &in, &inChars, &out, &outChars); iconv_close(handle); // Problem during encoding conversion? if (res == (size_t)(-1)) { throw DicException("writeInUTF8: iconv failed (" + iContext + "): " + string(strerror(errno))); } // Return the number of written bytes return iBufSize - outChars; #endif } string writeInUTF8(const wstring &iWString, const string &iContext) { // Temporary buffer for output // Each character will take at most 4 bytes in the UTF-8 string unsigned int bufSize = iWString.size() * 4; char *buf = new char[bufSize]; unsigned int number; try { number = writeInUTF8(iWString, buf, bufSize, iContext); } catch (...) { // Make sure not to leak delete[] buf; throw; } // Copy the string string res(buf, number); delete[] buf; return res; }