2003-08-29 22:23:42 +02:00
|
|
|
//---------------------------------------------------------------------------
|
|
|
|
// $Id$
|
|
|
|
//---------------------------------------------------------------------------
|
2003-09-02 13:23:52 +02:00
|
|
|
#include "ucs2_utf8.h"
|
2003-08-29 22:23:42 +02:00
|
|
|
//---------------------------------------------------------------------------
|
|
|
|
// Some of this code is derived from work done by Ken Thompson,
|
|
|
|
// provided to the X/Open Group.
|
|
|
|
//
|
|
|
|
// I got my information about UTF-8 from RFC 2044.
|
|
|
|
|
|
|
|
|
|
|
|
namespace {
  // One row per UTF-8 sequence length.  The row index (0, 1, 2) equals the
  // number of continuation bytes that follow the lead byte.
  struct Tab
  {
    unsigned char char_mask;   // bits of the lead byte that identify the sequence length
    unsigned char char_value;  // value of those bits for this sequence length
    int shift;                 // right-shift that brings the lead byte's payload bits down
    unsigned long wide_mask;   // largest wide value encodable with this sequence length
  };

  // The masks are written as plain integer constants rather than char(0x80)
  // etc.: where char is signed, char(0x80) is negative and brace-initialising
  // an unsigned char from it is a narrowing conversion (ill-formed since C++11).
  static const Tab tab[] =
  {
    { 0x80, 0x00, 0*6, 0x7F,   },   // 1 byte sequence
    { 0xE0, 0xC0, 1*6, 0x7FF,  },   // 2 byte sequence
    { 0xF0, 0xE0, 2*6, 0xFFFF, },   // 3 byte sequence
    { 0,    0,    0,   0,      }    // end of table (char_mask == 0 terminates scans)
  };
} // namespace
|
|
|
|
|
2003-09-11 15:01:00 +02:00
|
|
|
std::codecvt_base::result Arabica::Internal::ucs2_2_utf8(
|
2003-08-29 22:23:42 +02:00
|
|
|
const wchar_t* from, const wchar_t* from_end, const wchar_t*& from_next,
|
|
|
|
char* to, char* to_limit, char*& to_next)
|
|
|
|
{
|
|
|
|
from_next = from;
|
|
|
|
to_next = to;
|
|
|
|
|
|
|
|
while(from_next < from_end)
|
|
|
|
{
|
2003-09-18 10:19:01 +02:00
|
|
|
unsigned long fn = static_cast<unsigned long >(*from_next);
|
|
|
|
|
2003-08-29 22:23:42 +02:00
|
|
|
for(const Tab *t = tab; t->char_mask; t++)
|
|
|
|
{
|
2003-09-18 10:19:01 +02:00
|
|
|
if(fn > t->wide_mask )
|
2003-08-29 22:23:42 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// is there enough room in outbuffer?
|
|
|
|
if(to_next + (t - tab) + 1 >= to_limit)
|
|
|
|
return std::codecvt_base::partial;
|
|
|
|
|
|
|
|
int c = t->shift;
|
2003-09-18 10:19:01 +02:00
|
|
|
*to_next++ = static_cast<char>(t->char_value | (fn >> c));
|
2003-08-29 22:23:42 +02:00
|
|
|
while(c > 0)
|
|
|
|
{
|
|
|
|
c -= 6;
|
2003-09-18 10:19:01 +02:00
|
|
|
*to_next++ = static_cast<char>(0x80 | ((fn >> c) & 0x3F));
|
2003-08-29 22:23:42 +02:00
|
|
|
} // while(c > 0)
|
|
|
|
break;
|
|
|
|
} // for(Tab *t = tab; t->char_mask; t++)
|
|
|
|
++from_next;
|
|
|
|
} // while(from_next < from_end)
|
|
|
|
|
|
|
|
return std::codecvt_base::ok;
|
2003-09-18 10:19:01 +02:00
|
|
|
} // ucs2_2_utf8
|
2003-08-29 22:23:42 +02:00
|
|
|
|
2003-09-11 15:01:00 +02:00
|
|
|
std::codecvt_base::result Arabica::Internal::utf8_2_ucs2(
|
2003-08-29 22:23:42 +02:00
|
|
|
const char* from, const char* from_end, const char*& from_next,
|
|
|
|
wchar_t* to, wchar_t* to_limit, wchar_t*& to_next)
|
|
|
|
{
|
|
|
|
from_next = from;
|
|
|
|
to_next = to;
|
|
|
|
|
|
|
|
while((from_next < from_end) && (to_next < to_limit))
|
|
|
|
{
|
2003-09-18 10:19:01 +02:00
|
|
|
unsigned char start = static_cast<unsigned char>(*from_next);
|
|
|
|
|
|
|
|
const Tab *t = tab;
|
|
|
|
for(; t->char_mask; ++t)
|
2003-08-29 22:23:42 +02:00
|
|
|
{
|
|
|
|
if((start & t->char_mask) == t->char_value)
|
|
|
|
break;
|
2003-09-18 10:19:01 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if((from_next + (t - tab)) >= from_end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
unsigned long wide_mask = t->wide_mask;
|
|
|
|
|
|
|
|
*to_next = start;
|
|
|
|
for(; t != tab; --t)
|
|
|
|
{
|
2003-08-29 22:23:42 +02:00
|
|
|
from_next++;
|
|
|
|
*to_next = (*to_next << 6) | ((*from_next ^ 0x80) & 0xff);
|
2003-09-18 10:19:01 +02:00
|
|
|
}
|
|
|
|
*to_next &= wide_mask;
|
2003-08-29 22:23:42 +02:00
|
|
|
|
|
|
|
++from_next;
|
|
|
|
++to_next;
|
|
|
|
} // while
|
|
|
|
|
|
|
|
return (from_next == from_end) ? std::codecvt_base::ok : std::codecvt_base::partial;
|
2003-09-18 10:19:01 +02:00
|
|
|
} // utf8_2_ucs2
|
2003-08-29 22:23:42 +02:00
|
|
|
// end of file
|