mirror of
https://github.com/jezhiggins/arabica
synced 2025-01-15 15:40:56 +01:00
*** empty log message ***
This commit is contained in:
parent
e694657457
commit
b2e29e67ac
17 changed files with 277 additions and 424 deletions
20
Arabica.sln
20
Arabica.sln
|
@ -1,17 +1,19 @@
|
||||||
Microsoft Visual Studio Solution File, Format Version 7.00
|
Microsoft Visual Studio Solution File, Format Version 7.00
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DOM", "DOM\DOM.vcproj", "{AFD0FD18-3D55-4CEC-A242-EA290EBBF171}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DOM", "DOM\DOM.vcproj", "{AFD0FD18-3D55-4CEC-A242-EA290EBBF171}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SAX2DOM_test", "EXAMPLES\SAX2DOM\SAX2DOM_test.vcproj", "{E5157BA4-96A1-4D7F-B895-8C9A32F26BB5}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_SAX2DOMTests", "EXAMPLES\SAX2DOM\SAX2DOM_test.vcproj", "{E5157BA4-96A1-4D7F-B895-8C9A32F26BB5}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SAXlib", "SAX\SAX.vcproj", "{884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ArabicaLib", "SAX\SAX.vcproj", "{884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SimpleHandler", "examples\SAX\SimpleHandler.vcproj", "{16475ED0-2906-429B-9E73-F2BF2929F6E9}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_SAXSimpleHandler", "examples\SAX\SimpleHandler.vcproj", "{16475ED0-2906-429B-9E73-F2BF2929F6E9}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Writer", "EXAMPLES\SAX\Writer.vcproj", "{3C6CBC24-07D4-4DE8-A1DF-592C3BC77C56}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_SAXWriter", "EXAMPLES\SAX\Writer.vcproj", "{3C6CBC24-07D4-4DE8-A1DF-592C3BC77C56}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pyx", "examples\SAX\pyx.vcproj", "{AE33D6D1-0F57-4E97-90AE-696854C5AE71}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_SAXPyx", "examples\SAX\pyx.vcproj", "{AE33D6D1-0F57-4E97-90AE-696854C5AE71}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DOMWriter", "examples\SAX2DOM\DOMWriter.vcproj", "{C1CF7801-1681-4F15-8D71-BBC814805AF2}"
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_DOMWriter", "examples\SAX2DOM\DOMWriter.vcproj", "{C1CF7801-1681-4F15-8D71-BBC814805AF2}"
|
||||||
|
EndProject
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_UtilsTranscode", "examples\Utils\transcode.vcproj", "{436B423B-BF20-4B2E-A187-604AF391FBE2}"
|
||||||
EndProject
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfiguration) = preSolution
|
GlobalSection(SolutionConfiguration) = preSolution
|
||||||
|
@ -23,6 +25,8 @@ Global
|
||||||
{16475ED0-2906-429B-9E73-F2BF2929F6E9}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
{16475ED0-2906-429B-9E73-F2BF2929F6E9}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
||||||
{3C6CBC24-07D4-4DE8-A1DF-592C3BC77C56}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
{3C6CBC24-07D4-4DE8-A1DF-592C3BC77C56}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
||||||
{AE33D6D1-0F57-4E97-90AE-696854C5AE71}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
{AE33D6D1-0F57-4E97-90AE-696854C5AE71}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
||||||
|
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
||||||
|
{436B423B-BF20-4B2E-A187-604AF391FBE2}.0 = {884490E3-E4B3-43BE-A88B-7FA9EA4E16AB}
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(ProjectConfiguration) = postSolution
|
GlobalSection(ProjectConfiguration) = postSolution
|
||||||
{AFD0FD18-3D55-4CEC-A242-EA290EBBF171}.Debug.ActiveCfg = Debug|Win32
|
{AFD0FD18-3D55-4CEC-A242-EA290EBBF171}.Debug.ActiveCfg = Debug|Win32
|
||||||
|
@ -53,6 +57,10 @@ Global
|
||||||
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Debug.Build.0 = Debug|Win32
|
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Debug.Build.0 = Debug|Win32
|
||||||
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Release.ActiveCfg = Release|Win32
|
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Release.ActiveCfg = Release|Win32
|
||||||
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Release.Build.0 = Release|Win32
|
{C1CF7801-1681-4F15-8D71-BBC814805AF2}.Release.Build.0 = Release|Win32
|
||||||
|
{436B423B-BF20-4B2E-A187-604AF391FBE2}.Debug.ActiveCfg = Debug|Win32
|
||||||
|
{436B423B-BF20-4B2E-A187-604AF391FBE2}.Debug.Build.0 = Debug|Win32
|
||||||
|
{436B423B-BF20-4B2E-A187-604AF391FBE2}.Release.ActiveCfg = Release|Win32
|
||||||
|
{436B423B-BF20-4B2E-A187-604AF391FBE2}.Release.Build.0 = Release|Win32
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
|
|
|
@ -330,10 +330,10 @@
|
||||||
RelativePath="..\XML\XMLCharacterClasses.h">
|
RelativePath="..\XML\XMLCharacterClasses.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\base64_codecvt.cpp">
|
RelativePath="..\Utils\base64codecvt.cpp">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\base64_codecvt.h">
|
RelativePath="..\Utils\base64codecvt.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\convert_adaptor.h">
|
RelativePath="..\Utils\convert_adaptor.h">
|
||||||
|
@ -342,26 +342,54 @@
|
||||||
RelativePath="..\Utils\convertstream.h">
|
RelativePath="..\Utils\convertstream.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\iso8859_1utf8_codecvt.cpp">
|
RelativePath="..\Utils\iso88591utf8codecvt.cpp">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\iso8859_1utf8_codecvt.h">
|
RelativePath="..\Utils\iso88591utf8codecvt.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\rot13_codecvt.cpp">
|
RelativePath="..\Utils\rot13codecvt.cpp">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\rot13_codecvt.h">
|
RelativePath="..\Utils\rot13codecvt.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\socket_stream.h">
|
RelativePath="..\Utils\socket_stream.h">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\utf16utf8_codecvt.cpp">
|
RelativePath="..\Utils\utf16utf8codecvt.cpp">
|
||||||
</File>
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\Utils\utf16utf8_codecvt.h">
|
RelativePath="..\Utils\utf16utf8codecvt.h">
|
||||||
</File>
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\utf8iso88591codecvt.cpp">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\utf8iso88591codecvt.h">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\utf8utf16codecvt.cpp">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\utf8utf16codecvt.h">
|
||||||
|
</File>
|
||||||
|
<Filter
|
||||||
|
Name="impl"
|
||||||
|
Filter="">
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\impl\iso88591_utf8.cpp">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\impl\iso88591_utf8.h">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\impl\utf16_utf8.cpp">
|
||||||
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\Utils\impl\utf16_utf8.h">
|
||||||
|
</File>
|
||||||
|
</Filter>
|
||||||
</Filter>
|
</Filter>
|
||||||
<File
|
<File
|
||||||
RelativePath=".\ParserConfig.S">
|
RelativePath=".\ParserConfig.S">
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <Utils/convertstream.h>
|
#include <Utils/convertstream.h>
|
||||||
#include <Utils/utf16utf8_codecvt.h>
|
#include <Utils/utf8utf16codecvt.h>
|
||||||
|
|
||||||
namespace SAX
|
namespace SAX
|
||||||
{
|
{
|
||||||
|
@ -74,9 +74,9 @@ public:
|
||||||
|
|
||||||
default_string_adaptor() :
|
default_string_adaptor() :
|
||||||
#if !(defined _MSC_VER) || !(_MSC_VER < 1300)
|
#if !(defined _MSC_VER) || !(_MSC_VER < 1300)
|
||||||
loc_(std::locale(), new utf16utf8_codecvt()),
|
loc_(std::locale(), new utf8utf16codecvt()),
|
||||||
#else
|
#else
|
||||||
loc_(std::_Addfac(std::locale(), new utf16utf8_codecvt)),
|
loc_(std::_Addfac(std::locale(), new utf8utf16codecvt)),
|
||||||
#endif
|
#endif
|
||||||
n_(),
|
n_(),
|
||||||
w_()
|
w_()
|
||||||
|
@ -138,9 +138,9 @@ public:
|
||||||
|
|
||||||
default_string_adaptor() :
|
default_string_adaptor() :
|
||||||
#if !(defined _MSC_VER) || !(_MSC_VER < 1300)
|
#if !(defined _MSC_VER) || !(_MSC_VER < 1300)
|
||||||
loc_(std::locale(), new utf16utf8_codecvt()),
|
loc_(std::locale(), new utf8utf16codecvt()),
|
||||||
#else
|
#else
|
||||||
loc_(std::_Addfac(std::locale(), new utf16utf8_codecvt)),
|
loc_(std::_Addfac(std::locale(), new utf8utf16codecvt)),
|
||||||
#endif
|
#endif
|
||||||
n_(),
|
n_(),
|
||||||
w_()
|
w_()
|
||||||
|
|
|
@ -48,7 +48,7 @@ int iso88591utf8codecvt::do_length(const std::mbstate_t&,
|
||||||
while((from_next < end) && (count < max))
|
while((from_next < end) && (count < max))
|
||||||
{
|
{
|
||||||
unsigned char fn = static_cast<unsigned char>(*from_next);
|
unsigned char fn = static_cast<unsigned char>(*from_next);
|
||||||
if(fn && 0x80)
|
if(fn & 0x80)
|
||||||
++count;
|
++count;
|
||||||
++count;
|
++count;
|
||||||
++from_next;
|
++from_next;
|
||||||
|
|
|
@ -1,171 +0,0 @@
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
// $Id$
|
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
#include "iso8859_1utf8_codecvt.h"
|
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
// This facet converts from ISO8859:1 (Latin 1) chars to UTF-8 encoded chars.
|
|
||||||
//
|
|
||||||
// Some of this code is derived from work done by Ken Thompson,
|
|
||||||
// provided to the X/Open Group.
|
|
||||||
|
|
||||||
struct Tab
|
|
||||||
{
|
|
||||||
char char_mask;
|
|
||||||
char char_value;
|
|
||||||
int shift;
|
|
||||||
wchar_t wide_mask;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const Tab tab[] =
|
|
||||||
{
|
|
||||||
{ char(0x80), char(0x00), 0*6, 0x7F, }, // 1 byte sequence
|
|
||||||
{ char(0xE0), char(0xC0), 1*6, 0x7FF, }, // 2 byte sequence
|
|
||||||
{ char(0xF0), char(0xE0), 2*6, 0xFFFF, }, // 3 byte sequence
|
|
||||||
{ 0, 0, 0, 0, } // end of table
|
|
||||||
};
|
|
||||||
|
|
||||||
iso8859_1utf8_codecvt::~iso8859_1utf8_codecvt()
|
|
||||||
{
|
|
||||||
} // ~iso8859_1utf8_codecvt
|
|
||||||
|
|
||||||
std::codecvt_base::result iso8859_1utf8_codecvt::do_in(std::mbstate_t& /* state */,
|
|
||||||
const char* from,
|
|
||||||
const char* from_end,
|
|
||||||
const char*& from_next,
|
|
||||||
char* to,
|
|
||||||
char* to_limit,
|
|
||||||
char*& to_next) const
|
|
||||||
{
|
|
||||||
from_next = from;
|
|
||||||
to_next = to;
|
|
||||||
|
|
||||||
while(from_next < from_end)
|
|
||||||
{
|
|
||||||
for(const Tab *t = tab; t->char_mask; t++)
|
|
||||||
{
|
|
||||||
unsigned char fn = static_cast<unsigned char>(*from_next);
|
|
||||||
if(fn > t->wide_mask )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// is there enough room in outbuffer?
|
|
||||||
if(to_next + (t - tab) + 1 >= to_limit)
|
|
||||||
return std::codecvt_base::partial;
|
|
||||||
|
|
||||||
int c = t->shift;
|
|
||||||
*to_next++ = static_cast<char>(t->char_value | (fn >> c));
|
|
||||||
while(c > 0)
|
|
||||||
{
|
|
||||||
c -= 6;
|
|
||||||
*to_next++ = static_cast<char>(0x80 | ((fn >> c) & 0x3F));
|
|
||||||
} // while(c > 0)
|
|
||||||
break;
|
|
||||||
} // for(Tab *t = tab; t->char_mask; t++)
|
|
||||||
++from_next;
|
|
||||||
} // while(from_next < from_end)
|
|
||||||
|
|
||||||
return std::codecvt_base::ok;
|
|
||||||
} // do_out
|
|
||||||
|
|
||||||
std::codecvt_base::result iso8859_1utf8_codecvt::do_out(std::mbstate_t& /* state */,
|
|
||||||
const char* from,
|
|
||||||
const char* from_end,
|
|
||||||
const char*& from_next,
|
|
||||||
char* to,
|
|
||||||
char* to_limit,
|
|
||||||
char*& to_next) const
|
|
||||||
{
|
|
||||||
from_next = from;
|
|
||||||
to_next = to;
|
|
||||||
|
|
||||||
while((from_next < from_end) && (to_next < to_limit))
|
|
||||||
{
|
|
||||||
char start = *from_next;
|
|
||||||
wchar_t next = static_cast<unsigned char>(*from_next);
|
|
||||||
for(const Tab *t = tab; t->char_mask; t++)
|
|
||||||
{
|
|
||||||
if((start & t->char_mask) == t->char_value)
|
|
||||||
{
|
|
||||||
next &= t->wide_mask;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
from_next++;
|
|
||||||
next = (next << 6) | ((*from_next ^ 0x80) & 0xff);
|
|
||||||
} // for(Tab *t = tab; t->char_mask; t++)
|
|
||||||
|
|
||||||
if(next <= 0xFF)
|
|
||||||
*to_next = static_cast<char>(next);
|
|
||||||
else
|
|
||||||
*to_next = '?'; // error state!
|
|
||||||
|
|
||||||
++from_next;
|
|
||||||
++to_next;
|
|
||||||
} // while
|
|
||||||
|
|
||||||
return (from_next == from_end) ? std::codecvt_base::ok : std::codecvt_base::partial;
|
|
||||||
} // do_in
|
|
||||||
|
|
||||||
std::codecvt_base::result iso8859_1utf8_codecvt::do_unshift(std::mbstate_t& /* state */,
|
|
||||||
char* to,
|
|
||||||
char* /* to_limit */,
|
|
||||||
char*& to_next) const
|
|
||||||
{
|
|
||||||
to_next = to;
|
|
||||||
return noconv;
|
|
||||||
} // do_unshift
|
|
||||||
|
|
||||||
int iso8859_1utf8_codecvt::do_encoding() const throw()
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
} // do_encoding
|
|
||||||
|
|
||||||
bool iso8859_1utf8_codecvt::do_always_noconv() const throw()
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
} // do_always_noconv
|
|
||||||
|
|
||||||
int iso8859_1utf8_codecvt::do_length(const std::mbstate_t&,
|
|
||||||
const char* from,
|
|
||||||
const char* end,
|
|
||||||
size_t max) const
|
|
||||||
{
|
|
||||||
size_t count(0);
|
|
||||||
const char* from_next = from;
|
|
||||||
|
|
||||||
while((from_next < end) && (count < max))
|
|
||||||
{
|
|
||||||
if(!(*from_next & 0x80))
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
++from_next;
|
|
||||||
}
|
|
||||||
else if((*from_next&0xc0) == 0xc0)
|
|
||||||
{
|
|
||||||
if(from_next+2 < end)
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
from_next += 2;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if((*from_next&0xe0) == 0xe0)
|
|
||||||
{
|
|
||||||
if(from_next+3 < end)
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
from_next += 3;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} // while
|
|
||||||
|
|
||||||
return (from_next-from);
|
|
||||||
} // do_length
|
|
||||||
|
|
||||||
int iso8859_1utf8_codecvt::do_max_length() const throw()
|
|
||||||
{
|
|
||||||
return 2;
|
|
||||||
} // do_max_length
|
|
||||||
|
|
||||||
// end of file
|
|
|
@ -1,49 +0,0 @@
|
||||||
#ifndef ISO8859_1utf8_codecvtH
|
|
||||||
#define ISO8859_1utf8_codecvtH
|
|
||||||
#include <locale>
|
|
||||||
|
|
||||||
#if(_MSC_VER < 1300)
|
|
||||||
namespace std {
|
|
||||||
typedef ::mbstate_t mbstate_t;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class iso8859_1utf8_codecvt : public std::codecvt<char, char, std::mbstate_t>
|
|
||||||
{
|
|
||||||
protected:
|
|
||||||
virtual ~iso8859_1utf8_codecvt();
|
|
||||||
|
|
||||||
virtual result do_out(std::mbstate_t&,
|
|
||||||
const char* from,
|
|
||||||
const char* from_end,
|
|
||||||
const char*& from_next,
|
|
||||||
char* to,
|
|
||||||
char* to_limit,
|
|
||||||
char*& to_next) const;
|
|
||||||
|
|
||||||
virtual result do_in(std::mbstate_t&,
|
|
||||||
const char* from,
|
|
||||||
const char* from_end,
|
|
||||||
const char*& from_next,
|
|
||||||
char* to,
|
|
||||||
char* to_limit,
|
|
||||||
char*& to_next) const;
|
|
||||||
|
|
||||||
virtual result do_unshift(std::mbstate_t&,
|
|
||||||
char*,
|
|
||||||
char*,
|
|
||||||
char*&) const;
|
|
||||||
|
|
||||||
virtual int do_encoding() const throw();
|
|
||||||
|
|
||||||
virtual bool do_always_noconv() const throw();
|
|
||||||
|
|
||||||
virtual int do_length(const std::mbstate_t&,
|
|
||||||
const char* from,
|
|
||||||
const char* end,
|
|
||||||
size_t max) const;
|
|
||||||
|
|
||||||
virtual int do_max_length() const throw();
|
|
||||||
}; // class iso8859_1utf8_codecvt
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -1,166 +0,0 @@
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
// $Id$
|
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
#include "utf16utf8_codecvt.h"
|
|
||||||
//---------------------------------------------------------------------------
|
|
||||||
// This facet converts from wide chars to char using the
|
|
||||||
// FSS-UTF (UCS2) encoding.
|
|
||||||
//
|
|
||||||
// Some of this code is derived from work done by Ken Thompson,
|
|
||||||
// provided to the X/Open Group.
|
|
||||||
|
|
||||||
struct Tab
|
|
||||||
{
|
|
||||||
char char_mask;
|
|
||||||
char char_value;
|
|
||||||
int shift;
|
|
||||||
wchar_t wide_mask;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const Tab tab[] =
|
|
||||||
{
|
|
||||||
{ char(0x80), char(0x00), 0*6, 0x7F, }, // 1 byte sequence
|
|
||||||
{ char(0xE0), char(0xC0), 1*6, 0x7FF, }, // 2 byte sequence
|
|
||||||
{ char(0xF0), char(0xE0), 2*6, 0xFFFF, }, // 3 byte sequence
|
|
||||||
{ 0, 0, 0, 0, } // end of table
|
|
||||||
};
|
|
||||||
|
|
||||||
utf16utf8_codecvt::~utf16utf8_codecvt()
|
|
||||||
{
|
|
||||||
} // ~utf16utf8_codecvt
|
|
||||||
|
|
||||||
std::codecvt_base::result utf16utf8_codecvt::do_out(std::mbstate_t& /* state */,
|
|
||||||
const wchar_t* from,
|
|
||||||
const wchar_t* from_end,
|
|
||||||
const wchar_t*& from_next,
|
|
||||||
char* to,
|
|
||||||
char* to_limit,
|
|
||||||
char*& to_next) const
|
|
||||||
{
|
|
||||||
from_next = from;
|
|
||||||
to_next = to;
|
|
||||||
|
|
||||||
while(from_next < from_end)
|
|
||||||
{
|
|
||||||
for(const Tab *t = tab; t->char_mask; t++)
|
|
||||||
{
|
|
||||||
if(*from_next > t->wide_mask )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// is there enough room in outbuffer?
|
|
||||||
if(to_next + (t - tab) + 1 >= to_limit)
|
|
||||||
return std::codecvt_base::partial;
|
|
||||||
|
|
||||||
int c = t->shift;
|
|
||||||
*to_next++ = static_cast<char>(t->char_value | (*from_next >> c));
|
|
||||||
while(c > 0)
|
|
||||||
{
|
|
||||||
c -= 6;
|
|
||||||
*to_next++ = static_cast<char>(0x80 | ((*from_next >> c) & 0x3F));
|
|
||||||
} // while(c > 0)
|
|
||||||
break;
|
|
||||||
} // for(Tab *t = tab; t->char_mask; t++)
|
|
||||||
++from_next;
|
|
||||||
} // while(from_next < from_end)
|
|
||||||
|
|
||||||
return std::codecvt_base::ok;
|
|
||||||
} // do_out
|
|
||||||
|
|
||||||
std::codecvt_base::result utf16utf8_codecvt::do_in(std::mbstate_t& /* state */,
|
|
||||||
const char* from,
|
|
||||||
const char* from_end,
|
|
||||||
const char*& from_next,
|
|
||||||
wchar_t* to,
|
|
||||||
wchar_t* to_limit,
|
|
||||||
wchar_t*& to_next) const
|
|
||||||
{
|
|
||||||
from_next = from;
|
|
||||||
to_next = to;
|
|
||||||
|
|
||||||
while((from_next < from_end) && (to_next < to_limit))
|
|
||||||
{
|
|
||||||
char start = *from_next;
|
|
||||||
*to_next = static_cast<unsigned char>(*from_next);
|
|
||||||
for(const Tab *t = tab; t->char_mask; t++)
|
|
||||||
{
|
|
||||||
if((start & t->char_mask) == t->char_value)
|
|
||||||
{
|
|
||||||
*to_next &= t->wide_mask;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
from_next++;
|
|
||||||
*to_next = (*to_next << 6) | ((*from_next ^ 0x80) & 0xff);
|
|
||||||
} // for(Tab *t = tab; t->char_mask; t++)
|
|
||||||
|
|
||||||
++from_next;
|
|
||||||
++to_next;
|
|
||||||
} // while
|
|
||||||
|
|
||||||
return (from_next == from_end) ? std::codecvt_base::ok : std::codecvt_base::partial;
|
|
||||||
} // do_in
|
|
||||||
|
|
||||||
std::codecvt_base::result utf16utf8_codecvt::do_unshift(std::mbstate_t& /* state */,
|
|
||||||
char* to,
|
|
||||||
char* /* to_limit */,
|
|
||||||
char*& to_next) const
|
|
||||||
{
|
|
||||||
to_next = to;
|
|
||||||
return noconv;
|
|
||||||
} // do_unshift
|
|
||||||
|
|
||||||
int utf16utf8_codecvt::do_encoding() const throw()
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
} // do_encoding
|
|
||||||
|
|
||||||
bool utf16utf8_codecvt::do_always_noconv() const throw()
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
} // do_always_noconv
|
|
||||||
|
|
||||||
int utf16utf8_codecvt::do_length(const std::mbstate_t&,
|
|
||||||
const char* from,
|
|
||||||
const char* end,
|
|
||||||
size_t max) const
|
|
||||||
{
|
|
||||||
size_t count(0);
|
|
||||||
const char* from_next = from;
|
|
||||||
|
|
||||||
while((from_next < end) && (count < max))
|
|
||||||
{
|
|
||||||
if(!(*from_next & 0x80))
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
++from_next;
|
|
||||||
}
|
|
||||||
else if((*from_next&0xc0) == 0xc0)
|
|
||||||
{
|
|
||||||
if(from_next+2 < end)
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
from_next += 2;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if((*from_next&0xe0) == 0xe0)
|
|
||||||
{
|
|
||||||
if(from_next+3 < end)
|
|
||||||
{
|
|
||||||
++count;
|
|
||||||
from_next += 3;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} // while
|
|
||||||
|
|
||||||
return (from_next-from);
|
|
||||||
} // do_length
|
|
||||||
|
|
||||||
int utf16utf8_codecvt::do_max_length() const throw()
|
|
||||||
{
|
|
||||||
return 3;
|
|
||||||
} // do_max_length
|
|
||||||
|
|
||||||
// end of file
|
|
63
Utils/utf16utf8codecvt.cpp
Normal file
63
Utils/utf16utf8codecvt.cpp
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
// $Id$
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
#include "utf16utf8codecvt.h"
|
||||||
|
#include "impl/utf16_utf8.h"
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
// This facet converts from wide chars to char using the
|
||||||
|
// FSS-UTF (UCS2) encoding.
|
||||||
|
//
|
||||||
|
|
||||||
|
std::codecvt_base::result utf16utf8codecvt::do_out(std::mbstate_t& /* state */,
|
||||||
|
const char* from,
|
||||||
|
const char* from_end,
|
||||||
|
const char*& from_next,
|
||||||
|
wchar_t* to,
|
||||||
|
wchar_t* to_limit,
|
||||||
|
wchar_t*& to_next) const
|
||||||
|
{
|
||||||
|
return ArabicaInternal::utf8_2_utf16(from, from_end, from_next, to, to_limit, to_next);
|
||||||
|
} // do_out
|
||||||
|
|
||||||
|
std::codecvt_base::result utf16utf8codecvt::do_in(std::mbstate_t& /* state */,
|
||||||
|
const wchar_t* from,
|
||||||
|
const wchar_t* from_end,
|
||||||
|
const wchar_t*& from_next,
|
||||||
|
char* to,
|
||||||
|
char* to_limit,
|
||||||
|
char*& to_next) const
|
||||||
|
{
|
||||||
|
return ArabicaInternal::utf16_2_utf8(from, from_end, from_next, to, to_limit, to_next);
|
||||||
|
} // do_in
|
||||||
|
|
||||||
|
std::codecvt_base::result utf16utf8codecvt::do_unshift(std::mbstate_t& /* state */,
|
||||||
|
wchar_t* to,
|
||||||
|
wchar_t* /* to_limit */,
|
||||||
|
wchar_t*& to_next) const
|
||||||
|
{
|
||||||
|
to_next = to;
|
||||||
|
return noconv;
|
||||||
|
} // do_unshift
|
||||||
|
|
||||||
|
int utf16utf8codecvt::do_length(const std::mbstate_t&,
|
||||||
|
const wchar_t* from,
|
||||||
|
const wchar_t* end,
|
||||||
|
size_t max) const
|
||||||
|
{
|
||||||
|
size_t count(0);
|
||||||
|
const wchar_t* from_next = from;
|
||||||
|
|
||||||
|
while((from_next < end) && (count < max))
|
||||||
|
{
|
||||||
|
if(*from_next > 0x7FF)
|
||||||
|
++count;
|
||||||
|
if(*from_next > 0x7F)
|
||||||
|
++count;
|
||||||
|
++count;
|
||||||
|
++from_next;
|
||||||
|
} // while
|
||||||
|
|
||||||
|
return (from_next-from);
|
||||||
|
} // do_length
|
||||||
|
|
||||||
|
// end of file
|
61
Utils/utf16utf8codecvt.h
Normal file
61
Utils/utf16utf8codecvt.h
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
#ifndef ARABICA_UTF16UTF8_CODECVT_H
|
||||||
|
#define ARABICA_UTF16UTF8_CODECVT_H
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
// class utf16utf8codecvt
|
||||||
|
// This facet is a codecvt<char, wchar_t>: its internal characters are
|
||||||
|
// UTF-8 chars and its external characters are Unicode (UCS-2) wchar_ts.
|
||||||
|
//
|
||||||
|
// For the full guff on codecvts see section 22.2.1.5 of
|
||||||
|
// The C++ Standard (ISO/IEC 14882 to be pedantic).
|
||||||
|
//
|
||||||
|
// I got my information about UTF-8 from RFC 2044.
|
||||||
|
//
|
||||||
|
// $Id$
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
#include <locale>
|
||||||
|
|
||||||
|
// Compatibility shim: Microsoft compilers reporting _MSC_VER below 1300 get
// ::mbstate_t re-exported as std::mbstate_t.
// The macro is tested with defined() first because an undefined identifier
// evaluates to 0 inside #if; a bare (_MSC_VER < 1300) is therefore true on
// every non-Microsoft compiler and would inject the typedef into namespace
// std there as well.
#if defined(_MSC_VER) && (_MSC_VER < 1300)
namespace std {
  typedef ::mbstate_t mbstate_t;
}
#endif

// Code conversion facet whose internal character type is char and whose
// external character type is wchar_t.
class utf16utf8codecvt : public std::codecvt<char, wchar_t, std::mbstate_t>
{
protected:
  // Protected, so the facet cannot be destroyed directly by client code.
  virtual ~utf16utf8codecvt() { }

  // Internal -> external: reads chars from [from, from_end) and writes
  // wchar_ts into [to, to_limit).  from_next / to_next receive the positions
  // reached in the input and output ranges.
  virtual result do_out(std::mbstate_t&,
                        const char* from,
                        const char* from_end,
                        const char*& from_next,
                        wchar_t* to,
                        wchar_t* to_limit,
                        wchar_t*& to_next) const;

  // External -> internal: reads wchar_ts from [from, from_end) and writes
  // chars into [to, to_limit).  from_next / to_next receive the positions
  // reached in the input and output ranges.
  virtual result do_in(std::mbstate_t&,
                       const wchar_t* from,
                       const wchar_t* from_end,
                       const wchar_t*& from_next,
                       char* to,
                       char* to_limit,
                       char*& to_next) const;

  // Hook for terminating a shift sequence in the external (wchar_t) buffer.
  virtual result do_unshift(std::mbstate_t&,
                            wchar_t*,
                            wchar_t*,
                            wchar_t*&) const;

  // Always 0.
  virtual int do_encoding() const throw() { return 0; }

  // Always false: a conversion is required.
  virtual bool do_always_noconv() const throw() { return false; }

  // Number of wchar_ts in [from, end) that may be consumed while producing
  // at most max chars.
  virtual int do_length(const std::mbstate_t&,
                        const wchar_t* from,
                        const wchar_t* end,
                        size_t max) const;

  // Always 1.
  virtual int do_max_length() const throw() { return 1; }
}; // class utf16utf8codecvt
|
||||||
|
|
||||||
|
#endif
|
|
@ -43,7 +43,7 @@ protected:
|
||||||
const char* end,
|
const char* end,
|
||||||
size_t max) const;
|
size_t max) const;
|
||||||
|
|
||||||
virtual int do_max_length() const throw() { return 2; }
|
virtual int do_max_length() const throw() { return 3; }
|
||||||
}; // class utf8iso88591codecvt
|
}; // class utf8iso88591codecvt
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
79
Utils/utf8utf16codecvt.cpp
Normal file
79
Utils/utf8utf16codecvt.cpp
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
// $Id$
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
#include "utf8utf16codecvt.h"
|
||||||
|
#include "impl/utf16_utf8.h"
|
||||||
|
//---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
std::codecvt_base::result utf8utf16codecvt::do_out(std::mbstate_t& /* state */,
|
||||||
|
const wchar_t* from,
|
||||||
|
const wchar_t* from_end,
|
||||||
|
const wchar_t*& from_next,
|
||||||
|
char* to,
|
||||||
|
char* to_limit,
|
||||||
|
char*& to_next) const
|
||||||
|
{
|
||||||
|
return ArabicaInternal::utf16_2_utf8(from, from_end, from_next, to, to_limit, to_next);
|
||||||
|
} // do_out
|
||||||
|
|
||||||
|
std::codecvt_base::result utf8utf16codecvt::do_in(std::mbstate_t& /* state */,
|
||||||
|
const char* from,
|
||||||
|
const char* from_end,
|
||||||
|
const char*& from_next,
|
||||||
|
wchar_t* to,
|
||||||
|
wchar_t* to_limit,
|
||||||
|
wchar_t*& to_next) const
|
||||||
|
{
|
||||||
|
return ArabicaInternal::utf8_2_utf16(from, from_end, from_next, to, to_limit, to_next);
|
||||||
|
} // do_in
|
||||||
|
|
||||||
|
std::codecvt_base::result utf8utf16codecvt::do_unshift(std::mbstate_t& /* state */,
|
||||||
|
char* to,
|
||||||
|
char* /* to_limit */,
|
||||||
|
char*& to_next) const
|
||||||
|
{
|
||||||
|
to_next = to;
|
||||||
|
return noconv;
|
||||||
|
} // do_unshift
|
||||||
|
|
||||||
|
int utf8utf16codecvt::do_length(const std::mbstate_t&,
|
||||||
|
const char* from,
|
||||||
|
const char* end,
|
||||||
|
size_t max) const
|
||||||
|
{
|
||||||
|
size_t count(0);
|
||||||
|
const char* from_next = from;
|
||||||
|
|
||||||
|
while((from_next < end) && (count < max))
|
||||||
|
{
|
||||||
|
if(!(*from_next & 0x80))
|
||||||
|
{
|
||||||
|
++count;
|
||||||
|
++from_next;
|
||||||
|
}
|
||||||
|
else if((*from_next&0xc0) == 0xc0)
|
||||||
|
{
|
||||||
|
if(from_next+2 < end)
|
||||||
|
{
|
||||||
|
++count;
|
||||||
|
from_next += 2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if((*from_next&0xe0) == 0xe0)
|
||||||
|
{
|
||||||
|
if(from_next+3 < end)
|
||||||
|
{
|
||||||
|
++count;
|
||||||
|
from_next += 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} // while
|
||||||
|
|
||||||
|
return (from_next-from);
|
||||||
|
} // do_length
|
||||||
|
|
||||||
|
// end of file
|
|
@ -1,7 +1,7 @@
|
||||||
#ifndef utf16utf8_codecvtH
|
#ifndef ARABICA_UTF8UTF16_CODECVT_H
|
||||||
#define utf16utf8_codecvtH
|
#define ARABICA_UTF8UTF16_CODECVT_H
|
||||||
//---------------------------------------------------------------------------
|
//---------------------------------------------------------------------------
|
||||||
// class utf16tf8_codecvt
|
// class utf8utf16codecvt
|
||||||
// This facet converts from Unicode (UCS-2) wchar_ts to
|
// This facet converts from Unicode (UCS-2) wchar_ts to
|
||||||
// char using the UTF-8 encoding.
|
// char using the UTF-8 encoding.
|
||||||
//
|
//
|
||||||
|
@ -14,16 +14,16 @@
|
||||||
//---------------------------------------------------------------------------
|
//---------------------------------------------------------------------------
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#if(_MSC_VER < 1300)
|
||||||
namespace std {
|
namespace std {
|
||||||
typedef ::mbstate_t mbstate_t;
|
typedef ::mbstate_t mbstate_t;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class utf16utf8_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t>
|
class utf8utf16codecvt : public std::codecvt<wchar_t, char, std::mbstate_t>
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
virtual ~utf16utf8_codecvt();
|
virtual ~utf8utf16codecvt() { }
|
||||||
|
|
||||||
virtual result do_out(std::mbstate_t&,
|
virtual result do_out(std::mbstate_t&,
|
||||||
const wchar_t* from,
|
const wchar_t* from,
|
||||||
|
@ -46,16 +46,16 @@ protected:
|
||||||
char*,
|
char*,
|
||||||
char*&) const;
|
char*&) const;
|
||||||
|
|
||||||
virtual int do_encoding() const throw();
|
virtual int do_encoding() const throw() { return 0; }
|
||||||
|
|
||||||
virtual bool do_always_noconv() const throw();
|
virtual bool do_always_noconv() const throw() { return false; }
|
||||||
|
|
||||||
virtual int do_length(const std::mbstate_t&,
|
virtual int do_length(const std::mbstate_t&,
|
||||||
const char* from,
|
const char* from,
|
||||||
const char* end,
|
const char* end,
|
||||||
size_t max) const;
|
size_t max) const;
|
||||||
|
|
||||||
virtual int do_max_length() const throw();
|
virtual int do_max_length() const throw() { return 3; }
|
||||||
}; // class utf16utf8_codecvt
|
}; // class utf8utf16codecvt
|
||||||
|
|
||||||
#endif
|
#endif
|
|
@ -2,7 +2,7 @@
|
||||||
<VisualStudioProject
|
<VisualStudioProject
|
||||||
ProjectType="Visual C++"
|
ProjectType="Visual C++"
|
||||||
Version="7.00"
|
Version="7.00"
|
||||||
Name="SimpleHandler"
|
Name="example_SAXSimpleHandler"
|
||||||
SccProjectName=""
|
SccProjectName=""
|
||||||
SccLocalPath="">
|
SccLocalPath="">
|
||||||
<Platforms>
|
<Platforms>
|
||||||
|
@ -71,7 +71,7 @@
|
||||||
</Configuration>
|
</Configuration>
|
||||||
<Configuration
|
<Configuration
|
||||||
Name="Release|Win32"
|
Name="Release|Win32"
|
||||||
OutputDirectory=".\..\bin"
|
OutputDirectory=".\..\..\bin"
|
||||||
IntermediateDirectory=".\Release"
|
IntermediateDirectory=".\Release"
|
||||||
ConfigurationType="1"
|
ConfigurationType="1"
|
||||||
UseOfMFC="0"
|
UseOfMFC="0"
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
<VisualStudioProject
|
<VisualStudioProject
|
||||||
ProjectType="Visual C++"
|
ProjectType="Visual C++"
|
||||||
Version="7.00"
|
Version="7.00"
|
||||||
Name="Writer"
|
Name="example_SAXWriter"
|
||||||
SccProjectName=""
|
SccProjectName=""
|
||||||
SccLocalPath="">
|
SccLocalPath="">
|
||||||
<Platforms>
|
<Platforms>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
<VisualStudioProject
|
<VisualStudioProject
|
||||||
ProjectType="Visual C++"
|
ProjectType="Visual C++"
|
||||||
Version="7.00"
|
Version="7.00"
|
||||||
Name="pyx"
|
Name="example_SAXPyx"
|
||||||
SccProjectName=""
|
SccProjectName=""
|
||||||
SccLocalPath="">
|
SccLocalPath="">
|
||||||
<Platforms>
|
<Platforms>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
<VisualStudioProject
|
<VisualStudioProject
|
||||||
ProjectType="Visual C++"
|
ProjectType="Visual C++"
|
||||||
Version="7.00"
|
Version="7.00"
|
||||||
Name="DOMWriter"
|
Name="example_DOMWriter"
|
||||||
ProjectGUID="{C1CF7801-1681-4F15-8D71-BBC814805AF2}"
|
ProjectGUID="{C1CF7801-1681-4F15-8D71-BBC814805AF2}"
|
||||||
Keyword="Win32Proj">
|
Keyword="Win32Proj">
|
||||||
<Platforms>
|
<Platforms>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
<VisualStudioProject
|
<VisualStudioProject
|
||||||
ProjectType="Visual C++"
|
ProjectType="Visual C++"
|
||||||
Version="7.00"
|
Version="7.00"
|
||||||
Name="SAX2DOM_test"
|
Name="example_SAX2DOMTests"
|
||||||
SccProjectName=""
|
SccProjectName=""
|
||||||
SccLocalPath="">
|
SccLocalPath="">
|
||||||
<Platforms>
|
<Platforms>
|
||||||
|
@ -70,7 +70,7 @@
|
||||||
</Configuration>
|
</Configuration>
|
||||||
<Configuration
|
<Configuration
|
||||||
Name="Release|Win32"
|
Name="Release|Win32"
|
||||||
OutputDirectory=".\Release"
|
OutputDirectory=".\..\..\bin"
|
||||||
IntermediateDirectory=".\Release"
|
IntermediateDirectory=".\Release"
|
||||||
ConfigurationType="1"
|
ConfigurationType="1"
|
||||||
UseOfMFC="0"
|
UseOfMFC="0"
|
||||||
|
|
Loading…
Reference in a new issue