// Mirror of https://git.code.sf.net/p/newrpl/sources
// Synced 2024-11-16 19:51:25 +01:00
// 457 lines, 12 KiB, C
#include "utf8lib.h"
|
|
|
|
// UNICODE DATA FOR NFC NORMALIZATION
//
// The tables below are only declared here; their definitions live outside
// this file. Each array was previously declared with the `const` qualifier
// written twice, which is redundant and triggers compiler warnings, so the
// duplicate has been dropped (the declared type is unchanged).

// Run table walked by getCPInfo(); used_CCData bounds that walk.
extern const int used_CCData;
extern const unsigned int packed_CCData[];

// Per-code-point info bytes that getCPInfo() indexes for non-uniform runs.
// used_CCBytes is not referenced in this file.
extern const int used_CCBytes;
extern const unsigned char packed_CCBytes[];

// Range table walked by quickDecomp() when looking for one-element
// decompositions; used_singletonRanges bounds that walk.
extern const int used_singletonRanges;
extern const unsigned int packed_singletonRanges[];

// Replacement code points indexed through packed_singletonRanges.
// used_singletonData is not referenced in this file.
extern const int used_singletonData;
extern const unsigned int packed_singletonData[];

// Range table walked by quickDecomp() when looking for two-element
// decompositions; used_doubleRanges bounds that walk.
extern const int used_doubleRanges;
extern const unsigned int packed_doubleRanges[];

// Pairs of code points indexed through packed_doubleRanges.
// used_doubleData is not referenced in this file.
extern const int used_doubleData;
extern const unsigned int packed_doubleData[];

// Range table walked by getComposition(), keyed on the second character of
// a candidate pair; used_combiners bounds that walk.
extern const int used_combiners;
extern const unsigned int packed_combiners[];

// Offsets into packed_starterData, indexed through packed_combiners.
// used_starteroff is not referenced in this file.
extern const int used_starteroff;
extern const unsigned int packed_starters[];

// Per-combiner lists read by getComposition(): a count followed by
// (first character, composite) pairs.
// used_starterdata is not referenced in this file.
extern const int used_starterdata;
extern const unsigned int packed_starterData[];
|
|
|
|
// Shared scratch buffer of code points for the character being normalized.
// appendDecomp(), bubbleSort(), quickCompose() and utf82NFC() in this file
// all read and write it directly.
unsigned int unicodeBuffer[MAX_UNICODE_CHARACTER_LEN];
|
|
|
|
// DECODE A UTF8 CODE POINT AND RETURN ITS VALUE
//
// ptr: first byte of the sequence to decode.
// len: number of bytes that may be read starting at ptr.
//
// Returns the decoded value, or -1 when:
//   - len is less than 1 (nothing to decode),
//   - the sequence announced by the lead byte does not fit in len bytes,
//   - a continuation byte does not have the form 10xxxxxx,
//   - the lead byte does not start a 1- to 4-byte sequence.
// Overlong forms, surrogates and values above 0x10FFFF are NOT rejected.
int utf82Char(char *ptr, int len)
{
    // Read the bytes as unsigned so the result does not depend on whether
    // plain char is signed on the target.
    const unsigned char *b = (const unsigned char *)ptr;

    // An empty range has no lead byte that may be inspected.
    if(len < 1)
        return -1;

    // Single byte (ASCII)
    if(!(b[0] & 0x80))
        return b[0];

    // Two bytes: 110xxxxx 10xxxxxx
    if((b[0] & 0xe0) == 0xc0) {
        if(len < 2 || (b[1] & 0xc0) != 0x80)
            return -1;
        return (int)(((b[0] & 0x1fu) << 6) | (b[1] & 0x3fu));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if((b[0] & 0xf0) == 0xe0) {
        if(len < 3 || (b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
            return -1;
        return (int)(((b[0] & 0xfu) << 12) | ((b[1] & 0x3fu) << 6)
                | (b[2] & 0x3fu));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if((b[0] & 0xf8) == 0xf0) {
        if(len < 4 || (b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80
                || (b[3] & 0xc0) != 0x80)
            return -1;
        return (int)(((b[0] & 0x7u) << 18) | ((b[1] & 0x3fu) << 12)
                | ((b[2] & 0x3fu) << 6) | (b[3] & 0x3fu));
    }

    // THIS IS AN INVALID SEQUENCE
    return -1;
}
|
|
|
|
// SKIP A CODE POINT
//
// ptr: first byte of the code point to skip.
// len: number of bytes that may be read starting at ptr.
//
// Returns ptr unchanged when len < 1. Otherwise returns a pointer just past
// the first byte and, if that byte has its high bit set, past every
// continuation byte (10xxxxxx) that follows it. The result never exceeds
// ptr + len, and no byte at or beyond ptr + len is read.
char *utf8Skip(char *ptr, int len)
{
    if(len < 1)
        return ptr;

    char *limit = ptr + len;
    int multibyte = *ptr & 0x80;

    ++ptr;
    if(multibyte) {
        // Test the bound BEFORE dereferencing, so the byte one past the
        // permitted range is never touched.
        while(ptr < limit && (*ptr & 0xc0) == 0x80)
            ++ptr;
    }
    return ptr;
}
|
|
|
|
// ENCODE A CODE POINT AS UTF8, PACKED INTO ONE UNSIGNED INTEGER
//
// The first byte of the encoding occupies the least significant 8 bits of
// the result, the second byte the next 8 bits, and so on. Bytes that are not
// needed are left as zero, so only a 4-byte encoding fills the whole value.
//
// Returns (unsigned int)-1 when codepoint is above 0x10FFFF.
unsigned int Char2utf8(unsigned int codepoint)
{
    // OUT OF RANGE
    if(codepoint > 0x10ffff)
        return (unsigned int)-1;

    // ASCII ENCODES AS ITSELF
    if(codepoint < 0x80)
        return codepoint;

    // Continuation bytes, numbered from the END of the sequence
    const unsigned int tail0 = 0x80 | (codepoint & 0x3f);
    const unsigned int tail1 = 0x80 | ((codepoint >> 6) & 0x3f);
    const unsigned int tail2 = 0x80 | ((codepoint >> 12) & 0x3f);

    if(codepoint < 0x800) {
        const unsigned int lead = 0xc0 | ((codepoint >> 6) & 0x1f);
        return lead | (tail0 << 8);
    }

    if(codepoint < 0x10000) {
        const unsigned int lead = 0xe0 | ((codepoint >> 12) & 0xf);
        return lead | (tail1 << 8) | (tail0 << 16);
    }

    const unsigned int lead = 0xf0 | ((codepoint >> 18) & 0x7);
    return lead | (tail2 << 8) | (tail1 << 16) | (tail0 << 24);
}
|
|
|
|
// OPTIMIZED VERSION RETURNS NFC QUICK CHECK, CC AND COMPOSITION EXCLUSION PROPERTIES
|
|
// GIVEN A SINGLE CODE POINT
|
|
unsigned char getCPInfo(unsigned int cp)
|
|
{
|
|
int k;
|
|
unsigned int codept = 0;
|
|
int nchars;
|
|
for(k = 0; k < used_CCData; ++k) {
|
|
if(CCBYTE(packed_CCData[k]) != 0xff) {
|
|
nchars = LONG_NCHARS(packed_CCData[k]);
|
|
if(cp < codept + nchars)
|
|
return CCBYTE(packed_CCData[k]);
|
|
}
|
|
else {
|
|
nchars = NCHARS(packed_CCData[k]);
|
|
if(cp < codept + nchars)
|
|
return packed_CCBytes[TOFFSET(packed_CCData[k]) + cp - codept];
|
|
}
|
|
codept += nchars;
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
// *********************************************************************************
|
|
// HANGUL DECOMPOSITION - PORTED FROM UNICODE STANDARD EXAMPLE CHAPTER 3.12
|
|
// *********************************************************************************
|
|
|
|
// First code point of each range used by the Hangul arithmetic
#define SBase   0xAC00          // precomposed syllables
#define LBase   0x1100          // leading consonants (first element)
#define VBase   0x1161          // vowels (second element)
#define TBase   0x11A7          // one below the first trailing consonant

// Number of values in each range
#define LCount  19
#define VCount  21
#define TCount  28              // includes the "no trailing consonant" slot
#define NCount  (VCount * TCount)       // syllables per leading consonant
#define SCount  (LCount * NCount)       // total precomposed syllables
|
|
|
|
void quickDecomp(unsigned int cp, unsigned int *dec1, unsigned int *dec2,
|
|
unsigned int *dec3)
|
|
{
|
|
// SLOW VERSION - REPLACE WITH OPTIMIZED VERSION LATER
|
|
|
|
// ALGORITHMIC HANGUL DECOMPOSITIONS
|
|
int SIndex = cp - SBase;
|
|
if(SIndex >= 0 && SIndex < SCount) {
|
|
*dec1 = LBase + SIndex / NCount;
|
|
*dec2 = VBase + (SIndex % NCount) / TCount;
|
|
int T = TBase + SIndex % TCount;
|
|
if(T != TBase)
|
|
*dec3 = T;
|
|
else
|
|
*dec3 = -1;
|
|
return;
|
|
}
|
|
|
|
*dec3 = -1;
|
|
|
|
// TRY SINGLETONS FIRST
|
|
int k;
|
|
unsigned int codept = 0;
|
|
int nchars, off;
|
|
for(k = 0; k < used_singletonRanges; ++k) {
|
|
nchars = SING_LEN(packed_singletonRanges[k]);
|
|
off = SING_OFFSET(packed_singletonRanges[k]);
|
|
if(cp < codept + nchars) {
|
|
if(off == 0xfff)
|
|
break; // NOT A SINGLETON
|
|
if(packed_singletonData[off + cp - codept] != -1) {
|
|
*dec1 = packed_singletonData[off + cp - codept];
|
|
*dec2 = -1;
|
|
return;
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
codept += nchars;
|
|
}
|
|
|
|
// TRY DOUBLES
|
|
codept = 0;
|
|
for(k = 0; k < used_doubleRanges; ++k) {
|
|
nchars = SING_LEN(packed_doubleRanges[k]);
|
|
off = SING_OFFSET(packed_doubleRanges[k]);
|
|
if(cp < codept + nchars) {
|
|
if(off == 0xfff)
|
|
break; // NOT A DOUBLE
|
|
*dec1 = packed_doubleData[(off + cp - codept) << 1];
|
|
*dec2 = packed_doubleData[((off + cp - codept) << 1) + 1];
|
|
return;
|
|
}
|
|
codept += nchars;
|
|
}
|
|
|
|
*dec1 = -1;
|
|
*dec2 = -1;
|
|
|
|
}
|
|
|
|
// READ ONE CHARACTER OF A UTF8 STRING
|
|
|
|
unsigned int unicodeBuffer[MAX_UNICODE_CHARACTER_LEN];
|
|
|
|
int appendDecomp(unsigned int cp, int lastchar)
|
|
{
|
|
// DECOMPOSE IN THE BUFFER
|
|
unsigned int dec1, dec2, dec3;
|
|
//dec1=dec2=-1;
|
|
quickDecomp(cp, &dec1, &dec2, &dec3);
|
|
|
|
if(dec1 != -1) {
|
|
lastchar = appendDecomp(dec1, lastchar);
|
|
if(dec2 != -1) {
|
|
lastchar = appendDecomp(dec2, lastchar);
|
|
if(dec3 != -1) {
|
|
lastchar = appendDecomp(dec3, lastchar);
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
unicodeBuffer[lastchar++] = cp;
|
|
bubbleSort(lastchar - 1);
|
|
}
|
|
|
|
return lastchar;
|
|
}
|
|
|
|
void bubbleSort(int lastch)
|
|
{
|
|
int cc = CCLASS(getCPInfo(unicodeBuffer[lastch]));
|
|
int cc2;
|
|
if(!cc)
|
|
return;
|
|
while(lastch > 0) {
|
|
--lastch;
|
|
cc2 = CCLASS(getCPInfo(unicodeBuffer[lastch]));
|
|
if(cc2 > cc) {
|
|
// SWAP POSITIONS
|
|
unsigned int tmp = unicodeBuffer[lastch];
|
|
unicodeBuffer[lastch] = unicodeBuffer[lastch + 1];
|
|
unicodeBuffer[lastch + 1] = tmp;
|
|
}
|
|
else
|
|
return;
|
|
}
|
|
}
|
|
|
|
int getComposition(unsigned int char1, unsigned int char2)
|
|
{
|
|
// TRY DOUBLES
|
|
unsigned int codept = 0;
|
|
int k, nchars, off, j;
|
|
for(k = 0; k < used_combiners; ++k) {
|
|
nchars = SING_LEN(packed_combiners[k]);
|
|
off = SING_OFFSET(packed_combiners[k]);
|
|
if(char2 < codept + nchars) {
|
|
if(off == 0xfff)
|
|
return -1; // NOT A COMBINER
|
|
int tableoff = packed_starters[(off + char2 - codept)];
|
|
if(tableoff < 0)
|
|
return -1; // NOT A COMBINER
|
|
for(j = 0; j < packed_starterData[tableoff]; ++j) {
|
|
if(char1 == packed_starterData[tableoff + 1 + (j << 1)])
|
|
return packed_starterData[tableoff + 2 + (j << 1)];
|
|
if(char1 < packed_starterData[tableoff + 1 + (j << 1)])
|
|
return -1;
|
|
}
|
|
return -1;
|
|
}
|
|
codept += nchars;
|
|
}
|
|
return -1;
|
|
|
|
}
|
|
|
|
int quickCompose(int lastch)
|
|
{
|
|
// SLOW VERSION - REPLACE WITH OPTIMIZED VERSION LATER
|
|
int hanguldone = 0;
|
|
// DO HANGUL COMPOSITION FIRST
|
|
int LIndex = unicodeBuffer[0] - LBase;
|
|
if(0 <= LIndex && LIndex < LCount) {
|
|
int VIndex = unicodeBuffer[1] - VBase;
|
|
if(0 <= VIndex && VIndex < VCount) {
|
|
// make syllable of form LV
|
|
unicodeBuffer[0] = (SBase + (LIndex * VCount + VIndex) * TCount);
|
|
int idx = 1;
|
|
--lastch;
|
|
while(idx <= lastch) {
|
|
unicodeBuffer[idx] = unicodeBuffer[idx + 1];
|
|
++idx;
|
|
}
|
|
hanguldone = 1;
|
|
}
|
|
}
|
|
|
|
// HANGUL CASE 2
|
|
int SIndex = unicodeBuffer[0] - SBase;
|
|
if(0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) {
|
|
int TIndex = unicodeBuffer[1] - TBase;
|
|
if(0 < TIndex && TIndex < TCount) {
|
|
// make syllable of form LVT
|
|
unicodeBuffer[0] += TIndex;
|
|
int idx = 1;
|
|
--lastch;
|
|
while(idx <= lastch) {
|
|
unicodeBuffer[idx] = unicodeBuffer[idx + 1];
|
|
++idx;
|
|
}
|
|
hanguldone = 1;
|
|
}
|
|
}
|
|
|
|
if(hanguldone)
|
|
return lastch;
|
|
|
|
// NORMAL CHARACTER COMPOSITION
|
|
|
|
unsigned int starter = unicodeBuffer[0];
|
|
int cc = CCLASS(getCPInfo(starter));
|
|
|
|
int idx = 1;
|
|
while(idx < lastch) {
|
|
if(idx > 1) {
|
|
// CHECK IF THE CHARACTER IS BLOCKED
|
|
int ccidx = CCLASS(getCPInfo(unicodeBuffer[idx]));
|
|
if(ccidx == 0) {
|
|
// FOUND NEXT STARTER, CANNOT COMBINE
|
|
return lastch;
|
|
|
|
}
|
|
if(ccidx == CCLASS(getCPInfo(unicodeBuffer[idx - 1]))) {
|
|
// BLOCKED CHARACTER CANNOT COMBINE, SKIP
|
|
++idx;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// DO THE ACTUAL COMPOSITION
|
|
|
|
int combined = getComposition(starter, unicodeBuffer[idx]);
|
|
|
|
if(combined != -1) {
|
|
// COMPOSE THE CHARACTER
|
|
starter = unicodeBuffer[0] = combined;
|
|
--lastch;
|
|
while(idx <= lastch) {
|
|
unicodeBuffer[idx] = unicodeBuffer[idx + 1];
|
|
++idx;
|
|
}
|
|
|
|
idx = 0;
|
|
}
|
|
|
|
// NO MATCH FOUND FOR COMPOSITION, CONTINUE TO NEXT CHARACTER
|
|
++idx;
|
|
}
|
|
return lastch;
|
|
|
|
}
|
|
|
|
// Bit flags kept in the local `flags` variable of utf82NFC().
// Within this file only NEED_COMP is ever set; NEED_DECOMP is only tested.
enum
{
    NEED_DECOMP = 1 << 0,
    NEED_COMP = 1 << 1
};
|
|
// READ A UTF8 CHARACTER, CONVERT TO NFC AND LEAVE AT THE BUFFER.
|
|
// RETURNS THE NUMBER OF BYTES USED FROM STRING
|
|
|
|
// NFC NORMALIZATION IS:
|
|
// NFC_QC=0 -> CC=0 --> NO ACTION NEEDED
|
|
// NFC_QC=0 -> CC!=0 --> STORE BUT MIGHT NEED BUBBLE SORT
|
|
// NFC_QC!=0 -> CC=0 --> FULL DECOMPOSITION/COMPOSITION, NO BUBBLE SORT
|
|
// NFC_QC!=0 -> CC!=0 --> FULL DEC+BUBBLE SORT+COMPOSITION
|
|
|
|
int utf82NFC(char *string, int len)
|
|
{
|
|
char *end = string + len;
|
|
unsigned int cp, cc, qc, cpinfo;
|
|
int lastchar = 0, flags;
|
|
|
|
flags = 0;
|
|
while(string < end) {
|
|
cp = utf82Char(string, end - string);
|
|
if(cp == -1) {
|
|
unicodeBuffer[0] = 0;
|
|
return len;
|
|
}
|
|
cpinfo = getCPInfo(cp);
|
|
qc = NFC_QC(cpinfo);
|
|
cc = CCLASS(cpinfo);
|
|
|
|
if((cc == 0) && (qc == 0)) {
|
|
if(lastchar != 0) {
|
|
|
|
if(flags & NEED_DECOMP) {
|
|
lastchar =
|
|
appendDecomp(unicodeBuffer[lastchar - 1],
|
|
lastchar - 1);
|
|
}
|
|
|
|
lastchar = quickCompose(lastchar);
|
|
|
|
unicodeBuffer[lastchar] = 0;
|
|
return len - (end - string);
|
|
}
|
|
}
|
|
else {
|
|
if(lastchar == 1) {
|
|
// THE FIRST CHARACTER NEEDS TO BE DECOMPOSED
|
|
lastchar =
|
|
appendDecomp(unicodeBuffer[lastchar - 1], lastchar - 1);
|
|
flags |= NEED_COMP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
string = utf8Skip(string, end - string);
|
|
|
|
if(qc) {
|
|
// FAILED QUICK CHECK TEST
|
|
// DECOMPOSE IN THE BUFFER
|
|
lastchar = appendDecomp(cp, lastchar);
|
|
flags |= NEED_COMP;
|
|
|
|
}
|
|
else {
|
|
// QUICK CHECK PASSED
|
|
unicodeBuffer[lastchar++] = cp;
|
|
if(lastchar > 1)
|
|
bubbleSort(lastchar - 1);
|
|
}
|
|
}
|
|
|
|
if(flags & NEED_COMP)
|
|
lastchar = quickCompose(lastchar);
|
|
|
|
unicodeBuffer[lastchar] = 0;
|
|
return len - (end - string);
|
|
|
|
}
|