Allow a language Makefile to specify an encoding. Pass it to the Perl and C++

dict builders, which use it to open files and to determine whether to do
multibyte-to-wide-character conversion.
This commit is contained in:
ehouse 2009-01-13 13:32:07 +00:00
parent 2691b00dc9
commit b45fc82771
5 changed files with 121 additions and 42 deletions

View file

@ -18,6 +18,7 @@
XWLANG=DISCbeta XWLANG=DISCbeta
LANGCODE=ca_ES LANGCODE=ca_ES
TARGET_TYPE ?= PALM TARGET_TYPE ?= PALM
ENC = UTF-8
ifeq ($(TARGET_TYPE),PALM) ifeq ($(TARGET_TYPE),PALM)
PBITMS = ./bmps/palm PBITMS = ./bmps/palm

View file

@ -1,6 +1,6 @@
# -*-mode: Makefile -*- # -*-mode: Makefile -*-
# Copyright 2000-2002 by Eric House (xwords@eehouse.org) # Copyright 2000-2009 by Eric House (xwords@eehouse.org)
# #
# This program is free software; you can redistribute it and/or # This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License # modify it under the terms of the GNU General Public License
@ -32,10 +32,16 @@ PAR = ../par.pl
LANGUAGE = $(shell basename $$(pwd)) LANGUAGE = $(shell basename $$(pwd))
ifdef ENC
ENCP = -enc $(ENC)
endif
# prefer the compiled version if available. But don't compile it # prefer the compiled version if available. But don't compile it
# automatically until it's a bit better tested. # automatically until it's a bit better tested.
# DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\ # DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\
# ../dict2dawg,../dict2dawg.pl) # ../dict2dawg,../dict2dawg.pl)
#
# No. The perl version no longer works. Don't use without fixing.
DICT2DAWG = ../dict2dawg DICT2DAWG = ../dict2dawg
@ -245,7 +251,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \ end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $${end}; \ echo $${start} and $${end}; \
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \ zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* \ -ob dawg$(XWLANG)$* $(ENCP) \
-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \ -sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin -wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
touch $@ touch $@
@ -261,20 +267,20 @@ allbins:
table.bin: ../xloc.pl table.bin: ../xloc.pl
ifdef NEWDAWG ifdef NEWDAWG
perl -I../ ../xloc.pl -tn $@ perl -I../ ../xloc.pl $(ENCP) -tn -out $@
else else
perl -I../ ../xloc.pl -t $@ perl -I../ ../xloc.pl -t -out $@
endif endif
values.bin: ../xloc.pl values.bin: ../xloc.pl
perl -I../ ../xloc.pl -v $@ perl -I../ ../xloc.pl -v -out $@ $(ENCP)
%.dict: %.dict.gz %.dict: %.dict.gz
zcat $< > $@ zcat $< > $@
# clean this up.... # clean this up....
../dict2dawg: ../dict2dawg.cpp ../dict2dawg: ../dict2dawg.cpp
cd ../ && g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp cd ../ && g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp
clean_common: clean_common:
rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \ rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \

View file

@ -1,9 +1,10 @@
/* -*- compile-command: "g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp"; -*- */ /* -*- compile-command: "g++ -DDEBUG -O0 -Wall -g -o dict2dawg dict2dawg.cpp"; -*- */
/************************************************************************* /*************************************************************************
* adapted from perl code that was itself adapted from C++ code * adapted from perl code that was itself adapted from C++ code
* Copyright (C) 2000 Falk Hueffner * Copyright (C) 2000 Falk Hueffner
* This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org) * This version Copyright (C) 2002,2006-2009 Eric House
* (xwords@eehouse.org)
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -77,6 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
NodeList gNodes; // final array of nodes NodeList gNodes; // final array of nodes
unsigned int gNBytesPerOutfile = 0xFFFFFFFF; unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
char* gTableFile = NULL; char* gTableFile = NULL;
static bool gIsMultibyte = false;
char* gOutFileBase = NULL; char* gOutFileBase = NULL;
char* gStartNodeOut = NULL; char* gStartNodeOut = NULL;
static FILE* gInFile = NULL; static FILE* gInFile = NULL;
@ -87,7 +89,7 @@ char* gCountFile = NULL;
char* gBytesPerNodeFile = NULL; // where to write whether node char* gBytesPerNodeFile = NULL; // where to write whether node
// size 3 or 4 // size 3 or 4
int gWordCount = 0; int gWordCount = 0;
std::map<char,int> gTableHash; std::map<Letter,wchar_t> gTableHash;
int gBlankIndex; int gBlankIndex;
std::vector<char> gRevMap; std::vector<char> gRevMap;
#ifdef DEBUG #ifdef DEBUG
@ -121,14 +123,14 @@ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal );
static bool TrieNodeGetIsTerminal( Node node ); static bool TrieNodeGetIsTerminal( Node node );
static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
static bool TrieNodeGetIsLastSibling( Node node ); static bool TrieNodeGetIsLastSibling( Node node );
static void TrieNodeSetLetter( Node* nodeR, int letter ); static void TrieNodeSetLetter( Node* nodeR, Letter letter );
static unsigned int TrieNodeGetLetter( Node node ); static Letter TrieNodeGetLetter( Node node );
static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco ); static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco );
static int TrieNodeGetFirstChildOffset( Node node ); static int TrieNodeGetFirstChildOffset( Node node );
static int findSubArray( NodeList& newedgesR ); static int findSubArray( NodeList& newedgesR );
static void registerSubArray( NodeList& edgesR, int nodeLoc ); static void registerSubArray( NodeList& edgesR, int nodeLoc );
static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, static Node MakeTrieNode( Letter letter, bool isTerminal,
bool isLastSibling ); int firstChildOffset, bool isLastSibling );
static void printNodes( NodeList& nodesR ); static void printNodes( NodeList& nodesR );
static void printNode( int index, Node node ); static void printNode( int index, Node node );
static void moveTopToFront( int* firstRef ); static void moveTopToFront( int* firstRef );
@ -142,6 +144,8 @@ static void readFromSortedArray( void );
int int
main( int argc, char** argv ) main( int argc, char** argv )
{ {
setlocale(LC_CTYPE, "");
gReadWordProc = readFromSortedArray; gReadWordProc = readFromSortedArray;
const char* inFileName; const char* inFileName;
@ -287,7 +291,7 @@ buildNode( int depth )
bool wordEnd; bool wordEnd;
do { do {
char letter = gCurrentWord[depth]; Letter letter = gCurrentWord[depth];
bool isTerminal = (gCurrentWordLen - 1) == depth; bool isTerminal = (gCurrentWordLen - 1) == depth;
int nodeOffset = buildNode( depth + 1 ); int nodeOffset = buildNode( depth + 1 );
@ -336,7 +340,7 @@ addNodes( NodeList& newedgesR )
static void static void
printNode( int index, Node node ) printNode( int index, Node node )
{ {
unsigned int letter = TrieNodeGetLetter(node); Letter letter = TrieNodeGetLetter(node);
assert( letter < gRevMap.size() ); assert( letter < gRevMap.size() );
fprintf( stderr, fprintf( stderr,
"[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n",
@ -472,6 +476,38 @@ readFromSortedArray( void )
#endif #endif
} // readFromSortedArray } // readFromSortedArray
/* Read one character from |file| and return it as a wide character.
 *
 * Bytes are pulled with getc() and handed one at a time to mbrtowc(), which
 * decodes according to the current LC_CTYPE locale and keeps the partial
 * sequence in |ps| between calls.  Reading stops early, and the raw value is
 * returned unconverted, when getc() yields EOF or the byte equals gTermChar.
 *
 * mbrtowc() is used instead of re-running mbsrtowcs() over a growing buffer
 * because it reports an incomplete sequence as (size_t)-2, distinct from an
 * invalid one, and never needs the input to be NUL-terminated.
 */
static wchar_t
getWideChar( FILE* file )
{
    wchar_t dest = 0;           // defined even if the loop exits oddly
    mbstate_t ps;
    int ii;

    memset( &ps, 0, sizeof(ps) );   // initial conversion state

    for ( ii = 0; ; ++ii ) {
        int byt = getc( file );

        // EOF and the word terminator are passed through as-is, even if
        // they interrupt a partially read multibyte sequence.
        if ( byt == EOF || byt == gTermChar ) {
            dest = byt;
            break;
        }

        // No character is expected to need more than four bytes (UTF-8 is
        // the only encoding parseARGV accepts).
        assert( ii < 4 );

        char ch = (char)byt;
        size_t siz = mbrtowc( &dest, &ch, 1, &ps );
        if ( siz == (size_t)-2 ) {
            // Valid so far but incomplete: feed the next byte.
            continue;
        } else if ( siz == (size_t)-1 ) {
            // Invalid byte.  The conversion state is unspecified after an
            // encoding error, so reset it before trying the next byte.
            fprintf( stderr, "%s: dropping invalid byte 0x%x\n",
                     __func__, byt );
            memset( &ps, 0, sizeof(ps) );
            continue;
        }
        // 0 (a NUL character) or 1 (one byte completed a character): done.
        break;
    }
    // fprintf( stderr, "%s=>%lc\n", __func__, dest );
    return dest;
} // getWideChar
static Letter* static Letter*
readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
{ {
@ -485,7 +521,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
// return it. If no, start over ONLY IF the terminator was not // return it. If no, start over ONLY IF the terminator was not
// EOF. // EOF.
for ( ; ; ) { for ( ; ; ) {
int byt = getc( gInFile ); wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );
// EOF is special: we don't try for another word even if // EOF is special: we don't try for another word even if
// dropWord is true; we must leave now. // dropWord is true; we must leave now.
@ -523,7 +559,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
// Don't call into the hashtable twice here!! // Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) { } else if ( gTableHash.find(byt) != gTableHash.end() ) {
assert( count < bufLen ); assert( count < bufLen );
wordBuf[count++] = (char)gTableHash[byt]; wordBuf[count++] = gTableHash[byt];
if ( count >= bufLen ) { if ( count >= bufLen ) {
dropWord = true; dropWord = true;
} }
@ -534,9 +570,9 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
tileToAscii( buf, sizeof(buf), wordBuf ); tileToAscii( buf, sizeof(buf), wordBuf );
if ( gKillIfMissing ) { if ( gKillIfMissing ) {
ERROR_EXIT( "chr %c (%d) not in map file %s\n" ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
"last word was %s\n", "last word was %s\n",
(char)byt, (int)byt, gTableFile, buf ); byt, (int)byt, (int)byt, gTableFile, buf );
} else if ( !dropWord ) { } else if ( !dropWord ) {
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
@ -551,7 +587,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
} }
// if ( NULL != result ) { // if ( NULL != result ) {
// char buf[MAX_WORD_LEN+1]; // char buf[T2ABUFLEN(MAX_WORD_LEN)];
// fprintf( stderr, "%s returning %s\n", __func__, // fprintf( stderr, "%s returning %s\n", __func__,
// tileToAscii( buf, sizeof(buf), result ) ); // tileToAscii( buf, sizeof(buf), result ) );
// } // }
@ -638,16 +674,17 @@ tileToAscii( char* out, int outSize, const Letter* in )
char* orig = out; char* orig = out;
for ( ; ; ) { for ( ; ; ) {
char ch = *in++; Letter ch = *in++;
if ( '\0' == ch ) { if ( '\0' == ch ) {
break; break;
} }
assert( (unsigned int)ch < gRevMap.size() ); assert( ch < gRevMap.size() );
*out++ = gRevMap[ch]; *out++ = gRevMap[ch];
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch ); tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
assert( (out - orig) < outSize ); assert( (out - orig) < outSize );
} }
assert( tilesLen+1 < outSize );
tiles[tilesLen] = ']'; tiles[tilesLen] = ']';
tiles[tilesLen+1] = '\0'; tiles[tilesLen+1] = '\0';
strcpy( out, tiles ); strcpy( out, tiles );
@ -765,7 +802,7 @@ TrieNodeGetIsLastSibling( Node node )
} }
static void static void
TrieNodeSetLetter( Node* nodeR, int letter ) TrieNodeSetLetter( Node* nodeR, Letter letter )
{ {
if ( letter >= 64 ) { if ( letter >= 64 ) {
ERROR_EXIT( "letter %d too big", letter ); ERROR_EXIT( "letter %d too big", letter );
@ -776,7 +813,7 @@ TrieNodeSetLetter( Node* nodeR, int letter )
*nodeR |= (letter << 24); // set new ones *nodeR |= (letter << 24); // set new ones
} }
static unsigned int static Letter
TrieNodeGetLetter( Node node ) TrieNodeGetLetter( Node node )
{ {
node >>= 24; node >>= 24;
@ -804,7 +841,7 @@ TrieNodeGetFirstChildOffset( Node node )
} }
static Node static Node
MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, MakeTrieNode( Letter letter, bool isTerminal, int firstChildOffset,
bool isLastSibling ) bool isLastSibling )
{ {
Node result = 0; Node result = 0;
@ -1001,7 +1038,7 @@ static void
outputNode( Node node, int nBytes, FILE* outfile ) outputNode( Node node, int nBytes, FILE* outfile )
{ {
unsigned int fco = TrieNodeGetFirstChildOffset(node); unsigned int fco = TrieNodeGetFirstChildOffset(node);
unsigned int fourthByte; unsigned int fourthByte = 0;
if ( nBytes == 4 ) { if ( nBytes == 4 ) {
fourthByte = fco >> 16; fourthByte = fco >> 16;
@ -1115,6 +1152,7 @@ parseARGV( int argc, char** argv, const char** inFileName )
{ {
*inFileName = NULL; *inFileName = NULL;
int index = 1; int index = 1;
const char* enc = NULL;
while ( index < argc ) { while ( index < argc ) {
char* arg = argv[index++]; char* arg = argv[index++];
@ -1139,6 +1177,8 @@ parseARGV( int argc, char** argv, const char** inFileName )
gTableFile = argv[index++]; gTableFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ob" ) ) { } else if ( 0 == strcmp( arg, "-ob" ) ) {
gOutFileBase = argv[index++]; gOutFileBase = argv[index++];
} else if ( 0 == strcmp( arg, "-enc" ) ) {
enc = argv[index++];
} else if ( 0 == strcmp( arg, "-sn" ) ) { } else if ( 0 == strcmp( arg, "-sn" ) ) {
gStartNodeOut = argv[index++]; gStartNodeOut = argv[index++];
} else if ( 0 == strcmp( arg, "-if" ) ) { } else if ( 0 == strcmp( arg, "-if" ) ) {
@ -1175,6 +1215,14 @@ parseARGV( int argc, char** argv, const char** inFileName )
exit(1); exit(1);
} }
if ( !!enc ) {
if ( !strcasecmp( enc, "UTF-8" ) ) {
gIsMultibyte = true;
} else {
ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
}
}
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile ); fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );

View file

@ -21,26 +21,44 @@
use strict; use strict;
use xloc; use xloc;
my $arg = shift(@ARGV); my $unicode = -1;
my $outfile = shift(@ARGV); my $doval = 0;
my $lang = shift(@ARGV); my $enc;
my $path = "./$lang"; my $outfile;
my $infoFile = "$path/info.txt";
# Parse the command line.  Recognized options:
#   -enc <name>   encoding, later handed to xloc::ParseTileInfo
#   -tn           select the map file, with $unicode set to 1
#   -t            select the map file, with $unicode set to 0
#   -v            select the values file
#   -out <path>   file to write
# The loop tests @ARGV itself rather than the truth of its first element, so
# an argument such as "0" or "" is reported as unknown instead of silently
# ending option processing.
my $arg;
while ( @ARGV ) {
    $arg = shift @ARGV;
    if ( $arg eq '-enc' ) {
        die "missing value for $arg\n" if !@ARGV;
        $enc = shift @ARGV;
    } elsif ( $arg eq "-tn" ) {
        $unicode = 1;
    } elsif ( $arg eq "-t" ) {
        $unicode = 0;
    } elsif ( $arg eq "-v" ) {
        $doval = 1;
    } elsif ( $arg eq '-out' ) {
        die "missing value for $arg\n" if !@ARGV;
        $outfile = shift @ARGV;
    } else {
        die "unknown arg $arg\n";
    }
}
my $infoFile = "info.txt";
die "info file $infoFile not found\n" if ! -s $infoFile; die "info file $infoFile not found\n" if ! -s $infoFile;
my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);
my $xlocToken = xloc::ParseTileInfo($infoFile);
open OUTFILE, "> $outfile"; open OUTFILE, "> $outfile";
# For f*cking windoze linefeeds # For f*cking windoze linefeeds
binmode( OUTFILE ); binmode( OUTFILE );
if ( $arg eq "-t" ) { if ( $unicode ne -1 ) {
xloc::WriteMapFile( $xlocToken, 0, \*OUTFILE ); xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
} elsif ( $arg eq "-tn" ) { } elsif ( $doval ) {
xloc::WriteMapFile( $xlocToken, 1, \*OUTFILE );
} elsif ( $arg eq "-v" ) {
xloc::WriteValuesFile( $xlocToken, \*OUTFILE ); xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
} }

View file

@ -43,11 +43,17 @@ BEGIN {
# for queries. It's a hash with name-value pairs and an _INFO entry # for queries. It's a hash with name-value pairs and an _INFO entry
# containing a list of tile info lists. # containing a list of tile info lists.
sub ParseTileInfo($) { sub ParseTileInfo($$) {
my ( $filePath ) = @_; my ( $filePath, $enc ) = @_;
my %result; my %result;
open INPUT, "<$filePath" or die "couldn't open $filePath"; if ( $enc ) {
open( INPUT, "<:encoding($enc)", "$filePath" )
or die "couldn't open $filePath";
} else {
open( INPUT, "<$filePath" )
or die "couldn't open $filePath";
}
my $inTiles = 0; my $inTiles = 0;
my @tiles; my @tiles;