From b45fc827712ae899636996f356d58b7eab423d80 Mon Sep 17 00:00:00 2001 From: ehouse Date: Tue, 13 Jan 2009 13:32:07 +0000 Subject: [PATCH] Allow language Makefile to specify encoding. Pass to perl and c++ dict builders, using it to open files and to determine whether to do multi-to-wide conversion. --- dawg/Catalan/Makefile | 1 + dawg/Makefile.langcommon | 18 +++++--- dawg/dict2dawg.cpp | 90 ++++++++++++++++++++++++++++++---------- dawg/xloc.pl | 42 +++++++++++++------ dawg/xloc.pm | 12 ++++-- 5 files changed, 121 insertions(+), 42 deletions(-) diff --git a/dawg/Catalan/Makefile b/dawg/Catalan/Makefile index 9e43dbe2c..3fd0fe1f8 100644 --- a/dawg/Catalan/Makefile +++ b/dawg/Catalan/Makefile @@ -18,6 +18,7 @@ XWLANG=DISCbeta LANGCODE=ca_ES TARGET_TYPE ?= PALM +ENC = UTF-8 ifeq ($(TARGET_TYPE),PALM) PBITMS = ./bmps/palm diff --git a/dawg/Makefile.langcommon b/dawg/Makefile.langcommon index 0553d144e..05107f177 100644 --- a/dawg/Makefile.langcommon +++ b/dawg/Makefile.langcommon @@ -1,6 +1,6 @@ # -*-mode: Makefile -*- -# Copyright 2000-2002 by Eric House (xwords@eehouse.org) +# Copyright 2000-2009 by Eric House (xwords@eehouse.org) # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -32,10 +32,16 @@ PAR = ../par.pl LANGUAGE = $(shell basename $$(pwd)) +ifdef ENC + ENCP = -enc $(ENC) +endif + # prefer the compiled version if available. But don't compile it # automatically until it's a bit better tested. # DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\ # ../dict2dawg,../dict2dawg.pl) +# +# No. The perl version no longer works. Don't use without fixing. DICT2DAWG = ../dict2dawg @@ -245,7 +251,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \ echo $${start} and $${end}; \ zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \ - -ob dawg$(XWLANG)$* \ + -ob dawg$(XWLANG)$* $(ENCP) \ -sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \ -wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin touch $@ @@ -261,20 +267,20 @@ allbins: table.bin: ../xloc.pl ifdef NEWDAWG - perl -I../ ../xloc.pl -tn $@ + perl -I../ ../xloc.pl $(ENCP) -tn -out $@ else - perl -I../ ../xloc.pl -t $@ + perl -I../ ../xloc.pl -t -out $@ endif values.bin: ../xloc.pl - perl -I../ ../xloc.pl -v $@ + perl -I../ ../xloc.pl -v -out $@ $(ENCP) %.dict: %.dict.gz zcat $< > $@ # clean this up.... ../dict2dawg: ../dict2dawg.cpp - cd ../ && g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp + cd ../ && g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp clean_common: rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \ diff --git a/dawg/dict2dawg.cpp b/dawg/dict2dawg.cpp index 6b74f8e6d..dbbc927a6 100644 --- a/dawg/dict2dawg.cpp +++ b/dawg/dict2dawg.cpp @@ -1,9 +1,10 @@ -/* -*- compile-command: "g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp"; -*- */ +/* -*- compile-command: "g++ -DDEBUG -O0 -Wall -g -o dict2dawg dict2dawg.cpp"; -*- */ /************************************************************************* * adapted from perl code that was itself adapted from C++ code * Copyright (C) 2000 Falk Hueffner - * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org) + * This version Copyright (C) 2002,2006-2009 Eric House + * (xwords@eehouse.org) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -77,6 +78,7 @@ static void (*gReadWordProc)(void) = NULL; NodeList gNodes; // final array of nodes unsigned int gNBytesPerOutfile = 0xFFFFFFFF; char* gTableFile = NULL; +static bool gIsMultibyte = false; char* gOutFileBase = NULL; char* gStartNodeOut = NULL; static FILE* gInFile = NULL; @@ -87,7 +89,7 @@ char* gCountFile = NULL; char* gBytesPerNodeFile = NULL; // where to write whether node // size 3 or 4 int gWordCount = 0; -std::map gTableHash; +std::map gTableHash; int gBlankIndex; std::vector gRevMap; #ifdef DEBUG @@ -121,14 +123,14 @@ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal ); static bool TrieNodeGetIsTerminal( Node node ); static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); static bool TrieNodeGetIsLastSibling( Node node ); -static void TrieNodeSetLetter( Node* nodeR, int letter ); -static unsigned int TrieNodeGetLetter( Node node ); +static void TrieNodeSetLetter( Node* nodeR, Letter letter ); +static Letter TrieNodeGetLetter( Node node ); static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco ); static int TrieNodeGetFirstChildOffset( Node node ); static int findSubArray( NodeList& newedgesR ); static void registerSubArray( NodeList& edgesR, int nodeLoc ); -static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, - bool isLastSibling ); +static Node MakeTrieNode( Letter letter, bool isTerminal, + int firstChildOffset, bool isLastSibling ); static void printNodes( NodeList& nodesR ); static void printNode( int index, Node node ); static void moveTopToFront( int* firstRef ); @@ -142,6 +144,8 @@ static void readFromSortedArray( void ); int main( int argc, char** argv ) { + setlocale(LC_CTYPE, ""); + gReadWordProc = readFromSortedArray; const char* inFileName; @@ -287,7 +291,7 @@ buildNode( int depth ) bool wordEnd; do { - char letter = gCurrentWord[depth]; + Letter letter = gCurrentWord[depth]; bool isTerminal = (gCurrentWordLen - 1) == depth; int nodeOffset = buildNode( depth + 1 ); @@ -336,7 +340,7 @@ addNodes( NodeList& newedgesR ) static void printNode( int index, Node node ) { - unsigned int letter = TrieNodeGetLetter(node); + Letter letter = TrieNodeGetLetter(node); assert( letter < gRevMap.size() ); fprintf( stderr, "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", @@ -472,6 +476,38 @@ readFromSortedArray( void ) #endif } // readFromSortedArray +static wchar_t +getWideChar( FILE* file ) +{ + wchar_t dest; + char src[4] = { '\0' }; + const char* srcp = src; + int ii; + mbstate_t ps = {0}; + + for ( ii = 0; ; ++ii ) { + int byt = getc( file ); + size_t siz; + + if ( byt == EOF || byt == gTermChar ) { + dest = byt; + break; + } + + assert( ii < 4 ); + src[ii] = byt; + siz = mbsrtowcs( &dest, &srcp, 1, &ps ); + + if ( siz == (size_t)-1 ) { + continue; + } else if ( siz == 1 ) { + break; + } + } +// fprintf( stderr, "%s=>%lc\n", __func__, dest ); + return dest; +} // getWideChar + static Letter* readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) { @@ -485,7 +521,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) // return it. If no, start over ONLY IF the terminator was not // EOF. for ( ; ; ) { - int byt = getc( gInFile ); + wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile ); // EOF is special: we don't try for another word even if // dropWord is true; we must leave now. @@ -523,7 +559,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) // Don't call into the hashtable twice here!! } else if ( gTableHash.find(byt) != gTableHash.end() ) { assert( count < bufLen ); - wordBuf[count++] = (char)gTableHash[byt]; + wordBuf[count++] = gTableHash[byt]; if ( count >= bufLen ) { dropWord = true; } @@ -534,9 +570,9 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) tileToAscii( buf, sizeof(buf), wordBuf ); if ( gKillIfMissing ) { - ERROR_EXIT( "chr %c (%d) not in map file %s\n" + ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n" "last word was %s\n", - (char)byt, (int)byt, gTableFile, buf ); + byt, (int)byt, (int)byt, gTableFile, buf ); } else if ( !dropWord ) { #ifdef DEBUG if ( gDebug ) { @@ -551,7 +587,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) } // if ( NULL != result ) { -// char buf[MAX_WORD_LEN+1]; +// char buf[T2ABUFLEN(MAX_WORD_LEN)]; // fprintf( stderr, "%s returning %s\n", __func__, // tileToAscii( buf, sizeof(buf), result ) ); // } @@ -638,16 +674,17 @@ tileToAscii( char* out, int outSize, const Letter* in ) char* orig = out; for ( ; ; ) { - char ch = *in++; + Letter ch = *in++; if ( '\0' == ch ) { break; } - assert( (unsigned int)ch < gRevMap.size() ); + assert( ch < gRevMap.size() ); *out++ = gRevMap[ch]; tilesLen += sprintf( &tiles[tilesLen], "%d,", ch ); assert( (out - orig) < outSize ); } + assert( tilesLen+1 < outSize ); tiles[tilesLen] = ']'; tiles[tilesLen+1] = '\0'; strcpy( out, tiles ); @@ -765,9 +802,9 @@ TrieNodeGetIsLastSibling( Node node ) } static void -TrieNodeSetLetter( Node* nodeR, int letter ) +TrieNodeSetLetter( Node* nodeR, Letter letter ) { - if( letter >= 64 ) { + if ( letter >= 64 ) { ERROR_EXIT( "letter %d too big", letter ); } @@ -776,7 +813,7 @@ TrieNodeSetLetter( Node* nodeR, int letter ) *nodeR |= (letter << 24); // set new ones } -static unsigned int +static Letter TrieNodeGetLetter( Node node ) { node >>= 24; @@ -804,7 +841,7 @@ TrieNodeGetFirstChildOffset( Node node ) } static Node -MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, +MakeTrieNode( Letter letter, bool isTerminal, int firstChildOffset, bool isLastSibling ) { Node result = 0; @@ -1001,7 +1038,7 @@ static void outputNode( Node node, int nBytes, FILE* outfile ) { unsigned int fco = TrieNodeGetFirstChildOffset(node); - unsigned int fourthByte; + unsigned int fourthByte = 0; if ( nBytes == 4 ) { fourthByte = fco >> 16; @@ -1115,6 +1152,7 @@ parseARGV( int argc, char** argv, const char** inFileName ) { *inFileName = NULL; int index = 1; + const char* enc = NULL; while ( index < argc ) { char* arg = argv[index++]; @@ -1139,6 +1177,8 @@ parseARGV( int argc, char** argv, const char** inFileName ) gTableFile = argv[index++]; } else if ( 0 == strcmp( arg, "-ob" ) ) { gOutFileBase = argv[index++]; + } else if ( 0 == strcmp( arg, "-enc" ) ) { + enc = argv[index++]; } else if ( 0 == strcmp( arg, "-sn" ) ) { gStartNodeOut = argv[index++]; } else if ( 0 == strcmp( arg, "-if" ) ) { @@ -1175,6 +1215,14 @@ parseARGV( int argc, char** argv, const char** inFileName ) exit(1); } + if ( !!enc ) { + if ( !strcasecmp( enc, "UTF-8" ) ) { + gIsMultibyte = true; + } else { + ERROR_EXIT( "%s: unknown encoding %s", __func__, enc ); + } + } + #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile ); diff --git a/dawg/xloc.pl b/dawg/xloc.pl index cb21db81b..20b72fcc9 100755 --- a/dawg/xloc.pl +++ b/dawg/xloc.pl @@ -21,26 +21,44 @@ use strict; use xloc; -my $arg = shift(@ARGV); -my $outfile = shift(@ARGV); -my $lang = shift(@ARGV); -my $path = "./$lang"; -my $infoFile = "$path/info.txt"; +my $unicode = -1; +my $doval = 0; +my $enc; +my $outfile; + +my $arg; +while ( $arg = $ARGV[0] ) { + if ( $arg eq '-enc' ) { + $enc = $ARGV[1]; + shift @ARGV; + } elsif ( $arg eq "-tn" ) { + $unicode = 1; + } elsif ( $arg eq "-t" ) { + $unicode = 0; + } elsif ( $arg eq "-v" ) { + $doval = 1; + } elsif ( $arg eq '-out' ) { + $outfile = $ARGV[1]; + shift @ARGV; + } else { + die "unknown arg $arg\n"; + } + shift @ARGV; +} + +my $infoFile = "info.txt"; die "info file $infoFile not found\n" if ! -s $infoFile; - -my $xlocToken = xloc::ParseTileInfo($infoFile); +my $xlocToken = xloc::ParseTileInfo($infoFile, $enc); open OUTFILE, "> $outfile"; # For f*cking windoze linefeeds binmode( OUTFILE ); -if ( $arg eq "-t" ) { - xloc::WriteMapFile( $xlocToken, 0, \*OUTFILE ); -} elsif ( $arg eq "-tn" ) { - xloc::WriteMapFile( $xlocToken, 1, \*OUTFILE ); -} elsif ( $arg eq "-v" ) { +if ( $unicode ne -1 ) { + xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE ); +} elsif ( $doval ) { xloc::WriteValuesFile( $xlocToken, \*OUTFILE ); } diff --git a/dawg/xloc.pm b/dawg/xloc.pm index 4aefe7440..741968e76 100644 --- a/dawg/xloc.pm +++ b/dawg/xloc.pm @@ -43,11 +43,17 @@ BEGIN { # for queries. It's a hash with name-value pairs and an _INFO entry # containing a list of tile info lists. -sub ParseTileInfo($) { - my ( $filePath ) = @_; +sub ParseTileInfo($$) { + my ( $filePath, $enc ) = @_; my %result; - open INPUT, "<$filePath" or die "couldn't open $filePath"; + if ( $enc ) { + open( INPUT, "<:encoding($enc)", "$filePath" ) + or die "couldn't open $filePath"; + } else { + open( INPUT, "<$filePath" ) + or die "couldn't open $filePath"; + } my $inTiles = 0; my @tiles;