Allow language Makefile to specify encoding. Pass to perl and c++

dict builders, using it to open files and to determine whether to do multi-to-wide conversion.
2025-02-05 20:45:49 +01:00 · 2009-01-13 13:32:07 +00:00 · 2009-01-13 13:32:07 +00:00 · b45fc82771
commit b45fc82771
parent 2691b00dc9
5 changed files with 121 additions and 42 deletions
--- a/dawg/Catalan/Makefile
+++ b/dawg/Catalan/Makefile
@ -18,6 +18,7 @@
 XWLANG=DISCbeta
 LANGCODE=ca_ES
 TARGET_TYPE ?= PALM
+ENC = UTF-8

 ifeq ($(TARGET_TYPE),PALM)
 PBITMS = ./bmps/palm
--- a/dawg/Makefile.langcommon
+++ b/dawg/Makefile.langcommon
@ -1,6 +1,6 @@
 # -*-mode: Makefile -*-

-# Copyright 2000-2002 by Eric House (xwords@eehouse.org)
+# Copyright 2000-2009 by Eric House (xwords@eehouse.org)
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@ -32,10 +32,16 @@ PAR = ../par.pl

 LANGUAGE = $(shell basename $$(pwd))

+ifdef ENC
+	ENCP = -enc $(ENC)
+endif
+
 # prefer the compiled version if available.  But don't compile it
 # automatically until it's a bit better tested.
 # DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\
 # 		../dict2dawg,../dict2dawg.pl)
+#
+# No.  The perl version no longer works.  Don't use without fixing.

 DICT2DAWG = ../dict2dawg

@ -245,7 +251,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
 	end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
 	echo $${start} and $${end}; \
 	zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-		-ob dawg$(XWLANG)$* \
+		-ob dawg$(XWLANG)$* $(ENCP) \
 		-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
 		-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
 	touch $@
@ -261,20 +267,20 @@ allbins:

 table.bin:  ../xloc.pl 
 ifdef NEWDAWG
-	perl -I../ ../xloc.pl -tn $@
+	perl -I../ ../xloc.pl $(ENCP) -tn -out $@
 else
-	perl -I../ ../xloc.pl -t $@
+	perl -I../ ../xloc.pl -t -out $@
 endif

 values.bin:  ../xloc.pl 
-	perl -I../ ../xloc.pl -v $@
+	perl -I../ ../xloc.pl -v -out $@ $(ENCP)

 %.dict: %.dict.gz
 	zcat $< > $@

 # clean this up....
 ../dict2dawg: ../dict2dawg.cpp
-	cd ../ && g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp
+	cd ../ && g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp

 clean_common:
 	rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \
--- a/dawg/dict2dawg.cpp
+++ b/dawg/dict2dawg.cpp
@ -1,9 +1,10 @@
-/* -*- compile-command: "g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp"; -*- */
+/* -*- compile-command: "g++ -DDEBUG -O0 -Wall -g -o dict2dawg dict2dawg.cpp"; -*- */
 /*************************************************************************
 * adapted from perl code that was itself adapted from C++ code
 * Copyright (C) 2000 Falk Hueffner

- * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
+ * This version Copyright (C) 2002,2006-2009 Eric House
+ * (xwords@eehouse.org)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@ -77,6 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
 NodeList gNodes;       // final array of nodes
 unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
 char* gTableFile = NULL;
+static bool gIsMultibyte = false;
 char* gOutFileBase = NULL;
 char* gStartNodeOut = NULL;
 static FILE* gInFile = NULL;
@ -87,7 +89,7 @@ char* gCountFile = NULL;
 char* gBytesPerNodeFile = NULL;        // where to write whether node
                                       // size 3 or 4
 int gWordCount = 0;
-std::map<char,int> gTableHash;
+std::map<Letter,wchar_t> gTableHash;
 int gBlankIndex;
 std::vector<char> gRevMap;
 #ifdef DEBUG
@ -121,14 +123,14 @@ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal );
 static bool TrieNodeGetIsTerminal( Node node );
 static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
 static bool TrieNodeGetIsLastSibling( Node node );
-static void TrieNodeSetLetter( Node* nodeR, int letter );
-static unsigned int TrieNodeGetLetter( Node node );
+static void TrieNodeSetLetter( Node* nodeR, Letter letter );
+static Letter TrieNodeGetLetter( Node node );
 static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco );
 static int TrieNodeGetFirstChildOffset( Node node );
 static int findSubArray( NodeList& newedgesR );
 static void registerSubArray( NodeList& edgesR, int nodeLoc );
-static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, 
-                          bool isLastSibling );
+static Node MakeTrieNode( Letter letter, bool isTerminal,
+                          int firstChildOffset, bool isLastSibling );
 static void printNodes( NodeList& nodesR );
 static void printNode( int index, Node node );
 static void moveTopToFront( int* firstRef );
@ -142,6 +144,8 @@ static void readFromSortedArray( void );
 int 
 main( int argc, char** argv ) 
 { 
+    setlocale(LC_CTYPE, "");
+    
    gReadWordProc = readFromSortedArray;

    const char* inFileName;
@ -287,7 +291,7 @@ buildNode( int depth )

    bool wordEnd;
    do {
-        char letter = gCurrentWord[depth];
+        Letter letter = gCurrentWord[depth];
        bool isTerminal = (gCurrentWordLen - 1) == depth;

        int nodeOffset = buildNode( depth + 1 );
@ -336,7 +340,7 @@ addNodes( NodeList& newedgesR )
 static void
 printNode( int index, Node node )
 {
-    unsigned int letter = TrieNodeGetLetter(node);
+    Letter letter = TrieNodeGetLetter(node);
    assert( letter < gRevMap.size() );
    fprintf( stderr,
             "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", 
@ -472,6 +476,38 @@ readFromSortedArray( void )
 #endif
 } // readFromSortedArray

+static wchar_t
+getWideChar( FILE* file )
+{
+    wchar_t dest;
+    char src[4] = { '\0' };
+    const char* srcp = src;
+    int ii;
+    mbstate_t ps = {0};
+
+    for ( ii = 0; ; ++ii ) {
+        int byt = getc( file );
+        size_t siz;
+
+        if ( byt == EOF || byt == gTermChar ) {
+            dest = byt;
+            break;
+        }
+
+        assert( ii < 4 );
+        src[ii] = byt;
+        siz = mbsrtowcs( &dest, &srcp, 1, &ps );
+
+        if ( siz == (size_t)-1 ) {
+            continue;
+        } else if ( siz == 1 ) {
+            break;
+        }
+    }
+//     fprintf( stderr, "%s=>%lc\n", __func__, dest );
+    return dest;
+} // getWideChar
+
 static Letter*
 readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
 {
@ -485,7 +521,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
    // return it.  If no, start over ONLY IF the terminator was not
    // EOF.
    for ( ; ; ) {
-        int byt = getc( gInFile );
+        wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );

        // EOF is special: we don't try for another word even if
        // dropWord is true; we must leave now.
@ -523,7 +559,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
            // Don't call into the hashtable twice here!!
        } else if ( gTableHash.find(byt) != gTableHash.end() ) {
            assert( count < bufLen );
-            wordBuf[count++] = (char)gTableHash[byt];
+            wordBuf[count++] = gTableHash[byt];
            if ( count >= bufLen ) {
                dropWord = true;
            }
@ -534,9 +570,9 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
            tileToAscii( buf, sizeof(buf), wordBuf );

            if ( gKillIfMissing ) {
-                ERROR_EXIT( "chr %c (%d) not in map file %s\n"
+                ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
                            "last word was %s\n",
-                            (char)byt, (int)byt, gTableFile, buf );
+                            byt, (int)byt, (int)byt, gTableFile, buf );
            } else if ( !dropWord ) {
 #ifdef DEBUG
                if ( gDebug ) {
@ -551,7 +587,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
    }

 //     if ( NULL != result ) {
-//         char buf[MAX_WORD_LEN+1];
+//         char buf[T2ABUFLEN(MAX_WORD_LEN)];
 //         fprintf( stderr, "%s returning %s\n", __func__,
 //                  tileToAscii( buf, sizeof(buf), result ) );
 //     }
@ -638,16 +674,17 @@ tileToAscii( char* out, int outSize, const Letter* in )

    char* orig = out;
    for ( ; ; ) {
-        char ch = *in++;
+        Letter ch = *in++;
        if ( '\0' == ch ) {
            break;
        }
-        assert( (unsigned int)ch < gRevMap.size() );
+        assert( ch < gRevMap.size() );
        *out++ = gRevMap[ch];
        tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
        assert( (out - orig) < outSize );
    }

+    assert( tilesLen+1 < outSize );
    tiles[tilesLen] = ']';
    tiles[tilesLen+1] = '\0';
    strcpy( out, tiles );
@ -765,9 +802,9 @@ TrieNodeGetIsLastSibling( Node node )
 }

 static void
-TrieNodeSetLetter( Node* nodeR, int letter )
+TrieNodeSetLetter( Node* nodeR, Letter letter )
 {
-    if( letter >= 64 ) {
+    if ( letter >= 64 ) {
        ERROR_EXIT( "letter %d too big", letter );
    }

@ -776,7 +813,7 @@ TrieNodeSetLetter( Node* nodeR, int letter )
    *nodeR |= (letter << 24);          // set new ones
 }

-static unsigned int
+static Letter
 TrieNodeGetLetter( Node node )
 {
    node >>= 24;
@ -804,7 +841,7 @@ TrieNodeGetFirstChildOffset( Node node )
 }

 static Node
-MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, 
+MakeTrieNode( Letter letter, bool isTerminal, int firstChildOffset, 
              bool isLastSibling )
 {
    Node result = 0;
@ -1001,7 +1038,7 @@ static void
 outputNode( Node node, int nBytes, FILE* outfile )
 {
    unsigned int fco = TrieNodeGetFirstChildOffset(node);
-    unsigned int fourthByte;
+    unsigned int fourthByte = 0;

    if ( nBytes == 4 ) {
        fourthByte = fco >> 16;
@ -1115,6 +1152,7 @@ parseARGV( int argc, char** argv, const char** inFileName )
 {
    *inFileName = NULL;
    int index = 1;
+    const char* enc = NULL;
    while ( index < argc ) {

        char* arg = argv[index++];
@ -1139,6 +1177,8 @@ parseARGV( int argc, char** argv, const char** inFileName )
            gTableFile = argv[index++];
        } else if ( 0 == strcmp( arg, "-ob" ) ) {
            gOutFileBase = argv[index++];
+        } else if ( 0 == strcmp( arg, "-enc" ) ) {
+            enc = argv[index++];
        } else if ( 0 == strcmp( arg, "-sn" ) ) {
            gStartNodeOut = argv[index++];
        } else if ( 0 == strcmp( arg, "-if" ) ) {
@ -1175,6 +1215,14 @@ parseARGV( int argc, char** argv, const char** inFileName )
        exit(1);
    }

+    if ( !!enc ) {
+        if ( !strcasecmp( enc, "UTF-8" ) ) {
+            gIsMultibyte = true;
+        } else {
+            ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
+        }
+    }
+
 #ifdef DEBUG
    if ( gDebug ) {
        fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
--- a/dawg/xloc.pl
+++ b/dawg/xloc.pl
@ -21,26 +21,44 @@
 use strict;
 use xloc;

-my $arg = shift(@ARGV);
-my $outfile = shift(@ARGV);
-my $lang = shift(@ARGV);
-my $path = "./$lang";
-my $infoFile = "$path/info.txt";
+my $unicode = -1;
+my $doval = 0;
+my $enc;
+my $outfile;
+
+my $arg;
+while ( $arg = $ARGV[0] ) {
+    if ( $arg eq '-enc' ) {
+        $enc = $ARGV[1];
+        shift @ARGV;
+    } elsif ( $arg eq "-tn" ) {
+        $unicode = 1;
+    } elsif ( $arg eq "-t" ) {
+        $unicode = 0;
+    } elsif ( $arg eq "-v" ) {
+        $doval = 1;
+    } elsif ( $arg eq '-out' ) {
+        $outfile = $ARGV[1];
+        shift @ARGV;
+    } else {
+        die "unknown arg $arg\n";
+    }
+    shift @ARGV;
+}
+
+my $infoFile = "info.txt";

 die "info file $infoFile not found\n" if ! -s $infoFile;

-
-my $xlocToken = xloc::ParseTileInfo($infoFile);
+my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);

 open OUTFILE, "> $outfile";
 # For f*cking windoze linefeeds
 binmode( OUTFILE );

-if ( $arg eq "-t" ) {
-    xloc::WriteMapFile( $xlocToken, 0, \*OUTFILE );
-} elsif ( $arg eq "-tn" ) {
-    xloc::WriteMapFile( $xlocToken, 1, \*OUTFILE );
-} elsif ( $arg eq "-v" ) {
+if ( $unicode ne -1 ) {
+    xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
+} elsif ( $doval ) {
    xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
 }

--- a/dawg/xloc.pm
+++ b/dawg/xloc.pm
@ -43,11 +43,17 @@ BEGIN {
 # for queries.  It's a hash with name-value pairs and an _INFO entry
 # containing a list of tile info lists.

-sub ParseTileInfo($) {
-    my ( $filePath ) = @_;
+sub ParseTileInfo($$) {
+    my ( $filePath, $enc ) = @_;
    my %result;

-    open INPUT, "<$filePath" or die "couldn't open $filePath";
+    if ( $enc ) {
+        open( INPUT, "<:encoding($enc)", "$filePath" ) 
+            or die "couldn't open $filePath";
+    } else {
+        open( INPUT, "<$filePath" ) 
+            or die "couldn't open $filePath";
+    }

    my $inTiles = 0;
    my @tiles;