From b45fc827712ae899636996f356d58b7eab423d80 Mon Sep 17 00:00:00 2001
From: ehouse <ehouse@0782aaa5-4710-0410-8820-a96bf9123855>
Date: Tue, 13 Jan 2009 13:32:07 +0000
Subject: [PATCH] Allow language Makefile to specify encoding.  Pass it to the
 perl and c++ dict builders, which use it to open files and to decide whether
 to do multibyte-to-wide-character conversion.

---
 dawg/Catalan/Makefile    |  1 +
 dawg/Makefile.langcommon | 18 +++++---
 dawg/dict2dawg.cpp       | 90 ++++++++++++++++++++++++++++++----------
 dawg/xloc.pl             | 42 +++++++++++++------
 dawg/xloc.pm             | 12 ++++--
 5 files changed, 121 insertions(+), 42 deletions(-)

diff --git a/dawg/Catalan/Makefile b/dawg/Catalan/Makefile
index 9e43dbe2c..3fd0fe1f8 100644
--- a/dawg/Catalan/Makefile
+++ b/dawg/Catalan/Makefile
@@ -18,6 +18,7 @@
 XWLANG=DISCbeta
 LANGCODE=ca_ES
 TARGET_TYPE ?= PALM
+ENC = UTF-8
 
 ifeq ($(TARGET_TYPE),PALM)
 PBITMS = ./bmps/palm
diff --git a/dawg/Makefile.langcommon b/dawg/Makefile.langcommon
index 0553d144e..05107f177 100644
--- a/dawg/Makefile.langcommon
+++ b/dawg/Makefile.langcommon
@@ -1,6 +1,6 @@
 # -*-mode: Makefile -*-
 
-# Copyright 2000-2002 by Eric House (xwords@eehouse.org)
+# Copyright 2000-2009 by Eric House (xwords@eehouse.org)
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -32,10 +32,16 @@ PAR = ../par.pl
 
 LANGUAGE = $(shell basename $$(pwd))
 
+ifdef ENC
+	ENCP = -enc $(ENC)
+endif
+
 # prefer the compiled version if available.  But don't compile it
 # automatically until it's a bit better tested.
 # DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\
 # 		../dict2dawg,../dict2dawg.pl)
+#
+# No: the perl version (dict2dawg.pl) no longer works.  Don't use it without fixing it first.
 
 DICT2DAWG = ../dict2dawg
 
@@ -245,7 +251,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
 	end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
 	echo $${start} and $${end}; \
 	zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-		-ob dawg$(XWLANG)$* \
+		-ob dawg$(XWLANG)$* $(ENCP) \
 		-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
 		-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
 	touch $@
@@ -261,20 +267,20 @@ allbins:
 
 table.bin:  ../xloc.pl 
 ifdef NEWDAWG
-	perl -I../ ../xloc.pl -tn $@
+	perl -I../ ../xloc.pl $(ENCP) -tn -out $@
 else
-	perl -I../ ../xloc.pl -t $@
+	perl -I../ ../xloc.pl -t -out $@
 endif
 
 values.bin:  ../xloc.pl 
-	perl -I../ ../xloc.pl -v $@
+	perl -I../ ../xloc.pl -v -out $@ $(ENCP)
 
 %.dict: %.dict.gz
 	zcat $< > $@
 
 # clean this up....
 ../dict2dawg: ../dict2dawg.cpp
-	cd ../ && g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp
+	cd ../ && g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp
 
 clean_common:
 	rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \
diff --git a/dawg/dict2dawg.cpp b/dawg/dict2dawg.cpp
index 6b74f8e6d..dbbc927a6 100644
--- a/dawg/dict2dawg.cpp
+++ b/dawg/dict2dawg.cpp
@@ -1,9 +1,10 @@
-/* -*- compile-command: "g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp"; -*- */
+/* -*- compile-command: "g++ -DDEBUG -O0 -Wall -g -o dict2dawg dict2dawg.cpp"; -*- */
 /*************************************************************************
  * adapted from perl code that was itself adapted from C++ code
  * Copyright (C) 2000 Falk Hueffner
 
- * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
+ * This version Copyright (C) 2002,2006-2009 Eric House
+ * (xwords@eehouse.org)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -77,6 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
 NodeList gNodes;       // final array of nodes
 unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
 char* gTableFile = NULL;
+static bool gIsMultibyte = false;
 char* gOutFileBase = NULL;
 char* gStartNodeOut = NULL;
 static FILE* gInFile = NULL;
@@ -87,7 +89,7 @@ char* gCountFile = NULL;
 char* gBytesPerNodeFile = NULL;        // where to write whether node
                                        // size 3 or 4
 int gWordCount = 0;
-std::map<char,int> gTableHash;
+std::map<Letter,wchar_t> gTableHash; // NOTE(review): readOneWord looks this up by input char and stores the result as a tile -- key/value types look swapped; confirm
 int gBlankIndex;
 std::vector<char> gRevMap;
 #ifdef DEBUG
@@ -121,14 +123,14 @@ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal );
 static bool TrieNodeGetIsTerminal( Node node );
 static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
 static bool TrieNodeGetIsLastSibling( Node node );
-static void TrieNodeSetLetter( Node* nodeR, int letter );
-static unsigned int TrieNodeGetLetter( Node node );
+static void TrieNodeSetLetter( Node* nodeR, Letter letter );
+static Letter TrieNodeGetLetter( Node node );
 static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco );
 static int TrieNodeGetFirstChildOffset( Node node );
 static int findSubArray( NodeList& newedgesR );
 static void registerSubArray( NodeList& edgesR, int nodeLoc );
-static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, 
-                          bool isLastSibling );
+static Node MakeTrieNode( Letter letter, bool isTerminal,
+                          int firstChildOffset, bool isLastSibling );
 static void printNodes( NodeList& nodesR );
 static void printNode( int index, Node node );
 static void moveTopToFront( int* firstRef );
@@ -142,6 +144,8 @@ static void readFromSortedArray( void );
 int 
 main( int argc, char** argv ) 
 { 
+    setlocale(LC_CTYPE, "");
+    
     gReadWordProc = readFromSortedArray;
 
     const char* inFileName;
@@ -287,7 +291,7 @@ buildNode( int depth )
 
     bool wordEnd;
     do {
-        char letter = gCurrentWord[depth];
+        Letter letter = gCurrentWord[depth];
         bool isTerminal = (gCurrentWordLen - 1) == depth;
 
         int nodeOffset = buildNode( depth + 1 );
@@ -336,7 +340,7 @@ addNodes( NodeList& newedgesR )
 static void
 printNode( int index, Node node )
 {
-    unsigned int letter = TrieNodeGetLetter(node);
+    Letter letter = TrieNodeGetLetter(node);
     assert( letter < gRevMap.size() );
     fprintf( stderr,
              "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", 
@@ -472,6 +476,38 @@ readFromSortedArray( void )
 #endif
 } // readFromSortedArray
 
+// Read one multibyte character (per the current LC_CTYPE locale) from file
+// and return it as a wchar_t.  EOF and gTermChar are returned unconverted
+// so the caller's end-of-input and terminator tests keep working.
+static wchar_t
+getWideChar( FILE* file )
+{
+    wchar_t dest = 0;
+    mbstate_t ps = {0};
+    int ii;
+
+    for ( ii = 0; ; ++ii ) {
+        int byt = getc( file );
+        if ( byt == EOF || byt == gTermChar ) {
+            dest = byt;
+            break;
+        }
+
+        // Feed one byte at a time: mbrtowc keeps the partial sequence in
+        // ps, so no NUL-terminated staging buffer is needed.
+        assert( ii < 4 );
+        char ch = (char)byt;
+        size_t siz = mbrtowc( &dest, &ch, 1, &ps );
+        if ( siz == (size_t)-1 ) {          // invalid sequence: ps is now unusable
+            ERROR_EXIT( "%s: invalid multibyte input (byte 0x%x)",
+                        __func__, byt );
+        } else if ( siz != (size_t)-2 ) {   // not "incomplete": char is done
+            break;
+        }
+    }
+    return dest;
+} // getWideChar
+
 static Letter*
 readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
 {
@@ -485,7 +521,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
     // return it.  If no, start over ONLY IF the terminator was not
     // EOF.
     for ( ; ; ) {
-        int byt = getc( gInFile );
+        wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );
 
         // EOF is special: we don't try for another word even if
         // dropWord is true; we must leave now.
@@ -523,7 +559,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
             // Don't call into the hashtable twice here!!
         } else if ( gTableHash.find(byt) != gTableHash.end() ) {
             assert( count < bufLen );
-            wordBuf[count++] = (char)gTableHash[byt];
+            wordBuf[count++] = gTableHash[byt];
             if ( count >= bufLen ) {
                 dropWord = true;
             }
@@ -534,9 +570,9 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
             tileToAscii( buf, sizeof(buf), wordBuf );
 
             if ( gKillIfMissing ) {
-                ERROR_EXIT( "chr %c (%d) not in map file %s\n"
+                ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
                             "last word was %s\n",
-                            (char)byt, (int)byt, gTableFile, buf );
+                            byt, (int)byt, (int)byt, gTableFile, buf );
             } else if ( !dropWord ) {
 #ifdef DEBUG
                 if ( gDebug ) {
@@ -551,7 +587,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
     }
 
 //     if ( NULL != result ) {
-//         char buf[MAX_WORD_LEN+1];
+//         char buf[T2ABUFLEN(MAX_WORD_LEN)];
 //         fprintf( stderr, "%s returning %s\n", __func__,
 //                  tileToAscii( buf, sizeof(buf), result ) );
 //     }
@@ -638,16 +674,17 @@ tileToAscii( char* out, int outSize, const Letter* in )
 
     char* orig = out;
     for ( ; ; ) {
-        char ch = *in++;
+        Letter ch = *in++;
         if ( '\0' == ch ) {
             break;
         }
-        assert( (unsigned int)ch < gRevMap.size() );
+        assert( ch < gRevMap.size() );
         *out++ = gRevMap[ch];
         tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
         assert( (out - orig) < outSize );
     }
 
+    assert( tilesLen+1 < outSize );
     tiles[tilesLen] = ']';
     tiles[tilesLen+1] = '\0';
     strcpy( out, tiles );
@@ -765,9 +802,9 @@ TrieNodeGetIsLastSibling( Node node )
 }
 
 static void
-TrieNodeSetLetter( Node* nodeR, int letter )
+TrieNodeSetLetter( Node* nodeR, Letter letter )
 {
-    if( letter >= 64 ) {
+    if ( letter >= 64 ) {
         ERROR_EXIT( "letter %d too big", letter );
     }
 
@@ -776,7 +813,7 @@ TrieNodeSetLetter( Node* nodeR, int letter )
     *nodeR |= (letter << 24);          // set new ones
 }
 
-static unsigned int
+static Letter
 TrieNodeGetLetter( Node node )
 {
     node >>= 24;
@@ -804,7 +841,7 @@ TrieNodeGetFirstChildOffset( Node node )
 }
 
 static Node
-MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, 
+MakeTrieNode( Letter letter, bool isTerminal, int firstChildOffset, 
               bool isLastSibling )
 {
     Node result = 0;
@@ -1001,7 +1038,7 @@ static void
 outputNode( Node node, int nBytes, FILE* outfile )
 {
     unsigned int fco = TrieNodeGetFirstChildOffset(node);
-    unsigned int fourthByte;
+    unsigned int fourthByte = 0;
 
     if ( nBytes == 4 ) {
         fourthByte = fco >> 16;
@@ -1115,6 +1152,7 @@ parseARGV( int argc, char** argv, const char** inFileName )
 {
     *inFileName = NULL;
     int index = 1;
+    const char* enc = NULL;
     while ( index < argc ) {
 
         char* arg = argv[index++];
@@ -1139,6 +1177,8 @@ parseARGV( int argc, char** argv, const char** inFileName )
             gTableFile = argv[index++];
         } else if ( 0 == strcmp( arg, "-ob" ) ) {
             gOutFileBase = argv[index++];
+        } else if ( 0 == strcmp( arg, "-enc" ) ) {
+            enc = argv[index++];
         } else if ( 0 == strcmp( arg, "-sn" ) ) {
             gStartNodeOut = argv[index++];
         } else if ( 0 == strcmp( arg, "-if" ) ) {
@@ -1175,6 +1215,14 @@ parseARGV( int argc, char** argv, const char** inFileName )
         exit(1);
     }
 
+    if ( !!enc ) {
+        if ( !strcasecmp( enc, "UTF-8" ) ) {
+            gIsMultibyte = true;
+        } else {
+            ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
+        }
+    }
+
 #ifdef DEBUG
     if ( gDebug ) {
         fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
diff --git a/dawg/xloc.pl b/dawg/xloc.pl
index cb21db81b..20b72fcc9 100755
--- a/dawg/xloc.pl
+++ b/dawg/xloc.pl
@@ -21,26 +21,44 @@
 use strict;
 use xloc;
 
-my $arg = shift(@ARGV);
-my $outfile = shift(@ARGV);
-my $lang = shift(@ARGV);
-my $path = "./$lang";
-my $infoFile = "$path/info.txt";
+my $unicode = -1;   # -1: no map file requested; 0: -t; 1: -tn
+my $doval = 0;      # set by -v: write the values file
+my $enc;            # optional encoding name, handed to ParseTileInfo
+my $outfile;        # output path, required
+# Parse options.  -enc and -out consume the argument that follows them.
+my $arg;
+while ( defined( $arg = shift @ARGV ) ) {
+    if ( $arg eq '-enc' ) {
+        $enc = shift @ARGV;
+        die "-enc requires an encoding name\n" if !defined $enc;
+    } elsif ( $arg eq "-tn" ) {
+        $unicode = 1;
+    } elsif ( $arg eq "-t" ) {
+        $unicode = 0;
+    } elsif ( $arg eq "-v" ) {
+        $doval = 1;
+    } elsif ( $arg eq '-out' ) {
+        $outfile = shift @ARGV;
+        die "-out requires a file name\n" if !defined $outfile;
+    } else {
+        die "unknown arg $arg\n";
+    }
+}
+die "no output file given (use -out <file>)\n" if !defined $outfile;
+
+my $infoFile = "info.txt";
 
 die "info file $infoFile not found\n" if ! -s $infoFile;
 
-
-my $xlocToken = xloc::ParseTileInfo($infoFile);
+my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);
 
 open OUTFILE, "> $outfile";
 # For f*cking windoze linefeeds
 binmode( OUTFILE );
 
-if ( $arg eq "-t" ) {
-    xloc::WriteMapFile( $xlocToken, 0, \*OUTFILE );
-} elsif ( $arg eq "-tn" ) {
-    xloc::WriteMapFile( $xlocToken, 1, \*OUTFILE );
-} elsif ( $arg eq "-v" ) {
+if ( $unicode ne -1 ) {
+    xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
+} elsif ( $doval ) {
     xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
 }
 
diff --git a/dawg/xloc.pm b/dawg/xloc.pm
index 4aefe7440..741968e76 100644
--- a/dawg/xloc.pm
+++ b/dawg/xloc.pm
@@ -43,11 +43,17 @@ BEGIN {
 # for queries.  It's a hash with name-value pairs and an _INFO entry
 # containing a list of tile info lists.
 
-sub ParseTileInfo($) {
-    my ( $filePath ) = @_;
+sub ParseTileInfo($$) {
+    my ( $filePath, $enc ) = @_;
     my %result;
 
-    open INPUT, "<$filePath" or die "couldn't open $filePath";
+    if ( $enc ) {
+        open( INPUT, "<:encoding($enc)", "$filePath" ) 
+            or die "couldn't open $filePath";
+    } else {
+        open( INPUT, "<$filePath" ) 
+            or die "couldn't open $filePath";
+    }
 
     my $inTiles = 0;
     my @tiles;