mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-02-05 20:45:49 +01:00
Allow a language Makefile to specify an encoding. Pass it to the Perl and C++
dict builders, which use it to open files and to decide whether to do multibyte-to-wide-character conversion.
This commit is contained in:
parent
2691b00dc9
commit
b45fc82771
5 changed files with 121 additions and 42 deletions
|
@ -18,6 +18,7 @@
|
|||
XWLANG=DISCbeta
|
||||
LANGCODE=ca_ES
|
||||
TARGET_TYPE ?= PALM
|
||||
ENC = UTF-8
|
||||
|
||||
ifeq ($(TARGET_TYPE),PALM)
|
||||
PBITMS = ./bmps/palm
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# -*-mode: Makefile -*-
|
||||
|
||||
# Copyright 2000-2002 by Eric House (xwords@eehouse.org)
|
||||
# Copyright 2000-2009 by Eric House (xwords@eehouse.org)
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
|
@ -32,10 +32,16 @@ PAR = ../par.pl
|
|||
|
||||
LANGUAGE = $(shell basename $$(pwd))
|
||||
|
||||
ifdef ENC
|
||||
ENCP = -enc $(ENC)
|
||||
endif
|
||||
|
||||
# prefer the compiled version if available. But don't compile it
|
||||
# automatically until it's a bit better tested.
|
||||
# DICT2DAWG = $(if $(shell test -x ../dict2dawg && echo foo),\
|
||||
# ../dict2dawg,../dict2dawg.pl)
|
||||
#
|
||||
# No. The perl version no longer works. Don't use without fixing.
|
||||
|
||||
DICT2DAWG = ../dict2dawg
|
||||
|
||||
|
@ -245,7 +251,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
|
|||
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
|
||||
echo $${start} and $${end}; \
|
||||
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
|
||||
-ob dawg$(XWLANG)$* \
|
||||
-ob dawg$(XWLANG)$* $(ENCP) \
|
||||
-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
|
||||
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
|
||||
touch $@
|
||||
|
@ -261,20 +267,20 @@ allbins:
|
|||
|
||||
table.bin: ../xloc.pl
|
||||
ifdef NEWDAWG
|
||||
perl -I../ ../xloc.pl -tn $@
|
||||
perl -I../ ../xloc.pl $(ENCP) -tn -out $@
|
||||
else
|
||||
perl -I../ ../xloc.pl -t $@
|
||||
perl -I../ ../xloc.pl -t -out $@
|
||||
endif
|
||||
|
||||
values.bin: ../xloc.pl
|
||||
perl -I../ ../xloc.pl -v $@
|
||||
perl -I../ ../xloc.pl -v -out $@ $(ENCP)
|
||||
|
||||
%.dict: %.dict.gz
|
||||
zcat $< > $@
|
||||
|
||||
# clean this up....
|
||||
../dict2dawg: ../dict2dawg.cpp
|
||||
cd ../ && g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp
|
||||
cd ../ && g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp
|
||||
|
||||
clean_common:
|
||||
rm -f $(XWLANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
/* -*- compile-command: "g++ -DDEBUG -O -Wall -o dict2dawg dict2dawg.cpp"; -*- */
|
||||
/* -*- compile-command: "g++ -DDEBUG -O0 -Wall -g -o dict2dawg dict2dawg.cpp"; -*- */
|
||||
/*************************************************************************
|
||||
* adapted from perl code that was itself adapted from C++ code
|
||||
* Copyright (C) 2000 Falk Hueffner
|
||||
|
||||
* This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
|
||||
* This version Copyright (C) 2002,2006-2009 Eric House
|
||||
* (xwords@eehouse.org)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -77,6 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
|
|||
NodeList gNodes; // final array of nodes
|
||||
unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
|
||||
char* gTableFile = NULL;
|
||||
static bool gIsMultibyte = false;
|
||||
char* gOutFileBase = NULL;
|
||||
char* gStartNodeOut = NULL;
|
||||
static FILE* gInFile = NULL;
|
||||
|
@ -87,7 +89,7 @@ char* gCountFile = NULL;
|
|||
char* gBytesPerNodeFile = NULL; // where to write whether node
|
||||
// size 3 or 4
|
||||
int gWordCount = 0;
|
||||
std::map<char,int> gTableHash;
|
||||
std::map<Letter,wchar_t> gTableHash;
|
||||
int gBlankIndex;
|
||||
std::vector<char> gRevMap;
|
||||
#ifdef DEBUG
|
||||
|
@ -121,14 +123,14 @@ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal );
|
|||
static bool TrieNodeGetIsTerminal( Node node );
|
||||
static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
|
||||
static bool TrieNodeGetIsLastSibling( Node node );
|
||||
static void TrieNodeSetLetter( Node* nodeR, int letter );
|
||||
static unsigned int TrieNodeGetLetter( Node node );
|
||||
static void TrieNodeSetLetter( Node* nodeR, Letter letter );
|
||||
static Letter TrieNodeGetLetter( Node node );
|
||||
static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco );
|
||||
static int TrieNodeGetFirstChildOffset( Node node );
|
||||
static int findSubArray( NodeList& newedgesR );
|
||||
static void registerSubArray( NodeList& edgesR, int nodeLoc );
|
||||
static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset,
|
||||
bool isLastSibling );
|
||||
static Node MakeTrieNode( Letter letter, bool isTerminal,
|
||||
int firstChildOffset, bool isLastSibling );
|
||||
static void printNodes( NodeList& nodesR );
|
||||
static void printNode( int index, Node node );
|
||||
static void moveTopToFront( int* firstRef );
|
||||
|
@ -142,6 +144,8 @@ static void readFromSortedArray( void );
|
|||
int
|
||||
main( int argc, char** argv )
|
||||
{
|
||||
setlocale(LC_CTYPE, "");
|
||||
|
||||
gReadWordProc = readFromSortedArray;
|
||||
|
||||
const char* inFileName;
|
||||
|
@ -287,7 +291,7 @@ buildNode( int depth )
|
|||
|
||||
bool wordEnd;
|
||||
do {
|
||||
char letter = gCurrentWord[depth];
|
||||
Letter letter = gCurrentWord[depth];
|
||||
bool isTerminal = (gCurrentWordLen - 1) == depth;
|
||||
|
||||
int nodeOffset = buildNode( depth + 1 );
|
||||
|
@ -336,7 +340,7 @@ addNodes( NodeList& newedgesR )
|
|||
static void
|
||||
printNode( int index, Node node )
|
||||
{
|
||||
unsigned int letter = TrieNodeGetLetter(node);
|
||||
Letter letter = TrieNodeGetLetter(node);
|
||||
assert( letter < gRevMap.size() );
|
||||
fprintf( stderr,
|
||||
"[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n",
|
||||
|
@ -472,6 +476,38 @@ readFromSortedArray( void )
|
|||
#endif
|
||||
} // readFromSortedArray
|
||||
|
||||
static wchar_t
|
||||
getWideChar( FILE* file )
|
||||
{
|
||||
wchar_t dest;
|
||||
char src[4] = { '\0' };
|
||||
const char* srcp = src;
|
||||
int ii;
|
||||
mbstate_t ps = {0};
|
||||
|
||||
for ( ii = 0; ; ++ii ) {
|
||||
int byt = getc( file );
|
||||
size_t siz;
|
||||
|
||||
if ( byt == EOF || byt == gTermChar ) {
|
||||
dest = byt;
|
||||
break;
|
||||
}
|
||||
|
||||
assert( ii < 4 );
|
||||
src[ii] = byt;
|
||||
siz = mbsrtowcs( &dest, &srcp, 1, &ps );
|
||||
|
||||
if ( siz == (size_t)-1 ) {
|
||||
continue;
|
||||
} else if ( siz == 1 ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// fprintf( stderr, "%s=>%lc\n", __func__, dest );
|
||||
return dest;
|
||||
} // getWideChar
|
||||
|
||||
static Letter*
|
||||
readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
||||
{
|
||||
|
@ -485,7 +521,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
|||
// return it. If no, start over ONLY IF the terminator was not
|
||||
// EOF.
|
||||
for ( ; ; ) {
|
||||
int byt = getc( gInFile );
|
||||
wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );
|
||||
|
||||
// EOF is special: we don't try for another word even if
|
||||
// dropWord is true; we must leave now.
|
||||
|
@ -523,7 +559,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
|||
// Don't call into the hashtable twice here!!
|
||||
} else if ( gTableHash.find(byt) != gTableHash.end() ) {
|
||||
assert( count < bufLen );
|
||||
wordBuf[count++] = (char)gTableHash[byt];
|
||||
wordBuf[count++] = gTableHash[byt];
|
||||
if ( count >= bufLen ) {
|
||||
dropWord = true;
|
||||
}
|
||||
|
@ -534,9 +570,9 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
|||
tileToAscii( buf, sizeof(buf), wordBuf );
|
||||
|
||||
if ( gKillIfMissing ) {
|
||||
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
|
||||
ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
|
||||
"last word was %s\n",
|
||||
(char)byt, (int)byt, gTableFile, buf );
|
||||
byt, (int)byt, (int)byt, gTableFile, buf );
|
||||
} else if ( !dropWord ) {
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
|
@ -551,7 +587,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
|||
}
|
||||
|
||||
// if ( NULL != result ) {
|
||||
// char buf[MAX_WORD_LEN+1];
|
||||
// char buf[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
// fprintf( stderr, "%s returning %s\n", __func__,
|
||||
// tileToAscii( buf, sizeof(buf), result ) );
|
||||
// }
|
||||
|
@ -638,16 +674,17 @@ tileToAscii( char* out, int outSize, const Letter* in )
|
|||
|
||||
char* orig = out;
|
||||
for ( ; ; ) {
|
||||
char ch = *in++;
|
||||
Letter ch = *in++;
|
||||
if ( '\0' == ch ) {
|
||||
break;
|
||||
}
|
||||
assert( (unsigned int)ch < gRevMap.size() );
|
||||
assert( ch < gRevMap.size() );
|
||||
*out++ = gRevMap[ch];
|
||||
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
|
||||
assert( (out - orig) < outSize );
|
||||
}
|
||||
|
||||
assert( tilesLen+1 < outSize );
|
||||
tiles[tilesLen] = ']';
|
||||
tiles[tilesLen+1] = '\0';
|
||||
strcpy( out, tiles );
|
||||
|
@ -765,9 +802,9 @@ TrieNodeGetIsLastSibling( Node node )
|
|||
}
|
||||
|
||||
static void
|
||||
TrieNodeSetLetter( Node* nodeR, int letter )
|
||||
TrieNodeSetLetter( Node* nodeR, Letter letter )
|
||||
{
|
||||
if( letter >= 64 ) {
|
||||
if ( letter >= 64 ) {
|
||||
ERROR_EXIT( "letter %d too big", letter );
|
||||
}
|
||||
|
||||
|
@ -776,7 +813,7 @@ TrieNodeSetLetter( Node* nodeR, int letter )
|
|||
*nodeR |= (letter << 24); // set new ones
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
static Letter
|
||||
TrieNodeGetLetter( Node node )
|
||||
{
|
||||
node >>= 24;
|
||||
|
@ -804,7 +841,7 @@ TrieNodeGetFirstChildOffset( Node node )
|
|||
}
|
||||
|
||||
static Node
|
||||
MakeTrieNode( int letter, bool isTerminal, int firstChildOffset,
|
||||
MakeTrieNode( Letter letter, bool isTerminal, int firstChildOffset,
|
||||
bool isLastSibling )
|
||||
{
|
||||
Node result = 0;
|
||||
|
@ -1001,7 +1038,7 @@ static void
|
|||
outputNode( Node node, int nBytes, FILE* outfile )
|
||||
{
|
||||
unsigned int fco = TrieNodeGetFirstChildOffset(node);
|
||||
unsigned int fourthByte;
|
||||
unsigned int fourthByte = 0;
|
||||
|
||||
if ( nBytes == 4 ) {
|
||||
fourthByte = fco >> 16;
|
||||
|
@ -1115,6 +1152,7 @@ parseARGV( int argc, char** argv, const char** inFileName )
|
|||
{
|
||||
*inFileName = NULL;
|
||||
int index = 1;
|
||||
const char* enc = NULL;
|
||||
while ( index < argc ) {
|
||||
|
||||
char* arg = argv[index++];
|
||||
|
@ -1139,6 +1177,8 @@ parseARGV( int argc, char** argv, const char** inFileName )
|
|||
gTableFile = argv[index++];
|
||||
} else if ( 0 == strcmp( arg, "-ob" ) ) {
|
||||
gOutFileBase = argv[index++];
|
||||
} else if ( 0 == strcmp( arg, "-enc" ) ) {
|
||||
enc = argv[index++];
|
||||
} else if ( 0 == strcmp( arg, "-sn" ) ) {
|
||||
gStartNodeOut = argv[index++];
|
||||
} else if ( 0 == strcmp( arg, "-if" ) ) {
|
||||
|
@ -1175,6 +1215,14 @@ parseARGV( int argc, char** argv, const char** inFileName )
|
|||
exit(1);
|
||||
}
|
||||
|
||||
if ( !!enc ) {
|
||||
if ( !strcasecmp( enc, "UTF-8" ) ) {
|
||||
gIsMultibyte = true;
|
||||
} else {
|
||||
ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
|
||||
|
|
42
dawg/xloc.pl
42
dawg/xloc.pl
|
@ -21,26 +21,44 @@
|
|||
use strict;
|
||||
use xloc;
|
||||
|
||||
my $arg = shift(@ARGV);
|
||||
my $outfile = shift(@ARGV);
|
||||
my $lang = shift(@ARGV);
|
||||
my $path = "./$lang";
|
||||
my $infoFile = "$path/info.txt";
|
||||
my $unicode = -1;
|
||||
my $doval = 0;
|
||||
my $enc;
|
||||
my $outfile;
|
||||
|
||||
my $arg;
|
||||
while ( $arg = $ARGV[0] ) {
|
||||
if ( $arg eq '-enc' ) {
|
||||
$enc = $ARGV[1];
|
||||
shift @ARGV;
|
||||
} elsif ( $arg eq "-tn" ) {
|
||||
$unicode = 1;
|
||||
} elsif ( $arg eq "-t" ) {
|
||||
$unicode = 0;
|
||||
} elsif ( $arg eq "-v" ) {
|
||||
$doval = 1;
|
||||
} elsif ( $arg eq '-out' ) {
|
||||
$outfile = $ARGV[1];
|
||||
shift @ARGV;
|
||||
} else {
|
||||
die "unknown arg $arg\n";
|
||||
}
|
||||
shift @ARGV;
|
||||
}
|
||||
|
||||
my $infoFile = "info.txt";
|
||||
|
||||
die "info file $infoFile not found\n" if ! -s $infoFile;
|
||||
|
||||
|
||||
my $xlocToken = xloc::ParseTileInfo($infoFile);
|
||||
my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);
|
||||
|
||||
open OUTFILE, "> $outfile";
|
||||
# For f*cking windoze linefeeds
|
||||
binmode( OUTFILE );
|
||||
|
||||
if ( $arg eq "-t" ) {
|
||||
xloc::WriteMapFile( $xlocToken, 0, \*OUTFILE );
|
||||
} elsif ( $arg eq "-tn" ) {
|
||||
xloc::WriteMapFile( $xlocToken, 1, \*OUTFILE );
|
||||
} elsif ( $arg eq "-v" ) {
|
||||
if ( $unicode ne -1 ) {
|
||||
xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
|
||||
} elsif ( $doval ) {
|
||||
xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
|
||||
}
|
||||
|
||||
|
|
12
dawg/xloc.pm
12
dawg/xloc.pm
|
@ -43,11 +43,17 @@ BEGIN {
|
|||
# for queries. It's a hash with name-value pairs and an _INFO entry
|
||||
# containing a list of tile info lists.
|
||||
|
||||
sub ParseTileInfo($) {
|
||||
my ( $filePath ) = @_;
|
||||
sub ParseTileInfo($$) {
|
||||
my ( $filePath, $enc ) = @_;
|
||||
my %result;
|
||||
|
||||
open INPUT, "<$filePath" or die "couldn't open $filePath";
|
||||
if ( $enc ) {
|
||||
open( INPUT, "<:encoding($enc)", "$filePath" )
|
||||
or die "couldn't open $filePath";
|
||||
} else {
|
||||
open( INPUT, "<$filePath" )
|
||||
or die "couldn't open $filePath";
|
||||
}
|
||||
|
||||
my $inTiles = 0;
|
||||
my @tiles;
|
||||
|
|
Loading…
Add table
Reference in a new issue