diff --git a/xwords4/dawg/Makefile b/xwords4/dawg/Makefile index 8e0623956..736a65152 100644 --- a/xwords4/dawg/Makefile +++ b/xwords4/dawg/Makefile @@ -23,7 +23,7 @@ byodfiles.tgz: byodfiles.tar byodfiles.tar: dict2dawg rm -f $@ langinfo - tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.pl ./xloc.pm + tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.py for dir in $$(ls .); do \ if [ $$dir = "Hëx" ]; then \ :; \ diff --git a/xwords4/dawg/Makefile.langcommon b/xwords4/dawg/Makefile.langcommon index 04471c625..5ae03dceb 100644 --- a/xwords4/dawg/Makefile.langcommon +++ b/xwords4/dawg/Makefile.langcommon @@ -269,24 +269,24 @@ allbins: $(MAKE) TARGET_TYPE=FRANK byodbins rm palmspecials.bin -table.bin: ../xloc.pl +table.bin: ../xloc.py ifdef NEWDAWG - perl -I../ ../xloc.pl $(ENCP) -tn -out $@ + ../xloc.py $(ENCP) -tn -out $@ else - perl -I../ ../xloc.pl -t -out $@ + error endif -values.bin: ../xloc.pl - perl -I../ ../xloc.pl -v -out $@ +values.bin: ../xloc.py + ../xloc.py -v -out $@ # a binary file, two bytes, one giving the size of tiles data and the # other the number of tiles in the dict. Tiles data is utf-8 and so # number is not derivable from size. -$(XWLANG)_charcount.bin: table.bin ../xloc.pl +$(XWLANG)_charcount.bin: table.bin ../xloc.py SIZ=$$(ls -l $< | awk '{print $$5}'); \ perl -e "print pack(\"c\",$$SIZ)" > $@ TMP=/tmp/tmp$$$$; \ - perl -I../ ../xloc.pl -s -out $$TMP; \ + ../xloc.py -s -out $$TMP; \ cat $$TMP >> $@; \ rm -f $$TMP diff --git a/xwords4/dawg/dictstats.pl b/xwords4/dawg/dictstats.pl deleted file mode 100755 index 819b82e35..000000000 --- a/xwords4/dawg/dictstats.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/perl - -# print stats about in input stream that's assumed to be a dictionary. -# Counts and percentages of each letter, as well as total numbers of -# words. This is not part of the dictionary build process. 
I use it -# for creating info.txt files for new languages and debugging the -# creation of dictionaries from new wordlists. -# -# Something like this might form the basis for choosing counts and -# values for tiles without using the conventions established by -# Scrabble players. This isn't enough, though: the frequency of -# letter tuples and triples -- how often letters appear together -- is -# a better indicator than just letter count. - -use strict; - -my @wordSizeCounts; -my %letterCounts; -my $wordCount; -my $letterCount; -my $enc = "utf8"; # this could be a cmdline arg.... - -if ( $enc ) { - binmode( STDOUT, ":encoding($enc)" ) ; - binmode( STDIN, ":encoding($enc)" ) ; -} - -while (<>) { - - chomp; - - ++$wordSizeCounts[length]; - ++$wordCount; - - foreach my $letter (split( / */ ) ) { - my $ii = ord($letter); - # special-case the bogus chars we add for "specials" - die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0; - ++$letterCounts{$letter}; - ++$letterCount; - } -} - -print "Number of words: $wordCount\n"; -print "Number of letters: $letterCount\n\n"; - - -print "**** word sizes ****\n"; -print "SIZE COUNT PERCENT\n"; -my $pctTotal = 0.0; -my $wordTotal = 0; -for ( my $i = 1 ; $i <= 99; ++$i ) { - my $count = $wordSizeCounts[$i]; - $wordTotal += $count; - if ( $count > 0 ) { - my $pct = (100.00 * $count)/$wordCount; - $pctTotal += $pct; - printf "%2d %6d %.2f\n", $i, $count, $pct; - } -} -printf "-------------------------------\n"; -printf " %6d %.2f\n", $wordTotal, $pctTotal; - - -print "\n\n**** Letter counts ****\n"; -print " ASCII ORD HEX PCT (of $letterCount)\n"; -my $lineNo = 1; -foreach my $key (sort keys %letterCounts) { - my $count = $letterCounts{$key}; - my $pct = (100.00 * $count) / $letterCount; - printf( "%2d: %3s %3d %x %5.2f (%d)\n", - $lineNo, $key, ord($key), ord($key), $pct, $count ); - ++$lineNo; -} - -print "\n"; diff --git a/xwords4/dawg/dictstats.py b/xwords4/dawg/dictstats.py new file mode 100755 index 
#!/usr/bin/env python3
"""Print statistics about a word list read from standard input.

Reports the counts and percentages of each word length and of each
letter, as well as the total numbers of words and letters.  This is not
part of the dictionary build process; it is used for creating info.txt
files for new languages and for debugging the creation of dictionaries
from new wordlists.

Something like this might form the basis for choosing counts and values
for tiles without using the conventions established by Scrabble
players.  This isn't enough, though: the frequency of letter tuples and
triples -- how often letters appear together -- is a better indicator
than just letter count.
"""

import sys
from collections import Counter


def _collect_counts(lines):
    """Tally word lengths and letter frequencies for an iterable of lines.

    Each line is stripped of surrounding whitespace and treated as one
    word (an empty line counts as a word of length 0).

    Returns a ``(size_counts, letter_counts)`` pair of Counters.

    Raises ValueError when a word contains a character whose code point
    lies in 4..32; the Perl original died on the same condition.
    """
    size_counts = Counter()
    letter_counts = Counter()
    for raw in lines:
        word = raw.strip()
        size_counts[len(word)] += 1
        for letter in word:
            code = ord(letter)
            # Explicit check rather than assert, so that it still runs
            # when the interpreter is started with -O.
            if 4 <= code <= 32:
                raise ValueError('letter {} out of range'.format(code))
        letter_counts.update(word)
    return size_counts, letter_counts


def main():
    """Read words from stdin, one per line, and print the statistics."""
    # Iterating the stream avoids holding the whole word list in memory.
    size_counts, letter_counts = _collect_counts(sys.stdin)
    word_count = sum(size_counts.values())
    letter_count = sum(letter_counts.values())

    print('Number of words: {}'.format(word_count))
    print('Number of letters: {}'.format(letter_count))
    print('')

    print('**** word sizes ****')
    print('SIZE COUNT PERCENT')
    pct_total = 0.0
    word_total = 0
    for size in sorted(size_counts):
        count = size_counts[size]
        word_total += count
        pct = (100.00 * count) / word_count
        pct_total += pct
        print('{:2d} {:6d} {:02.2f}'.format(size, count, pct))

    print('-------------------------------')
    print(' {:6d} {:.2f}'.format(word_total, pct_total))
    print('')

    pct_total = 0.0
    print('**** Letter counts ****')
    print(' ASCII ORD HEX PCT (of {})'.format(letter_count))
    for line_no, letter in enumerate(sorted(letter_counts), start=1):
        count = letter_counts[letter]
        pct = (100.00 * count) / letter_count
        pct_total += pct
        print('{:2d}: {: >6s} {:2d} {:x} {:5.2f} ({:d})'
              .format(line_no, letter, ord(letter), ord(letter), pct, count))

    print('percent total {:.2f}'.format(pct_total))
    print('')


##############################################################################
if __name__ == '__main__':
    main()
- -use strict; - -while ( @ARGV ) { - my $str = shift(); - my $largebmp = shift(); - my $smallbmp = shift(); - - doOne( $str, $largebmp, $smallbmp ); -} - -sub doOne { - my ( $str, $largebmp, $smallbmp ) = @_; - - print pack( "C", length($str) ); - print $str; - - print STDERR "looking at $largebmp", "\n"; - - die "file $largebmp does not exist\n" unless -e $largebmp; - print `cat $largebmp | ../pbitm2bin.pl`; - die "file $smallbmp does not exist\n" unless -e $smallbmp; - print `cat $smallbmp | ../pbitm2bin.pl`; -} - - diff --git a/xwords4/dawg/xloc.pl b/xwords4/dawg/xloc.pl deleted file mode 100755 index 23ef0ca43..000000000 --- a/xwords4/dawg/xloc.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- -# test and wrapper file for xloc.pm - -use strict; -use xloc; - -my $unicode = -1; -my $doval = 0; -my $dosize = 0; -my $enc; -my $outfile; - -my $arg; -while ( $arg = $ARGV[0] ) { - if ( $arg eq '-enc' ) { - $enc = $ARGV[1]; - shift @ARGV; - } elsif ( $arg eq "-tn" ) { - $unicode = 1; - } elsif ( $arg eq "-t" ) { - $unicode = 0; - } elsif ( $arg eq "-v" ) { - $doval = 1; - } elsif ( $arg eq "-s" ) { - $dosize = 1; - } elsif ( $arg eq '-out' ) { - $outfile = $ARGV[1]; - shift @ARGV; - } else { - die "unknown arg $arg\n"; - } - shift @ARGV; -} - -my $infoFile = "info.txt"; - -die "info file $infoFile not found\n" if ! -s $infoFile; - -my $xlocToken = xloc::ParseTileInfo($infoFile, $enc); - -if ( $enc ) { - open OUTFILE, ">:encoding($enc)", "$outfile" - or die "couldn't open $outfile"; -} else { - open OUTFILE, ">$outfile" or die "couldn't open $outfile"; -} -# For f*cking windoze linefeeds -# binmode( OUTFILE ); - -if ( $unicode ne -1 ) { - xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE ); -} elsif ( $dosize ) { - my $count = xloc::GetNTiles( $xlocToken ); - print OUTFILE pack("c", $count ); -} elsif ( $doval ) { - xloc::WriteValuesFile( $xlocToken, \*OUTFILE ); -} - -close OUTFILE; diff --git a/xwords4/dawg/xloc.pm b/xwords4/dawg/xloc.pm deleted file mode 100644 index 2bcb624b5..000000000 --- a/xwords4/dawg/xloc.pm +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2002-2014 by Eric House (xwords@eehouse.org). All rights -# reserved. -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -# The idea here is that all that matters about a language is stored in -# one file (possibly excepting rules for prepping a dictionary). -# There's a list of tile faces, counts and values, and also some -# name-value pairs as needed. The pairs come first, and then a list -# of tiles. - -package xloc; - -use strict; -use warnings; -# force output in utf8 -use open qw/:std :utf8/; - -BEGIN { - use Exporter (); - our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); - - $VERSION = 1.00; - - @ISA = qw(Exporter); - @EXPORT = qw(&ParseTileInfo &GetNTiles &TileFace &TileValue - &TileCount &GetValue &WriteMapFile &WriteValuesFile); - %EXPORT_TAGS = ( ); -} - -# Returns what's meant to be an opaque object that can be passed back -# for queries. It's a hash with name-value pairs and an _INFO entry -# containing a list of tile info lists. 
- -sub ParseTileInfo($$) { - my ( $filePath, $enc ) = @_; - my %result; - - if ( $enc ) { - open( INPUT, "<:encoding($enc)", "$filePath" ) - or die "couldn't open $filePath"; - } else { - open( INPUT, "<$filePath" ) - or die "couldn't open $filePath"; - } - - my $inTiles = 0; - my @tiles; - while ( ) { - - chomp; - s/\#.*$//; - s/^\s*$//; # nuke all-white-space lines - next if !length; - - if ( $inTiles ) { - if ( // ) { - last; - } else { - my ( $count, $val, $face ) = m/^\s*(\w+)\s+(\w+)\s+(.*)\s*$/; - push @tiles, [ $count, $val, $face ]; - } - } elsif ( /\w:/ ) { - my ( $nam, $val ) = split ':', $_, 2; - $result{$nam} .= $val; - } elsif ( // ) { - $inTiles = 1; - } - - } - - close INPUT; - - $result{"_TILES"} = [ @tiles ]; - - return \%result; -} - -sub GetNTiles($) { - my ( $hashR ) = @_; - - my $listR = ${$hashR}{"_TILES"}; - - return 0 + @{$listR}; -} - -sub GetValue($$) { - my ( $hashR, $name ) = @_; - return ${$hashR}{$name}; -} - -sub printLetters($$) { - my ( $str, $fhr ) = @_; - my @letters = split( /\|/, $str ); - $str = join( " ", @letters ); - for ( my $key = 0; $key < length($str); ++$key ) { - my $chr = substr( $str, $key, 1 ); - print $fhr pack( "U", ord($chr) ); - } -} - -sub WriteMapFile($$$) { - my ( $hashR, $unicode, $fhr ) = @_; - - my $count = GetNTiles($hashR); - my $specialCount = 0; - for ( my $i = 0; $i < $count; ++$i ) { - my $tileR = GetNthTile( $hashR, $i ); - my $str = ${$tileR}[2]; - - if ( $str =~ /\'(.(\|.)*)\'/ ) { - printLetters( $1, $fhr ); - } elsif ( $str =~ /\"(.+)\"/ ) { - print $fhr pack( "c", $specialCount++ ); - } elsif ( $str =~ /(\d+)/ ) { - print $fhr pack( "n", $1 ); - } else { - die "WriteMapFile: unrecognized face format $str, elem $i"; - } - } -} # WriteMapFile - -sub WriteValuesFile($$) { - my ( $hashR, $fhr ) = @_; - - my $header = GetValue( $hashR,"XLOC_HEADER" ); - die "no XLOC_HEADER found" if ! 
#!/usr/bin/env python3
"""Generate binary tile tables from a language's info.txt.

Everything that matters about a language is stored in one file: some
name:value pairs followed by a list of tiles between <BEGIN_TILES> and
<END_TILES> markers.  Depending on the command-line flag this script
writes the tile-face table (-tn), the tile values file (-v) or a
one-byte tile count (-s) to the file named by -out.
"""

import argparse
import os
import re
import struct
import sys


def errorOut(msg):
    """Report a fatal error on stderr and exit with status 1."""
    print('ERROR: {}'.format(msg), file=sys.stderr)
    sys.exit(1)


def mkParser():
    """Build the command-line parser.

    Only the -tn table flavour is offered; the Perl script's -t flag is
    not provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-enc', dest = 'ENCODING', type = str, help = 'use this encoding' )
    parser.add_argument('-tn', dest = 'DO_TABLE', action = 'store_true', help = 'output table file' )
    parser.add_argument('-v', dest = 'DO_VALS', action = 'store_true', help = 'output values file' )
    parser.add_argument('-s', dest = 'DO_SIZE', action = 'store_true', help = 'output size file')
    parser.add_argument('-out', dest = 'OUTFILE', type = str, help = 'outfile path')
    return parser


# Raw strings keep regex escapes such as \w and \| from being treated as
# (invalid) string escape sequences.
# Non-greedy, so a comment starts at the FIRST '#', as in the Perl script.
sPreComment = re.compile(r'^(.*?)#.*$')
sVarAssign = re.compile(r'^(\w+):(.*)$')
sBeginTiles = re.compile(r'^<BEGIN_TILES>$')
sEndTiles = re.compile(r'^<END_TILES>$')
# '*' rather than '+', so a face with a single letter ('Q') is accepted
# as well as one with alternatives ('A|a'); this matches the Perl pattern.
sSingleCharMatch = re.compile(r"'(.(\|.)*)'")
sSpecialsMatch = re.compile(r'{"(.+)"}')


def parseTileInfo(infoFile, encoding):
    """Parse an info.txt file into a dict.

    infoFile -- path of the file to read.
    encoding -- text encoding to decode it with; None selects the
                platform default used by open().

    Returns a dict mapping each name found in a "name:value" line to its
    value (repeated names are concatenated), plus a '_TILES' entry
    holding one tuple of three strings per tile line.  Text after a '#'
    and blank lines are ignored, and parsing stops at <END_TILES>.

    Raises ValueError for a tile line with fewer than three fields.
    """
    result = {'_TILES' : []}
    with open(infoFile, 'rt', encoding = encoding) as file:
        lines = file.read().split('\n')

    inTiles = False
    for line in lines:
        match = sPreComment.match(line)
        if match:
            line = match.group(1)
        stripped = line.strip()
        if not stripped:
            continue            # blank, whitespace-only or comment-only line

        if inTiles:
            if sEndTiles.match(stripped):
                break
            # maxsplit of 2 leaves the face intact even when it contains
            # whitespace.
            fields = stripped.split(None, 2)
            if len(fields) != 3:
                raise ValueError('malformed tile line: {}'.format(stripped))
            result['_TILES'].append(tuple(fields))
        elif sBeginTiles.match(stripped):
            inTiles = True
        else:
            # Matched against the unstripped line so values keep any
            # surrounding whitespace exactly as before.
            match = sVarAssign.match(line)
            if match:
                var = match.group(1)
                result[var] = result.get(var, '') + match.group(2)

    return result


class XLOC():
    """Placeholder type; it currently carries no data or behaviour."""


def readXLOC():
    """Return a new, empty XLOC placeholder."""
    return XLOC()


def printLetters( letters, outfile ):
    """Write the '|'-separated letters to outfile as UTF-8, joined by spaces."""
    outfile.write(' '.join(letters.split('|')).encode('utf8'))


def writeMapFile(xlocToken, outfile):
    """Write the tile-face table for every tile in xlocToken to outfile.

    A quoted letter face ('A' or 'A|a') is written as UTF-8 text; a
    special face ({"..."}) is written as a single byte holding the
    running index of that special.  Any other face is a fatal error.

    NOTE(review): the Perl WriteMapFile also accepted a bare numeric
    face, packed as a 16-bit value; that case is not handled here.
    """
    specialCount = 0
    for index, tile in enumerate(xlocToken['_TILES']):
        face = tile[2]
        match = sSingleCharMatch.match(face)
        if match:
            printLetters( match.group(1), outfile )
            continue
        if sSpecialsMatch.match(face):
            outfile.write(struct.pack('B', specialCount ))
            specialCount += 1
            continue
        # An explicit failure: an assert would be stripped under -O and
        # the bad tile silently skipped.
        errorOut('writeMapFile: unrecognized face format {}, elem {}'
                 .format(face, index))


def writeValuesFile(xlocToken, outfile):
    """Write the values file to outfile.

    The output is the XLOC_HEADER value (parsed as hexadecimal) as a
    big-endian 16-bit integer, followed by two bytes per tile taken from
    the first and second fields of its tile line.
    """
    header = xlocToken.get('XLOC_HEADER')
    if not header:
        errorOut('no XLOC_HEADER found')

    print('writing header: {}'.format(header), file = sys.stderr)
    outfile.write(struct.pack('!H', int(header, 16)))

    for tile in xlocToken['_TILES']:
        val = int(tile[0])
        count = int(tile[1])
        outfile.write(struct.pack('BB', val, count))


def main():
    """Parse the command line and write the requested file from info.txt."""
    args = mkParser().parse_args()
    if not args.OUTFILE:
        errorOut('-out is required')
    if args.DO_SIZE and args.DO_VALS:
        errorOut('-s and -v are mutually exclusive')
    if not (args.DO_TABLE or args.DO_SIZE or args.DO_VALS):
        # Refuse rather than silently leaving an empty output file.
        errorOut('one of -tn, -s or -v is required')

    infoFile = 'info.txt'
    if not os.path.exists(infoFile):
        errorOut('{} not found'.format(infoFile))
    xlocToken = parseTileInfo(infoFile, args.ENCODING)

    with open(args.OUTFILE, 'wb') as outfile:
        if args.DO_TABLE:
            writeMapFile(xlocToken, outfile)
        elif args.DO_SIZE:
            count = len(xlocToken['_TILES'])
            outfile.write(struct.pack('!B', count))
        else:
            writeValuesFile( xlocToken, outfile )


##############################################################################
if __name__ == '__main__':
    main()