rewrite some dawg perl scripts in python

This commit is contained in:
Eric House 2020-06-30 09:48:08 -07:00
parent 6f049c0b27
commit 98d134b491
8 changed files with 241 additions and 401 deletions

View file

@ -23,7 +23,7 @@ byodfiles.tgz: byodfiles.tar
byodfiles.tar: dict2dawg byodfiles.tar: dict2dawg
rm -f $@ langinfo rm -f $@ langinfo
tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.pl ./xloc.pm tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.py
for dir in $$(ls .); do \ for dir in $$(ls .); do \
if [ $$dir = "Hëx" ]; then \ if [ $$dir = "Hëx" ]; then \
:; \ :; \

View file

@ -269,24 +269,24 @@ allbins:
$(MAKE) TARGET_TYPE=FRANK byodbins $(MAKE) TARGET_TYPE=FRANK byodbins
rm palmspecials.bin rm palmspecials.bin
table.bin: ../xloc.pl table.bin: ../xloc.py
ifdef NEWDAWG ifdef NEWDAWG
perl -I../ ../xloc.pl $(ENCP) -tn -out $@ ../xloc.py $(ENCP) -tn -out $@
else else
perl -I../ ../xloc.pl -t -out $@ error
endif endif
values.bin: ../xloc.pl values.bin: ../xloc.py
perl -I../ ../xloc.pl -v -out $@ ../xloc.py -v -out $@
# a binary file, two bytes, one giving the size of tiles data and the # a binary file, two bytes, one giving the size of tiles data and the
# other the number of tiles in the dict. Tiles data is utf-8 and so # other the number of tiles in the dict. Tiles data is utf-8 and so
# number is not derivable from size. # number is not derivable from size.
$(XWLANG)_charcount.bin: table.bin ../xloc.pl $(XWLANG)_charcount.bin: table.bin ../xloc.py
SIZ=$$(ls -l $< | awk '{print $$5}'); \ SIZ=$$(ls -l $< | awk '{print $$5}'); \
perl -e "print pack(\"c\",$$SIZ)" > $@ perl -e "print pack(\"c\",$$SIZ)" > $@
TMP=/tmp/tmp$$$$; \ TMP=/tmp/tmp$$$$; \
perl -I../ ../xloc.pl -s -out $$TMP; \ ../xloc.py -s -out $$TMP; \
cat $$TMP >> $@; \ cat $$TMP >> $@; \
rm -f $$TMP rm -f $$TMP

View file

@ -1,76 +0,0 @@
#!/usr/bin/perl
# print stats about an input stream that's assumed to be a dictionary.
# Counts and percentages of each letter, as well as total numbers of
# words. This is not part of the dictionary build process. I use it
# for creating info.txt files for new languages and debugging the
# creation of dictionaries from new wordlists.
#
# Something like this might form the basis for choosing counts and
# values for tiles without using the conventions established by
# Scrabble players. This isn't enough, though: the frequency of
# letter tuples and triples -- how often letters appear together -- is
# a better indicator than just letter count.
use strict;
use warnings;

# Running tallies: number of words of each length, number of occurrences
# of each letter, and the overall word and letter totals.  The totals
# start at 0 so that empty input still prints sensible numbers.
my @wordSizeCounts;
my %letterCounts;
my $wordCount   = 0;
my $letterCount = 0;

my $enc = "utf8";    # this could be a cmdline arg....
if ( $enc ) {
    binmode( STDOUT, ":encoding($enc)" );
    binmode( STDIN,  ":encoding($enc)" );
}

# One word per input line.
while ( my $line = <> ) {
    chomp $line;
    ++$wordSizeCounts[ length $line ];
    ++$wordCount;

    # Splitting on "zero or more spaces" yields one element per
    # character while dropping embedded spaces.
    foreach my $letter ( split( / */, $line ) ) {
        my $ii = ord($letter);

        # special-case the bogus chars we add for "specials"
        die "$0: this is a letter?: $ii" if $ii >= 4 && $ii <= 32;
        ++$letterCounts{$letter};
        ++$letterCount;
    }
}

print "Number of words: $wordCount\n";
print "Number of letters: $letterCount\n\n";

print "**** word sizes ****\n";
print "SIZE COUNT PERCENT\n";
my $pctTotal  = 0.0;
my $wordTotal = 0;

# Walk every length actually seen rather than stopping at a fixed upper
# bound, so unusually long words still show up in the table and totals.
for my $i ( 1 .. $#wordSizeCounts ) {
    my $count = $wordSizeCounts[$i] || 0;
    next if $count == 0;
    $wordTotal += $count;
    my $pct = ( 100.00 * $count ) / $wordCount;
    $pctTotal += $pct;
    printf "%2d %6d %.2f\n", $i, $count, $pct;
}
printf "-------------------------------\n";
printf " %6d %.2f\n", $wordTotal, $pctTotal;

print "\n\n**** Letter counts ****\n";
print " ASCII ORD HEX PCT (of $letterCount)\n";
my $lineNo = 1;
foreach my $key ( sort keys %letterCounts ) {
    my $count = $letterCounts{$key};
    my $pct   = ( 100.00 * $count ) / $letterCount;
    printf( "%2d: %3s %3d %x %5.2f (%d)\n",
            $lineNo, $key, ord($key), ord($key), $pct, $count );
    ++$lineNo;
}
print "\n";

80
xwords4/dawg/dictstats.py Executable file
View file

@ -0,0 +1,80 @@
#!/usr/bin/env python3
import sys
"""
print stats about an input stream that's assumed to be a dictionary.
Counts and percentages of each letter, as well as total numbers of
words. This is not part of the dictionary build process. I use it
for creating info.txt files for new languages and debugging the
creation of dictionaries from new wordlists.
Something like this might form the basis for choosing counts and
values for tiles without using the conventions established by
Scrabble players. This isn't enough, though: the frequency of
letter tuples and triples -- how often letters appear together -- is
a better indicator than just letter count.
"""
def main():
    """Read a word list from stdin (one word per line) and print statistics.

    Prints the number of words and letters, a table of word lengths with
    their counts and percentages, and a table of letters with their code
    points, percentages and counts.

    Exits with an error message if a word contains a character whose code
    point is in the range 4..32; this is the same check the Perl version
    made for the bogus chars added for "specials".
    """
    from collections import Counter

    wordSizeCounts = Counter()
    letterCounts = Counter()
    wordCount = 0

    # Stream stdin line by line instead of loading the whole list at once.
    for line in sys.stdin:
        line = line.strip()
        wordSizeCounts[len(line)] += 1
        wordCount += 1
        for letter in line:
            ii = ord(letter)
            # perl did this: die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0;
            # An explicit exit is used rather than assert so the check
            # still runs when python is started with -O.
            if 4 <= ii <= 32:
                sys.exit('letter {} out of range'.format(ii))
            letterCounts[letter] += 1
    letterCount = sum(letterCounts.values())

    print( 'Number of words: {}'.format(wordCount))
    print( 'Number of letters: {}'.format(letterCount))
    print('')

    print( '**** word sizes ****' )
    print( 'SIZE COUNT PERCENT' )
    pctTotal = 0.0
    wordTotal = 0
    for ii in sorted(wordSizeCounts):
        count = wordSizeCounts[ii]
        wordTotal += count
        pct = (100.00 * count)/wordCount
        pctTotal += pct
        print( '{:2d} {:6d} {:02.2f}'.format(ii, count, pct))
    print( '-------------------------------' )
    print(' {:6d} {:.2f}'.format( wordTotal, pctTotal))
    print('')

    lineNo = 1
    pctTotal = 0.0
    print( '**** Letter counts ****' )
    print( ' ASCII ORD HEX PCT (of {})'.format(letterCount))
    for letter in sorted(letterCounts):
        count = letterCounts[letter]
        pct = (100.00 * count) / letterCount
        pctTotal += pct
        print( '{:2d}: {: >6s} {:2d} {:x} {:5.2f} ({:d})' \
               .format(lineNo, letter, ord(letter), ord(letter), pct, count ) )
        lineNo += 1
    print('percent total {:.2f}'.format( pctTotal))
    print('')

##############################################################################
if __name__ == '__main__':
    main()

View file

@ -1,47 +0,0 @@
#!/usr/bin/perl
# Copyright 2001 by Eric House (xwords@eehouse.org)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Given arguments consisting of triples, first a string and then pbitm
# files representing bitmaps. For each triple, print out the string and
# then the converted bitmaps.
use strict;
use warnings;

# Converter each bitmap file is fed through; it reads the bitmap on its
# stdin and its stdout is copied to ours.
my $CONVERTER = '../pbitm2bin.pl';

# Arguments come in (string, large bitmap, small bitmap) triples.
die "usage: $0 string largebmp smallbmp [string largebmp smallbmp ...]\n"
    if @ARGV % 3;

while ( @ARGV ) {
    my ( $str, $largebmp, $smallbmp ) = splice( @ARGV, 0, 3 );
    doOne( $str, $largebmp, $smallbmp );
}

# Emit one triple: a length byte, the string itself, then the converted
# large and small bitmaps, in that order.
sub doOne {
    my ( $str, $largebmp, $smallbmp ) = @_;

    print pack( "C", length($str) );
    print $str;

    print STDERR "looking at $largebmp", "\n";
    print convertBitmap($largebmp);
    print convertBitmap($smallbmp);
    return;
}

# Return the converter's output for the bitmap file at $path.  Dies if the
# file does not exist.  The converter is run through a forking open with
# the file attached to its stdin, so the file name never passes through a
# shell and may contain spaces or metacharacters.
sub convertBitmap {
    my ( $path ) = @_;

    die "file $path does not exist\n" unless -e $path;

    my $pid = open( my $fromChild, '-|' );
    die "cannot fork: $!\n" unless defined $pid;
    if ( $pid == 0 ) {
        # child: become the converter, reading the bitmap on stdin
        open( STDIN, '<', $path ) or die "cannot read $path: $!\n";
        exec { $CONVERTER } $CONVERTER
            or die "cannot exec $CONVERTER: $!\n";
    }

    binmode $fromChild;
    my $converted = do { local $/; <$fromChild> };
    close $fromChild or warn "$CONVERTER failed on $path\n";
    return defined $converted ? $converted : '';
}

View file

@ -1,76 +0,0 @@
#!/usr/bin/perl
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# test and wrapper file for xloc.pm
use strict;
use warnings;
use xloc;

# Command-line state.  $unicode stays -1 unless -tn (1) or -t (0) is
# given; either one selects the map-file output below.
my $unicode = -1;
my $doval   = 0;
my $dosize  = 0;
my $enc;
my $outfile;

# Consume every argument.  Looping on @ARGV rather than on the truth of
# the next argument means a false-looking argument such as "0" is
# reported as unknown instead of silently ending option parsing.
while ( @ARGV ) {
    my $arg = shift @ARGV;
    if ( $arg eq '-enc' ) {
        $enc = shift @ARGV;
    }
    elsif ( $arg eq "-tn" ) {
        $unicode = 1;
    }
    elsif ( $arg eq "-t" ) {
        $unicode = 0;
    }
    elsif ( $arg eq "-v" ) {
        $doval = 1;
    }
    elsif ( $arg eq "-s" ) {
        $dosize = 1;
    }
    elsif ( $arg eq '-out' ) {
        $outfile = shift @ARGV;
    }
    else {
        die "unknown arg $arg\n";
    }
}

die "no -out file given\n" unless defined $outfile && length $outfile;

my $infoFile = "info.txt";
die "info file $infoFile not found\n" if ! -s $infoFile;

my $xlocToken = xloc::ParseTileInfo( $infoFile, $enc );

# Three-arg open with a lexical handle; the encoding layer is only added
# when -enc was supplied.
my $mode = $enc ? ">:encoding($enc)" : '>';
open( my $out, $mode, $outfile )
    or die "couldn't open $outfile: $!";

# For f*cking windoze linefeeds
# binmode( $out );

# Exactly one output is produced; map file wins over size, size over values.
if ( $unicode != -1 ) {
    xloc::WriteMapFile( $xlocToken, $unicode, $out );
}
elsif ( $dosize ) {
    my $count = xloc::GetNTiles( $xlocToken );
    print {$out} pack( "c", $count );
}
elsif ( $doval ) {
    xloc::WriteValuesFile( $xlocToken, $out );
}

close $out or die "couldn't close $outfile: $!";

View file

@ -1,194 +0,0 @@
#!/usr/bin/perl
# Copyright 2002-2014 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# The idea here is that all that matters about a language is stored in
# one file (possibly excepting rules for prepping a dictionary).
# There's a list of tile faces, counts and values, and also some
# name-value pairs as needed. The pairs come first, and then a list
# of tiles.
package xloc;
use strict;
use warnings;
# force output in utf8
use open qw/:std :utf8/;
# Export setup, done in a BEGIN block so it runs at compile time.
BEGIN {
use Exporter ();
# Package globals that Exporter consults.
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
$VERSION = 1.00;
# Inherit import() from Exporter.
@ISA = qw(Exporter);
# Names exported by default.  GetNthTile and printLetters are not in this
# list, so they are only reachable as xloc::GetNthTile / xloc::printLetters.
@EXPORT = qw(&ParseTileInfo &GetNTiles &TileFace &TileValue
&TileCount &GetValue &WriteMapFile &WriteValuesFile);
# No export tags are defined.
%EXPORT_TAGS = ( );
}
# Parse the info file at $filePath and return what's meant to be an
# opaque object that can be passed back for queries.  It's a hash ref
# holding the file's name-value pairs plus a "_TILES" entry containing a
# list of [ first, second, face ] triples, one per tile line, in file
# order.
#
# $enc, when true, names the encoding layer used to read the file.
#
# Everything from '#' to end of line is a comment, and blank lines are
# skipped.  "name:value" lines before <BEGIN_TILES> are collected, with
# repeated names having their values concatenated.  Lines between
# <BEGIN_TILES> and <END_TILES> are tiles; parsing stops at <END_TILES>.
#
# Dies if the file cannot be opened or a tile line does not have three
# whitespace-separated fields.
sub ParseTileInfo($$) {
    my ( $filePath, $enc ) = @_;

    my $mode = $enc ? "<:encoding($enc)" : '<';
    open( my $input, $mode, $filePath )
        or die "couldn't open $filePath: $!";

    my %result;
    my @tiles;
    my $inTiles = 0;

    while ( my $line = <$input> ) {
        chomp $line;
        $line =~ s/\#.*$//;
        next if $line =~ /^\s*$/;    # nothing left but white space

        if ( $inTiles ) {
            last if $line =~ /<END_TILES>/;

            # Fail loudly rather than storing a tile of undefs that would
            # turn into garbage bytes in the generated files.
            my ( $count, $val, $face ) =
                $line =~ m/^\s*(\w+)\s+(\w+)\s+(.*)\s*$/
                or die "$filePath: malformed tile line: $line";
            push @tiles, [ $count, $val, $face ];
        }
        elsif ( $line =~ /\w:/ ) {
            my ( $nam, $val ) = split ':', $line, 2;
            $result{$nam} .= $val;
        }
        elsif ( $line =~ /<BEGIN_TILES>/ ) {
            $inTiles = 1;
        }
    }
    close $input;

    $result{"_TILES"} = [ @tiles ];
    return \%result;
}
# Number of tiles held in the "_TILES" list of the object returned by
# ParseTileInfo.
sub GetNTiles($) {
    my ( $info ) = @_;
    my $tiles = $info->{"_TILES"};
    return scalar @{$tiles};
}
# Look up one name-value pair in the parsed info object.  Returns
# whatever is stored under $key, which is undef when the name is absent.
sub GetValue($$) {
    my ( $info, $key ) = @_;
    return $info->{$key};
}
# Write the '|'-separated letters in $str to filehandle $fhr, joined by
# single spaces, one character at a time via pack "U".
sub printLetters($$) {
    my ( $str, $fhr ) = @_;

    my $spaced = join( " ", split( /\|/, $str ) );
    foreach my $chr ( split( //, $spaced ) ) {
        print {$fhr} pack( "U", ord($chr) );
    }
}
# Write one map-file entry per tile to filehandle $fhr, in tile order.
# The entry depends on how the tile's face field is written:
#   'x' or 'x|y|...'  the letters themselves, via printLetters
#   "..."             a single byte holding a running index of such
#                     faces, starting at 0
#   digits            the number as a 16-bit big-endian value
# Dies on a face in any other form.  $unicode is accepted but not used.
sub WriteMapFile($$$) {
    my ( $hashR, $unicode, $fhr ) = @_;

    my $specialCount = 0;
    my $lastIndex    = GetNTiles($hashR) - 1;

    foreach my $i ( 0 .. $lastIndex ) {
        my $tileR = GetNthTile( $hashR, $i );
        my $str   = $tileR->[2];

        if ( $str =~ /\'(.(\|.)*)\'/ ) {
            printLetters( $1, $fhr );
        }
        elsif ( $str =~ /\"(.+)\"/ ) {
            print {$fhr} pack( "c", $specialCount++ );
        }
        elsif ( $str =~ /(\d+)/ ) {
            print {$fhr} pack( "n", $1 );
        }
        else {
            die "WriteMapFile: unrecognized face format $str, elem $i";
        }
    }
} # WriteMapFile
# Write the values file to filehandle $fhr: the XLOC_HEADER value (parsed
# as hex) as a 16-bit big-endian number, then two bytes per tile, the
# result of TileValue followed by the result of TileCount.
# Dies if the info object has no XLOC_HEADER.  The header is also echoed
# to STDERR.
sub WriteValuesFile($$) {
    my ( $hashR, $fhr ) = @_;

    my $header = GetValue( $hashR, "XLOC_HEADER" );
    die "no XLOC_HEADER found" unless $header;
    print STDERR "header is $header\n";

    print {$fhr} pack( "n", hex($header) );

    foreach my $i ( 0 .. GetNTiles($hashR) - 1 ) {
        my $tileR = GetNthTile( $hashR, $i );
        print {$fhr} pack( "c", TileValue($tileR) );
        print {$fhr} pack( "c", TileCount($tileR) );
    }
} # WriteValuesFile
# Return the $n'th (0-based) tile, a reference to a three-element list,
# from the "_TILES" list of the parsed info object.
#
# Declared without a prototype: this sub is called earlier in the file
# than it is defined, so a prototype here is never applied to those calls
# and only produces a "called too early to check prototype" warning.
sub GetNthTile {
    my ( $hashR, $n ) = @_;
    my $listR = $hashR->{"_TILES"};
    return $listR->[$n];
}
# Return the face of a tile with its decoration stripped.  The face field
# (element 2 of the tile) may be written as:
#   'x' or 'x|y|...'  returns the text between the single quotes
#   "..."             returns the text between the double quotes
#   digits            returns the character with that code
# Dies on any other form.
sub TileFace($) {
    my ( $tileR ) = @_;
    my $str = $tileR->[2];

    return $1      if $str =~ /\'(.(\|.)*)\'/;
    return $1      if $str =~ /\"(.+)\"/;
    return chr($1) if $str =~ /(\d+)/;

    die "TileFace: unrecognized face format: $str";
}
# Return element 0 of a tile.
#
# NOTE(review): ParseTileInfo stores the field it calls $count at index 0
# and the one it calls $val at index 1, so "value" and "count" look
# swapped between the parser and these accessors.  Confirm which naming
# is right before relying on either.
#
# Declared without a prototype: this sub is called earlier in the file
# than it is defined, so a prototype here is never applied to those calls
# and only produces a "called too early to check prototype" warning.
sub TileValue {
    my ( $tileR ) = @_;
    return $tileR->[0];
}
# Return element 1 of a tile.
#
# NOTE(review): ParseTileInfo stores the field it calls $val at index 1
# and the one it calls $count at index 0, so "count" and "value" look
# swapped between the parser and these accessors.  Confirm which naming
# is right before relying on either.
#
# Declared without a prototype: this sub is called earlier in the file
# than it is defined, so a prototype here is never applied to those calls
# and only produces a "called too early to check prototype" warning.
sub TileCount {
    my ( $tileR ) = @_;
    return $tileR->[1];
}
1;

153
xwords4/dawg/xloc.py Executable file
View file

@ -0,0 +1,153 @@
#!/usr/bin/env python3
import argparse, os, re, struct, sys
def errorOut(msg):
    """Report a fatal error and terminate with exit status 1.

    The message is written to stderr, prefixed with 'ERROR: ', so it does
    not get mixed into anything captured from stdout.  Never returns.
    """
    print('ERROR: {}'.format(msg), file=sys.stderr)
    sys.exit(1)
def mkParser():
    """Build the command-line parser.

    Options: -enc ENCODING, -out OUTFILE, and the boolean switches -tn
    (table file), -v (values file) and -s (size file).  The Perl
    wrapper's -t switch has no equivalent here.
    """
    switches = (
        ('-tn', 'DO_TABLE', 'output table file'),
        ('-v', 'DO_VALS', 'output values file'),
        ('-s', 'DO_SIZE', 'output size file'),
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-enc', dest='ENCODING', type=str,
                        help='use this encoding')
    for flag, dest, text in switches:
        parser.add_argument(flag, dest=dest, action='store_true', help=text)
    parser.add_argument('-out', dest='OUTFILE', type=str,
                        help='outfile path')
    return parser
# Patterns for the pieces of an info file.  Raw strings keep the regex
# escapes (\w, \|) from being read as invalid string escapes.
sPreComment = re.compile(r'^([^#]*)#.*$')      # group 1: text before the first '#'
sVarAssign = re.compile(r'^(\w+):(.*)$')       # name:value
sBeginTiles = re.compile(r'^<BEGIN_TILES>$')
sEndTiles = re.compile(r'^<END_TILES>$')
# 'x' or 'x|y|...'; the (\|.)* allows a face with a single letter
sSingleCharMatch = re.compile(r"'(.(\|.)*)'")
sSpecialsMatch = re.compile(r'{"(.+)"}')

def parseTileInfo(infoFile, encoding):
    """Parse an info file into a dict.

    infoFile -- path of the file to read
    encoding -- encoding used to read it; None means the platform default

    Returns a dict holding the file's name:value pairs (values for a
    repeated name are concatenated) plus a '_TILES' entry: a list of
    3-tuples of strings, one per line between <BEGIN_TILES> and
    <END_TILES>, in file order.  Everything from the first '#' on a line
    is a comment; lines that are empty or all white space are skipped.
    Parsing stops at <END_TILES>.

    Raises ValueError if a tile line has fewer than three fields.
    """
    result = {'_TILES' : []}
    with open(infoFile, 'rt', encoding = encoding) as file:
        lines = file.read().split('\n')

    inTiles = False
    for line in lines:
        match = sPreComment.match(line)
        if match:
            line = match.group(1)
        # Test the stripped text so indented comments and all-white-space
        # lines are skipped and stray padding cannot hide the markers.
        stripped = line.strip()
        if not stripped:
            continue

        if inTiles:
            if sEndTiles.match(stripped):
                break
            # Two splits give three fields; the face is the rest of the line.
            fields = stripped.split(None, 2)
            if len(fields) != 3:
                raise ValueError('malformed tile line: {!r}'.format(line))
            result['_TILES'].append(tuple(fields))
        elif sBeginTiles.match(stripped):
            inTiles = True
        else:
            # Matched against the unstripped text so a value keeps any
            # trailing white space when repeated names are concatenated.
            match = sVarAssign.match(line)
            if match:
                var = match.group(1)
                result[var] = result.get(var, '') + match.group(2)
    return result
class XLOC:
    """Placeholder type with no state or behavior of its own."""
    pass


def readXLOC():
    """Return a new XLOC instance."""
    return XLOC()
# sub WriteMapFile($$$) {
# my ( $hashR, $unicode, $fhr ) = @_;
# my $count = GetNTiles($hashR);
# my $specialCount = 0;
# for ( my $i = 0; $i < $count; ++$i ) {
# my $tileR = GetNthTile( $hashR, $i );
# my $str = ${$tileR}[2];
# if ( $str =~ /\'(.(\|.)*)\'/ ) {
# printLetters( $1, $fhr );
# } elsif ( $str =~ /\"(.+)\"/ ) {
# print $fhr pack( "c", $specialCount++ );
# } elsif ( $str =~ /(\d+)/ ) {
# print $fhr pack( "n", $1 );
# } else {
# die "WriteMapFile: unrecognized face format $str, elem $i";
# }
# }
# } # WriteMapFile
def printLetters( letters, outfile ):
    """Write the '|'-separated letters to outfile as UTF-8 bytes, with
    each '|' replaced by a single space."""
    spaced = letters.replace('|', ' ')
    outfile.write(spaced.encode('utf8'))
def writeMapFile(xlocToken, outfile):
    """Write one map entry per tile to the binary file object outfile.

    xlocToken -- dict from parseTileInfo; only its '_TILES' list is read
    outfile   -- file object opened for binary writing

    A face matching sSingleCharMatch is written as text via printLetters.
    A face matching sSpecialsMatch is written as one unsigned byte holding
    a running index of such faces, starting at 0.

    Raises ValueError for a face matching neither pattern.  This is an
    explicit raise rather than an assert so bad input is still caught when
    python runs with -O.
    """
    print('writeMapFile()')
    specialCount = 0
    for tile in xlocToken['_TILES']:
        face = tile[2]

        match = sSingleCharMatch.match(face)
        if match:
            print('single char: {}'.format(match.group(1)))
            printLetters( match.group(1), outfile )
            continue

        match = sSpecialsMatch.match(face)
        if match:
            print('specials char: {}'.format(match.group(1)))
            outfile.write(struct.pack('B', specialCount ))
            specialCount += 1
            continue

        raise ValueError('bad/unmatched face: {}'.format(face))
def writeValuesFile(xlocToken, outfile):
    """Write the values file to the binary file object outfile.

    Output is the XLOC_HEADER value (parsed as hex) as a 16-bit big-endian
    number, followed by two unsigned bytes per tile: the tile's first
    field, then its second.  Calls errorOut if XLOC_HEADER is missing.

    NOTE(review): parseTileInfo unpacks these same two fields as
    (count, val), so which one is the value and which the count should be
    confirmed; the byte order written here is unchanged either way.
    """
    header = xlocToken.get('XLOC_HEADER')
    if not header:
        header = errorOut('no XLOC_HEADER found')
    print('writing header: {}'.format(header))
    outfile.write(struct.pack('!H', int(header, 16)))

    for tile in xlocToken['_TILES']:
        first, second = int(tile[0]), int(tile[1])
        outfile.write(struct.pack('BB', first, second))
def main():
    """Read info.txt from the current directory and write one output file.

    The file named by -out receives, in order of precedence: the map file
    (-tn), a single byte holding the number of tiles (-s), or the values
    file (-v).  With none of those switches the output file is created
    empty.

    Bad arguments are reported through the parser (usage message, exit
    status 2) before the output file is opened, so an invalid command
    line no longer truncates an existing file.  A missing info.txt is
    reported through errorOut.
    """
    print('{}.main {} called'.format(sys.argv[0], sys.argv[1:]))
    parser = mkParser()
    args = parser.parse_args()
    if not args.OUTFILE:
        parser.error('-out is required')
    # -tn takes precedence over everything; without it, -s and -v conflict.
    if args.DO_SIZE and args.DO_VALS and not args.DO_TABLE:
        parser.error('-s and -v cannot be combined')

    infoFile = 'info.txt'
    if not os.path.exists(infoFile):
        errorOut('{} not found'.format(infoFile))
    xlocToken = parseTileInfo(infoFile, args.ENCODING)

    with open(args.OUTFILE, 'wb') as outfile:
        if args.DO_TABLE:
            writeMapFile(xlocToken, outfile)
        elif args.DO_SIZE:
            count = len(xlocToken['_TILES'])
            outfile.write(struct.pack('!B', count))
        elif args.DO_VALS:
            writeValuesFile( xlocToken, outfile )

##############################################################################
if __name__ == '__main__':
    main()