rewrite some dawg perl scripts in python

2025-01-23 07:27:22 +01:00 · 2020-06-30 09:48:08 -07:00 · 2020-06-30 09:48:08 -07:00 · 98d134b491
commit 98d134b491
parent 6f049c0b27
8 changed files with 241 additions and 401 deletions
--- a/xwords4/dawg/Makefile
+++ b/xwords4/dawg/Makefile
@ -23,7 +23,7 @@ byodfiles.tgz: byodfiles.tar
 byodfiles.tar: dict2dawg
 	rm -f $@ langinfo
-	tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.pl ./xloc.pm
+	tar cvf $@ ./dict2dawg ./dict2dawg.cpp ./par.pl ./xloc.py
 	for dir in $$(ls .); do \
 		if [ $$dir = "Hëx" ]; then \
 			:; \
--- a/xwords4/dawg/Makefile.langcommon
+++ b/xwords4/dawg/Makefile.langcommon
@ -269,24 +269,24 @@ allbins:
 	$(MAKE) TARGET_TYPE=FRANK byodbins
 	rm palmspecials.bin
-table.bin:  ../xloc.pl 
+table.bin:  ../xloc.py
 ifdef NEWDAWG
-	perl -I../ ../xloc.pl $(ENCP) -tn -out $@
+	../xloc.py $(ENCP) -tn -out $@
 else
-	perl -I../ ../xloc.pl -t -out $@
+	error
 endif
-values.bin: ../xloc.pl
+values.bin: ../xloc.py
-	perl -I../ ../xloc.pl -v -out $@
+	../xloc.py -v -out $@
 # a binary file, two bytes, one giving the size of tiles data and the
 #  other the number of tiles in the dict.  Tiles data is utf-8 and so
 #  number is not derivable from size.
-$(XWLANG)_charcount.bin: table.bin ../xloc.pl
+$(XWLANG)_charcount.bin: table.bin ../xloc.py
 	SIZ=$$(ls -l $< | awk '{print $$5}'); \
 	perl -e "print pack(\"c\",$$SIZ)" > $@
 	TMP=/tmp/tmp$$$$; \
-	perl -I../ ../xloc.pl -s -out $$TMP; \
+	../xloc.py -s -out $$TMP; \
 	cat $$TMP >> $@; \
 	rm -f $$TMP
--- a/xwords4/dawg/dictstats.pl
+++ b/xwords4/dawg/dictstats.pl
@ -1,76 +0,0 @@
 #!/usr/bin/perl
 # print stats about an input stream that's assumed to be a dictionary.
 # Counts and percentages of each letter, as well as total numbers of
 # words.  This is not part of the dictionary build process.  I use it
 # for creating info.txt files for new languages and debugging the
 # creation of dictionaries from new wordlists.
 #
 # Something like this might form the basis for choosing counts and
 # values for tiles without using the conventions established by
 # Scrabble players.  This isn't enough, though: the frequency of
 # letter tuples and triples -- how often letters appear together -- is
 # a better indicator than just letter count.
 use strict;
 # Tallies gathered while reading the input: number of words of each length,
 # number of occurrences of each character, and the two grand totals.
 my @wordSizeCounts;
 my %letterCounts;
 my $wordCount;
 my $letterCount;
 my $enc = "utf8";               # this could be a cmdline arg....
 # Decode input and encode output using $enc so that length() and ord()
 # below work on characters rather than bytes.
 if ( $enc ) {
    binmode( STDOUT, ":encoding($enc)" ) ;
    binmode( STDIN, ":encoding($enc)" ) ;
 }
 # One word per input line (files named on the command line, else STDIN).
 while (<>) {
    chomp;
    ++$wordSizeCounts[length];
    ++$wordCount;
    # The pattern / */ can match the empty string, so split yields the
    # individual characters of the word and drops any spaces.
    foreach my $letter (split( / */ ) ) {
        my $ii = ord($letter);
        # special-case the bogus chars we add for "specials"
        # i.e. code points 0..3 are accepted; 4..32 abort the run.
        die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0; 
        ++$letterCounts{$letter};
        ++$letterCount;
    }
 }
 print "Number of words: $wordCount\n";
 print "Number of letters: $letterCount\n\n";
 print "**** word sizes ****\n";
 print "SIZE  COUNT   PERCENT\n";
 my $pctTotal = 0.0;
 my $wordTotal = 0;
 # Only lengths 1..99 are reported; words of any other length are counted in
 # $wordCount but appear in neither this table nor $wordTotal.
 for ( my $i = 1 ; $i <= 99; ++$i ) {
    my $count = $wordSizeCounts[$i];
    $wordTotal += $count;
    if ( $count > 0 ) {
        my $pct = (100.00 * $count)/$wordCount;
        $pctTotal += $pct;
        printf "%2d   %6d    %.2f\n", $i, $count, $pct;
    }
 } 
 printf "-------------------------------\n";
 printf "     %6d  %.2f\n", $wordTotal, $pctTotal;
 print "\n\n**** Letter counts ****\n";
 print "     ASCII ORD  HEX     PCT (of $letterCount)\n";
 # One row per distinct character, in sort's default (string) order, showing
 # the character, its code point in decimal and hex, and its share of all
 # characters.
 my $lineNo = 1;
 foreach my $key (sort keys %letterCounts) {
    my $count = $letterCounts{$key};
    my $pct = (100.00 * $count) / $letterCount;
    printf( "%2d: %3s   %3d  %x    %5.2f (%d)\n",
            $lineNo, $key, ord($key), ord($key), $pct, $count );
    ++$lineNo;
 }
 print "\n";
--- a/xwords4/dawg/dictstats.py
+++ b/xwords4/dawg/dictstats.py
@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 import sys
 """
 print stats about an input stream that's assumed to be a dictionary.
 Counts and percentages of each letter, as well as total numbers of
 words.  This is not part of the dictionary build process.  I use it
 for creating info.txt files for new languages and debugging the
 creation of dictionaries from new wordlists.
 Something like this might form the basis for choosing counts and
 values for tiles without using the conventions established by
 Scrabble players.  This isn't enough, though: the frequency of
 letter tuples and triples -- how often letters appear together -- is
 a better indicator than just letter count.
 """
 def main():
    """Print word-length and per-letter statistics for a word list on stdin.

    Every input line is treated as one word after surrounding whitespace has
    been stripped.  Two tables are written to stdout: the number (and
    percentage) of words of each length, and the number (and percentage) of
    occurrences of each distinct character.

    Raises:
        AssertionError: if a word contains a character whose code point is
            in the range 4..32.  The check is an explicit raise so that it
            is still enforced when python runs with -O, which strips
            ``assert`` statements.
    """
    wordSizeCounts = {}
    letterCounts = {}
    wordCount = 0
    letterCount = 0

    # Iterate over stdin lazily; readlines() would hold the entire word list
    # in memory before the first word is counted.
    for line in sys.stdin:
        line = line.strip()

        length = len(line)
        wordSizeCounts[length] = wordSizeCounts.get(length, 0) + 1
        wordCount += 1

        for letter in line:
            ii = ord(letter)
            # perl did this:  die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0;
            # Code points 0..3 are the placeholder chars used for "specials".
            if 4 <= ii <= 32:
                raise AssertionError('letter {} out of range'.format(ii))
            letterCounts[letter] = letterCounts.get(letter, 0) + 1
            letterCount += 1

    print( 'Number of words: {}'.format(wordCount))
    print( 'Number of letters: {}'.format(letterCount))
    print('')

    print( '**** word sizes ****' )
    print( 'SIZE  COUNT   PERCENT' )
    pctTotal = 0.0
    wordTotal = 0
    for ii in sorted(wordSizeCounts):
        count = wordSizeCounts[ii]
        wordTotal += count
        pct = (100.00 * count)/wordCount
        pctTotal += pct
        print( '{:2d}   {:6d}    {:02.2f}'.format(ii, count, pct))

    print( '-------------------------------' )
    print('     {:6d}  {:.2f}'.format( wordTotal, pctTotal))
    print('')

    lineNo = 1
    pctTotal = 0.0
    print( '**** Letter counts ****' )
    print( '     ASCII  ORD  HEX     PCT (of {})'.format(letterCount))
    for letter in sorted(letterCounts):
        count = letterCounts[letter]
        pct = (100.00 * count) / letterCount
        pctTotal += pct
        print( '{:2d}: {: >6s}   {:2d}   {:x}    {:5.2f} ({:d})' \
               .format(lineNo, letter, ord(letter), ord(letter), pct, count ) )
        lineNo += 1

    # Sanity line: the per-letter percentages should add up to 100.
    print('percent total {:.2f}'.format( pctTotal))
    print('')
 ##############################################################################
 # Produce the report only when run as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/xwords4/dawg/frank_mkspecials.pl
+++ b/xwords4/dawg/frank_mkspecials.pl
@ -1,47 +0,0 @@
 #!/usr/bin/perl
 # Copyright 2001 by Eric House (xwords@eehouse.org)
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 # Given arguments consisting of triples, first a string and then pbitm
 # files representing bitmaps.  For each triple, print out the string and
 # then the converted bitmaps.
 use strict;
 # Consume the command line three arguments at a time -- a string followed by
 # the paths of a large and a small bitmap -- and emit one record per triple.
 while ( @ARGV ) {
  my $str = shift();
  my $largebmp = shift();
  my $smallbmp = shift();
  doOne( $str, $largebmp, $smallbmp );
 }
 # Write one record to STDOUT: a single byte holding length($str), then $str
 # itself, then the output of ../pbitm2bin.pl for the large bitmap and for
 # the small bitmap.  Dies if either bitmap file does not exist.
 sub doOne {
  my ( $str, $largebmp, $smallbmp ) = @_;
  print pack( "C", length($str) );
  print $str;
  print STDERR "looking at $largebmp", "\n";
  die "file $largebmp does not exist\n" unless -e $largebmp;
  # NOTE(review): the file name is interpolated into a shell command line, so
  # names containing shell metacharacters would be interpreted by the shell.
  print `cat $largebmp | ../pbitm2bin.pl`;
  die "file $smallbmp does not exist\n" unless -e $smallbmp;
  print `cat $smallbmp | ../pbitm2bin.pl`;
 }
--- a/xwords4/dawg/xloc.pl
+++ b/xwords4/dawg/xloc.pl
@ -1,76 +0,0 @@
 #!/usr/bin/perl
 # Copyright 2002 by Eric House (xwords@eehouse.org).  All rights reserved.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 # test and wrapper file for xloc.pm
 use strict;
 use xloc;
 # Command-line state.  $unicode stays -1 unless -tn (1) or -t (0) selects
 # map-file output; $doval and $dosize select the other two output kinds.
 my $unicode = -1;
 my $doval = 0;
 my $dosize = 0;
 my $enc;
 my $outfile;
 my $arg;
 # Hand-rolled option parsing: -enc and -out take a value (consumed by the
 # extra shift), the remaining options are plain flags, and anything else is
 # fatal.
 while ( $arg = $ARGV[0] ) {
    if ( $arg eq '-enc' ) {
        $enc = $ARGV[1];
        shift @ARGV;
    } elsif ( $arg eq "-tn" ) {
        $unicode = 1;
    } elsif ( $arg eq "-t" ) {
        $unicode = 0;
    } elsif ( $arg eq "-v" ) {
        $doval = 1;
    } elsif ( $arg eq "-s" ) {
        $dosize = 1;
    } elsif ( $arg eq '-out' ) {
        $outfile = $ARGV[1];
        shift @ARGV;
    } else {
        die "unknown arg $arg\n";
    }
    shift @ARGV;
 }
 # The tile description is always read from info.txt in the current
 # directory; -s is true only for a file that exists and is non-empty.
 my $infoFile = "info.txt";
 die "info file $infoFile not found\n" if ! -s $infoFile;
 my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);
 # Open the output with the requested encoding layer, if one was given.
 if ( $enc ) {
    open OUTFILE, ">:encoding($enc)", "$outfile" 
        or die "couldn't open $outfile";
 } else {
    open OUTFILE, ">$outfile" or die "couldn't open $outfile";
 }
 # For f*cking windoze linefeeds
 # binmode( OUTFILE );
 # Exactly one kind of output is written, chosen in this order: map file,
 # tile count, values file.  With no selecting option the file is left empty.
 # ("ne" is a string comparison; it works here because -1 stringifies to "-1".)
 if ( $unicode ne -1 ) {
    xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
 } elsif ( $dosize ) {
    my $count = xloc::GetNTiles( $xlocToken );
    print OUTFILE pack("c", $count );
 } elsif ( $doval ) {
    xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
 }
 close OUTFILE;
--- a/xwords4/dawg/xloc.pm
+++ b/xwords4/dawg/xloc.pm
@ -1,194 +0,0 @@
 #!/usr/bin/perl
 # Copyright 2002-2014 by Eric House (xwords@eehouse.org).  All rights
 # reserved.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 # The idea here is that all that matters about a language is stored in
 # one file (possibly excepting rules for prepping a dictionary).
 # There's a list of tile faces, counts and values, and also some
 # name-value pairs as needed.  The pairs come first, and then a list
 # of tiles.
 package xloc;
 use strict;
 use warnings;
 # force output in utf8
 use open qw/:std :utf8/;
 # Exporter setup.  The subs listed in @EXPORT are imported by "use xloc";
 # printLetters and GetNthTile are not in the list and so are not exported.
 BEGIN {
    use Exporter   ();
    our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
    $VERSION     = 1.00;
    @ISA         = qw(Exporter);
    @EXPORT      = qw(&ParseTileInfo &GetNTiles &TileFace &TileValue
 		      &TileCount &GetValue &WriteMapFile &WriteValuesFile);
    %EXPORT_TAGS = ( );
 }
 # Returns what's meant to be an opaque object that can be passed back
 # for queries.  It's a reference to a hash with name-value pairs and a
 # _TILES entry containing a list of tile info lists.
 #
 # Arguments: the path of the info file, and an optional encoding name used
 # to decode it.  Dies if the file cannot be opened.
 sub ParseTileInfo($$) {
    my ( $filePath, $enc ) = @_;
    my %result;
    if ( $enc ) {
        open( INPUT, "<:encoding($enc)", "$filePath" ) 
            or die "couldn't open $filePath";
    } else {
        open( INPUT, "<$filePath" ) 
            or die "couldn't open $filePath";
    }
    my $inTiles = 0;
    my @tiles;
    while ( <INPUT> ) {
        chomp;
        s/\#.*$//;                  # strip from the first '#' to end of line
        s/^\s*$//;                  # nuke all-white-space lines
        next if !length;
        if ( $inTiles ) {
            if ( /<END_TILES>/ ) {
                last;
            } else {
                # A tile line has three whitespace-separated fields; the
                # third takes the rest of the line.  They are stored in file
                # order as [ first, second, third ].
                my ( $count, $val, $face ) = m/^\s*(\w+)\s+(\w+)\s+(.*)\s*$/;
                push @tiles, [ $count, $val, $face ];
            }
        } elsif ( /\w:/ ) {
            # name:value pair; a repeated name has its values concatenated.
            my ( $nam, $val ) = split ':', $_, 2;
            $result{$nam} .= $val;
        } elsif ( /<BEGIN_TILES>/ ) {
            $inTiles = 1;
        }
    }
    close INPUT;
    $result{"_TILES"} = [ @tiles ];
    return \%result;
 }
 # Returns the number of tile records stored under _TILES in the hash
 # returned by ParseTileInfo.
 sub GetNTiles($) {
    my ( $hashR ) = @_;
    my $listR = ${$hashR}{"_TILES"};
    return 0 + @{$listR};           # array in numeric context: its length
 }
 # Returns the value stored under $name in the parsed-info hash, or undef if
 # there is no such entry.
 sub GetValue($$) {
    my ( $hashR, $name ) = @_;
    return ${$hashR}{$name};
 }
 # Writes the alternatives in $str (separated by '|') to the filehandle $fhr,
 # with a single space between alternatives in place of each '|'.
 sub printLetters($$) {
    my ( $str, $fhr ) = @_;
    my @letters = split( /\|/, $str );
    $str = join( " ", @letters );
    # Emit one character at a time, each packed by its code point ("U").
    for ( my $key = 0; $key < length($str); ++$key ) {
        my $chr = substr( $str, $key, 1 );
        print $fhr pack( "U", ord($chr) );
    }
 }
 # Writes one entry per tile to the filehandle $fhr, chosen by the form of
 # the tile's face field (element 2 of the tile record):
 #   'x' or 'x|y|...'  the letters, via printLetters
 #   "text"            one byte holding a running index of such "specials"
 #   digits            the number as a 16-bit big-endian value
 # Dies on any other form.  $unicode is accepted but not used in this body.
 sub WriteMapFile($$$) {
    my ( $hashR, $unicode, $fhr ) = @_;
    my $count = GetNTiles($hashR);
    my $specialCount = 0;
    for ( my $i = 0; $i < $count; ++$i ) {
        my $tileR = GetNthTile( $hashR, $i );
        my $str = ${$tileR}[2];
        if ( $str =~ /\'(.(\|.)*)\'/ ) {
            printLetters( $1, $fhr );
        } elsif ( $str =~ /\"(.+)\"/ ) {
            print $fhr pack( "c", $specialCount++ );
        } elsif ( $str =~ /(\d+)/ ) {
            print $fhr pack( "n", $1 );
        } else {
            die "WriteMapFile: unrecognized face format $str, elem $i";
        }
    }
 } # WriteMapFile
 # Writes the values file to the filehandle $fhr: the XLOC_HEADER value
 # (parsed as hex) as a 16-bit big-endian number, followed by two bytes per
 # tile -- TileValue() then TileCount().  Dies if XLOC_HEADER is missing or
 # empty.  The header is also echoed to STDERR.
 sub WriteValuesFile($$) {
    my ( $hashR, $fhr ) = @_;
    my $header = GetValue( $hashR,"XLOC_HEADER" );
    die "no XLOC_HEADER found" if ! $header;
    print STDERR "header is $header\n";
    print $fhr pack( "n", hex($header) );
    my $count = GetNTiles($hashR);
    for ( my $i = 0; $i < $count; ++$i ) {
        my $tileR = GetNthTile( $hashR, $i );
        print $fhr pack( "c", TileValue($tileR) );
        print $fhr pack( "c", TileCount($tileR) );
    }
 } # WriteValuesFile
 # Returns a reference to the $n-th (0-based) tile record in _TILES; each
 # record is the three-element list built by ParseTileInfo.
 sub GetNthTile($$) {
    my ( $hashR, $n ) = @_;
    my $listR = ${$hashR}{"_TILES"};
    return ${$listR}[$n];
 }
 # Returns the face of a tile record with its delimiters removed: the text
 # inside '...' or "...", or, for a bare number, the character with that
 # code.  Dies if the face field (element 2) has none of these forms.
 sub TileFace($) {
    my ( $tileR ) = @_;
    my $result;
    my $str = ${$tileR}[2];
    if ( $str =~ /\'(.(\|.)*)\'/ ) {
        $result = $1;
    } elsif ( $str =~ /\"(.+)\"/ ) {
        $result = $1;
    } elsif ( $str =~ /(\d+)/ ) {
        $result = chr($1);
    } else {
        die "TileFace: unrecognized face format: $str";
    }
    return $result;
 }
 # Returns element 0 of the tile record, i.e. the first column of the tile
 # line.  NOTE(review): ParseTileInfo calls that column $count, not $val --
 # confirm the intended column order against an info.txt.
 sub TileValue($) {
    my ( $tileR ) = @_;
    return ${$tileR}[0];
 }
 # Returns element 1 of the tile record, i.e. the second column of the tile
 # line.  NOTE(review): ParseTileInfo calls that column $val, not $count --
 # confirm the intended column order against an info.txt.
 sub TileCount($) {
    my ( $tileR ) = @_;
    return ${$tileR}[1];
 }
 1;
--- a/xwords4/dawg/xloc.py
+++ b/xwords4/dawg/xloc.py
@ -0,0 +1,153 @@
 #!/usr/bin/env python3
 import argparse, os, re, struct, sys
 def errorOut(msg):
    """Report a fatal error and terminate the process with exit status 1.

    The message is written to stderr, prefixed with 'ERROR: ', so that it is
    kept apart from the progress output the script prints on stdout.

    Args:
        msg: text describing what went wrong.

    Raises:
        SystemExit: always, with code 1.
    """
    print('ERROR: {}'.format(msg), file = sys.stderr)
    sys.exit(1)
 def mkParser():
    """Build the argparse parser for the xloc command line.

    Options (all optional as far as argparse is concerned):
        -enc NAME   stored as ENCODING
        -tn         sets DO_TABLE
        -v          sets DO_VALS
        -s          sets DO_SIZE
        -out PATH   stored as OUTFILE

    The perl script's '-t' option is not offered here.
    """
    # (option string, destination attribute, remaining add_argument keywords),
    # listed in the order they should appear in the help text.
    optionSpecs = (
        ('-enc', 'ENCODING', {'type': str, 'help': 'use this encoding'}),
        ('-tn', 'DO_TABLE', {'action': 'store_true', 'help': 'output table file'}),
        ('-v', 'DO_VALS', {'action': 'store_true', 'help': 'output values file'}),
        ('-s', 'DO_SIZE', {'action': 'store_true', 'help': 'output size file'}),
        ('-out', 'OUTFILE', {'type': str, 'help': 'outfile path'}),
    )
    argParser = argparse.ArgumentParser()
    for optString, destName, extra in optionSpecs:
        argParser.add_argument(optString, dest = destName, **extra)
    return argParser
 # Patterns used to read info.txt.  All are raw strings: sequences such as \w
 # and \| are not valid string escapes and draw a warning in a plain literal.

 # Everything before the FIRST '#' on a line (non-greedy, as perl's
 # s/\#.*$// did); no match at all when the line has no '#'.
 sPreComment = re.compile(r'^(.*?)#.*$')
 # NAME:value assignment; group 1 is the name, group 2 the rest of the line.
 sVarAssign = re.compile(r'^(\w+):(.*)$')
 # Markers that open and close the tile list.
 sBeginTiles = re.compile(r'^<BEGIN_TILES>$')
 sEndTiles = re.compile(r'^<END_TILES>$')
 # Quoted face: 'A' or alternatives such as 'A|a'.  The repeat is '*' rather
 # than '+' so a face with a single letter is accepted, as in the perl
 # pattern /\'(.(\|.)*)\'/.
 sSingleCharMatch = re.compile(r"'(.(\|.)*)'")
 # "Special" face written as {"text"}; group 1 is the text.
 sSpecialsMatch = re.compile(r'{"(.+)"}')
 def parseTileInfo(infoFile, encoding):
    """Parse a language info file into a dict.

    The file holds NAME:value lines followed by a tile list enclosed between
    <BEGIN_TILES> and <END_TILES>.  Text after a '#' is a comment, and blank
    or whitespace-only lines are skipped.  Parsing stops at <END_TILES>.

    Args:
        infoFile: path of the file to read.
        encoding: name of the codec used to decode the file, or None to use
            the platform default.

    Returns:
        A dict mapping each NAME to its value (values of a repeated NAME are
        concatenated), plus the key '_TILES' holding one tuple of three
        strings per tile line, in file order: the first two
        whitespace-separated fields and then the remainder of the line.

    Raises:
        ValueError: if a line inside the tile list has fewer than three
            fields.
    """
    result = {'_TILES' : []}
    # Honor the caller's encoding, as the perl version did; None leaves
    # open() on its default.
    with open(infoFile, 'rt', encoding = encoding) as file:
        data = file.read()

    inTiles = False
    for line in data.split('\n'):
        match = sPreComment.match(line)
        if match:
            line = match.group(1)

        # What is left once a comment is removed may be only blanks; perl
        # dropped such lines too.
        stripped = line.strip()
        if not stripped: continue

        if inTiles:
            if sEndTiles.match(stripped):
                break
            # At most two splits: the third field is the rest of the line
            # and may itself contain whitespace.
            fields = stripped.split(None, 2)
            if len(fields) != 3:
                raise ValueError('malformed tile line: {!r}'.format(line))
            (count, val, face) = fields
            result['_TILES'].append((count, val, face))
        elif sBeginTiles.match(stripped):
            inTiles = True
        else:
            # Matched against the unstripped line so that stored values are
            # exactly the text following the colon.
            match = sVarAssign.match(line)
            if match:
                var = match.group(1)
                result[var] = result.get(var, '') + match.group(2)

    return result
 class XLOC:
    """Empty placeholder type; it declares no attributes or methods."""
    pass
 def readXLOC():
    """Create and return a new XLOC instance."""
    token = XLOC()
    return token
 # sub WriteMapFile($$$) {
 #     my ( $hashR, $unicode, $fhr ) = @_;
 #     my $count = GetNTiles($hashR);
 #     my $specialCount = 0;
 #     for ( my $i = 0; $i < $count; ++$i ) {
 #         my $tileR = GetNthTile( $hashR, $i );
 #         my $str = ${$tileR}[2];
 #         if ( $str =~ /\'(.(\|.)*)\'/ ) {
 #             printLetters( $1, $fhr );
 #         } elsif ( $str =~ /\"(.+)\"/ ) {
 #             print $fhr pack( "c", $specialCount++ );
 #         } elsif ( $str =~ /(\d+)/ ) {
 #             print $fhr pack( "n", $1 );
 #         } else {
 #             die "WriteMapFile: unrecognized face format $str, elem $i";
 #         }
 #     }
 # } # WriteMapFile
 def printLetters( letters, outfile ):
    """Write the '|'-separated alternatives in letters to outfile.

    Each '|' is replaced by a single space and the result is written as
    UTF-8 bytes, so outfile must accept bytes.
    """
    spaced = letters.replace('|', ' ')
    outfile.write(spaced.encode('utf8'))
 def writeMapFile(xlocToken, outfile):
    """Write the tile map (table) file.

    For each tile in xlocToken['_TILES'], in order, the face string (element
    2 of the tile) decides what is written to outfile:
      * a quoted letter face has its letters written by printLetters();
      * a {"..."} special is written as one byte holding a running index,
        starting at 0, of the specials seen so far.

    Progress messages are printed to stdout.

    Args:
        xlocToken: dict with a '_TILES' list, as built by parseTileInfo().
        outfile: binary file object to write to.

    Raises:
        AssertionError: if a face matches neither form.
    """
    print('writeMapFile()')
    specialCount = 0
    for tile in xlocToken['_TILES']:
        face = tile[2]

        match = sSingleCharMatch.match(face)
        if match:
            print('single char: {}'.format(match.group(1)))
            printLetters( match.group(1), outfile )
            continue

        match = sSpecialsMatch.match(face)
        if match:
            print('specials char: {}'.format(match.group(1)))
            outfile.write(struct.pack('B', specialCount ))
            specialCount += 1
            continue

        print('bad/unmatched face: {}'.format(face))
        # Raised explicitly: a bare `assert False` is compiled away under
        # python -O, and the loop would then carry on and emit a table with
        # this tile silently missing.
        raise AssertionError('bad/unmatched face: {}'.format(face))
 def writeValuesFile(xlocToken, outfile):
    """Write the values file.

    Output is the XLOC_HEADER value (parsed as hexadecimal) as an unsigned
    16-bit big-endian number, followed by two unsigned bytes per tile: the
    integer values of the tile's first and second fields, in that order.
    The header is also echoed to stdout.

    Calls errorOut() when xlocToken has no non-empty 'XLOC_HEADER' entry.
    """
    header = xlocToken.get('XLOC_HEADER')
    if not header:
        header = errorOut('no XLOC_HEADER found')
    print('writing header: {}'.format(header))
    headerBytes = struct.pack('!H', int(header, 16))
    outfile.write(headerBytes)

    for tile in xlocToken['_TILES']:
        firstField, secondField = tile[0], tile[1]
        pair = struct.pack('BB', int(firstField), int(secondField))
        outfile.write(pair)
 def main():
    """Command-line entry point: generate one binary file from ./info.txt.

    Exactly one kind of output is written to the -out path:
      -tn  the tile map file (takes precedence if combined with -s or -v)
      -s   a single byte holding the number of tiles
      -v   the values file

    All argument problems are reported through errorOut() BEFORE the output
    file is opened, so a bad invocation never leaves a truncated or empty
    file behind for a later build step to pick up.
    """
    print('{}.main {} called'.format(sys.argv[0], sys.argv[1:]))
    args = mkParser().parse_args()

    if not args.OUTFILE:
        errorOut('-out <path> is required')
    if not (args.DO_TABLE or args.DO_SIZE or args.DO_VALS):
        errorOut('one of -tn, -s or -v is required')
    if args.DO_SIZE and args.DO_VALS and not args.DO_TABLE:
        errorOut('-s and -v cannot be used together')

    infoFile = 'info.txt'
    if not os.path.exists(infoFile):
        errorOut('{} not found'.format(infoFile))
    xlocToken = parseTileInfo(infoFile, args.ENCODING)

    with open(args.OUTFILE, 'wb') as outfile:
        if args.DO_TABLE:
            writeMapFile(xlocToken, outfile)
        elif args.DO_SIZE:
            count = len(xlocToken['_TILES'])
            outfile.write(struct.pack('!B', count))
        elif args.DO_VALS:
            writeValuesFile( xlocToken, outfile )
 ##############################################################################
 # Run the command-line tool only when executed as a script, not when imported.
 if __name__ == '__main__':
    main()