diff --git a/xwords4/dawg/Slovak/Makefile b/xwords4/dawg/Slovak/Makefile index f0092dc59..1c267415d 100644 --- a/xwords4/dawg/Slovak/Makefile +++ b/xwords4/dawg/Slovak/Makefile @@ -28,7 +28,7 @@ SOURCEDICT ?= $(XWDICTPATH)/Slovak/Scrabble_SK.dict $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile cat $< | tr -d '\r' | \ - tr [aáäbcčdďeéfghiíjklĺľmnňoôóprŕsštťuúvxyýzž] [AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÔÓPRŔSŠTŤUÚVXYÝZŽ] | \ + tr '[:lower:]' '[:upper:]' | \ sed 's,.,\U\0,g' | \ grep '^[AÁÄBCČDĎEÉFGHIÍJKLĹĽMNŇOÔÓPRŔSŠTŤUÚVXYÝZŽ]*$$' | \ gzip -c > $@ diff --git a/xwords4/dawg/dawg2dict.pl b/xwords4/dawg/dawg2dict.pl deleted file mode 100755 index 977228fd7..000000000 --- a/xwords4/dawg/dawg2dict.pl +++ /dev/null @@ -1,551 +0,0 @@ -#!/usr/bin/perl -# -# Copyright 2004 - 2012 by Eric House (xwords@eehouse.org) -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - - -# Given a .pdb or .xwd file, print all the words in the DAWG to -# stdout. - -use strict; -use Fcntl; -use Encode 'from_to'; -use Encode; -use Digest::MD5; - -my $gInFile; -my $gSumOnly = 0; -my $gSum = ''; -my $gDescOnly = 0; -my $gDesc; -my $gDoRaw = 0; -my $gDoJSON = 0; -my $gFileType; -my $gNodeSize; -my $gIsUTF; - -use Fcntl 'SEEK_CUR'; - -use constant { - DICT_HEADER_MASK => 0x0008, - DICT_SYNONYMS_MASK => 0x0010 -}; - -sub systell { sysseek($_[0], 0, SEEK_CUR) } - -sub usage() { - print STDERR "USAGE: $0" - . " [-raw | -json] " - . " [-get-sum]" - . " [-get-desc]" - . " -dict " - . "\n" - . "\t(Takes a .pdb or .xwd and prints its words to stdout)\n"; - exit 1; -} - -sub parseARGV() { - - while ( my $parm = shift(@ARGV) ) { - if ( $parm eq "-raw" ) { - $gDoRaw = 1; - } elsif ( $parm eq "-json" ) { - $gDoJSON = 1; - } elsif ( $parm eq "-dict" ) { - $gInFile = shift(@ARGV); - } elsif ( $parm eq "-get-sum" ) { - $gSumOnly = 1; - } elsif ( $parm eq "-get-desc" ) { - $gDescOnly = 1; - } else { - usage(); - } - } - - if ( $gInFile =~ m|.xwd$| ) { - $gFileType = "xwd"; - } elsif ( $gInFile =~ m|.pdb$| ) { - $gFileType = "pdb"; - } else { - usage(); - } -} # parseARGV - -sub countSpecials($) { - my ( $facesRef ) = @_; - my $count = 0; - - map { ++$count if ( ord($_) < 32 ); } @$facesRef; - return $count; -} # countSpecials - -sub readXWDFaces($$$) { - my ( $fh, $facRef, $nSpecials ) = @_; - - my ( $buf, $nRead, $nChars, $nBytes ); - $nRead = sysread( $fh, $buf, 1 ); - $nBytes = unpack( 'c', $buf ); - printf STDERR "nBytes of faces: %d\n", $nBytes; - - if ( $gIsUTF ) { - $nRead = sysread( $fh, $buf, 1 ); - $nChars = unpack( 'c', $buf ); - printf STDERR "nChars of faces: %d\n", $nChars; - } - - # At this point $fh is pointing at the start of data - if ( $gSumOnly ) { - my $sum = sumRestOfFile( $fh ); - if ( $sum eq $gSum ) { - print STDERR "got: $sum, $gSum twice!!!\n"; - } elsif ( !$gSum ) { - $gSum = $sum; - } else { - print STDERR "disagreement: $sum vs $gSum\n"; - exit( -1 ); - } - } else { - binmode( $fh, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; - sysread( $fh, $buf, $nBytes ); - length($buf) == $nBytes or die "didn't read expected number of bytes\n"; - binmode( $fh ) or die "binmode failed\n"; - - print STDERR "string now: $buf\n"; - my @faces; - my $index = 0; - for ( my $nFound = 0; $nFound < $nChars; ) { - my $chr = substr( $buf, $index++, 1 ); - if ( $chr eq ' ' ) { - # For testing, this next line uses last rather than - # first alternate - # $faces[$#faces] = substr( $buf, $index, 1 ); - ++$index; - next; - } - print STDERR "pushing $chr \n"; - push( @faces, $chr ); - ++$nFound; - } - - printf STDERR "at 0x%x after reading faces\n", systell($fh); - - ${$nSpecials} = countSpecials( \@faces ); - @{$facRef} = @faces; - printf STDERR "readXWDFaces=>%d\n", $nChars; - } - return $nChars; -} # readXWDFaces - -sub skipBitmap($) { - my ( $fh ) = @_; - my $buf; - sysread( $fh, $buf, 1 ); - my $nCols = unpack( 'C', $buf ); - if ( $nCols > 0 ) { - sysread( $fh, $buf, 1 ); - my $nRows = unpack( 'C', $buf ); - my $nBytes = (($nRows * $nCols) + 7) / 8; - - sysread( $fh, $buf, $nBytes ); - } - printf STDERR "skipBitmap\n"; -} # skipBitmap - -sub getSpecials($$$) { - my ( $fh, $nSpecials, $specRef ) = @_; - - my @specials; - for ( my $i = 0; $i < $nSpecials; ++$i ) { - my $buf; - sysread( $fh, $buf, 1 ); - my $len = unpack( 'C', $buf ); - sysread( $fh, $buf, $len ); - push( @specials, $buf ); - skipBitmap( $fh ); - skipBitmap( $fh ); - } - - @{$specRef} = @specials; -} # getSpecials - -sub readNodesToEnd($) { - my ( $fh ) = @_; - my @nodes; - my $count = 0; - my $offset = 4 - $gNodeSize; - my ( $buf, $nRead ); - - do { - $nRead = sysread( $fh, $buf, $gNodeSize, $offset ); - $count += $nRead; - my $node = unpack( 'N', $buf ); - push( @nodes, $node ); - } while ( $nRead == $gNodeSize ); - die "out of sync? nRead=$nRead, count=$count" if $nRead != 0; - - return @nodes; -} # readNodesToEnd - -sub printHeader($$) { - my ( $buf, $len ) = @_; - printf STDERR "skipped %d bytes of header:\n", $len + 2; - my $count; - if ( $len == 4 ) { - ($count) = unpack( 'N', $buf ); - } else { - print STDERR 'unpacking...\n'; - ($count, $gDesc, $gSum) = unpack( 'N Z* Z*', $buf ); - } - printf STDERR "has %d words\n", $count; -} - -sub nodeSizeFromFlags($$) { - my ( $fh, $flags ) = @_; - - my $bitSet = $flags & DICT_HEADER_MASK; - if ( 0 != $bitSet ){ - $flags = $flags & ~DICT_HEADER_MASK; - # need to skip header - my $buf; - 2 == sysread( $fh, $buf, 2 ) || die "couldn't read length of header"; - my $len = unpack( "n", $buf ); - $len == sysread( $fh, $buf, $len ) || die "couldn't read header bytes"; - printHeader( $buf, $len ); - } - - $flags = $flags & ~DICT_SYNONYMS_MASK; - - my $nodeSize; - if ( $flags == 2 || $flags == 4 ) { - $nodeSize = 3; - } elsif ( $flags == 3 || $flags == 5 ) { - $nodeSize = 4; - } else { - die "invalid dict flags $flags"; - } - my $isUTF = $flags == 4 || $flags == 5; - - return ( $nodeSize, $isUTF ); -} # nodeSizeFromFlags - -sub mergeSpecials($$) { - my ( $facesRef, $specialsRef ) = @_; - for ( my $i = 0; $i < @$facesRef; ++$i ) { - my $ref = ord($$facesRef[$i]); - if ( $ref < 32 ) { - $$facesRef[$i] = $$specialsRef[$ref]; - #print STDERR "set $ref to $$specialsRef[$ref]\n"; - } - } -} - -sub sumRestOfFile($) { - my ( $fh ) = @_; - my $buf; - my $count = 0; - my $md5 = Digest::MD5->new; - for ( ; ; ) { - my $nRead = sysread( $fh, $buf, 128 ); - 0 == $nRead && last; - $count += $nRead; - $md5->add( $buf ); - } - # print STDERR "read $count bytes\n"; - return $md5->hexdigest(); -} - -sub prepXWD($$$$) { - my ( $fh, $facRef, $nodesRef, $startRef ) = @_; - my $done = 1; - - printf STDERR "at 0x%x at start\n", systell($fh); - my $buf; - my $nRead = sysread( $fh, $buf, 2 ); - my $flags = unpack( "n", $buf ); - - ($gNodeSize, $gIsUTF) = nodeSizeFromFlags( $fh, $flags ); - - my $nSpecials; - my $faceCount = readXWDFaces( $fh, $facRef, \$nSpecials ); # does checksum - - if ( $gSumOnly ) { - print STDOUT $gSum, "\n"; - } elsif( $gDescOnly ) { - print STDOUT $gDesc, "\n"; - } else { - $done = 0; - printf STDERR "at 0x%x before header read\n", systell($fh); - # skip xloc header - $nRead = sysread( $fh, $buf, 2 ); - - # skip values info. - printf STDERR "at 0x%x before reading %d values\n", systell($fh), $faceCount; - sysread( $fh, $buf, $faceCount * 2 ); - printf STDERR "at 0x%x after values read\n", systell($fh); - - printf STDERR "at 0x%x before specials read\n", systell($fh); - my @specials; - getSpecials( $fh, $nSpecials, \@specials ); - mergeSpecials( $facRef, \@specials ); - printf STDERR "at 0x%x after specials read\n", systell($fh); - - printf STDERR "at 0x%x before offset read\n", systell($fh); - sysread( $fh, $buf, 4 ); - $$startRef = unpack( 'N', $buf ); - print STDERR "startRef=$$startRef\n"; - - my @nodes = readNodesToEnd( $fh ); - - @$nodesRef = @nodes; - } - print STDERR "prepXWD done\n"; - return $done; -} # prepXWD - -sub readPDBSpecials($$$$$) { - my ( $fh, $nChars, $nToRead, $nSpecials, $specRef ) = @_; - - my ( $nRead, $buf ); - - # first skip counts and values, and xloc header - $nRead += sysread( $fh, $buf, ($nChars * 2) + 2 ); - - while ( $nSpecials-- ) { - $nRead += sysread( $fh, $buf, 8 ); # sizeof(Xloc_specialEntry) - my @chars = unpack( 'C8', $buf ); - my $str; - foreach my $char (@chars) { - if ( $char == 0 ) { # null-terminated on palm - last; - } - $str .= chr($char); - } - push( @$specRef, $str ); - } - - $nRead += sysread( $fh, $buf, $nToRead - $nRead ); # skip bitmaps - - return $nRead; -} # readPDBSpecials - -sub prepPDB($$$$) { - my ( $fh, $facRef, $nodesRef, $startRef ) = @_; - - $$startRef = 0; # always for palm? - - my $buf; - # skip header info - my $nRead = sysread( $fh, $buf, 76 ); - $nRead += sysread( $fh, $buf, 2 ); - my $nRecs = unpack( 'n', $buf ); - - my @offsets; - for ( my $i = 0; $i < $nRecs; ++$i ) { - $nRead += sysread( $fh, $buf, 4 ); - push( @offsets, unpack( 'N', $buf ) ); - $nRead += sysread( $fh, $buf, 4 ); # skip - } - - die "too far" if $nRead > $offsets[0]; - while ( $nRead < $offsets[0] ) { - $nRead += sysread( $fh, $buf, 1 ); - } - - my $facesOffset = $offsets[1]; - my $nChars = ($offsets[2] - $facesOffset) / 2; - $nRead += sysread( $fh, $buf, $facesOffset - $nRead ); - my @tmp = unpack( 'Nc6n', $buf ); - $gNodeSize = nodeSizeFromFlags( 0, $tmp[7] ); - - my @faces; - for ( my $i = 0; $i < $nChars; ++$i ) { - $nRead += sysread( $fh, $buf, 2 ); - push( @faces, chr(unpack( "n", $buf ) ) ); - } - @{$facRef} = @faces; - - die "out of sync: $nRead != $offsets[2]" if $nRead != $offsets[2]; - - my @specials; - $nRead += readPDBSpecials( $fh, $nChars, $offsets[3] - $nRead, - countSpecials($facRef), \@specials ); - mergeSpecials( $facRef, \@specials ); - - die "out of sync" if $nRead != $offsets[3]; - my @nodes = readNodesToEnd( $fh ); - - @$nodesRef = @nodes; -} # prepPDB - -sub parseNode($$$$$) { - my ( $node, $chrIndex, $nextEdge, $accepting, $last ) = @_; - - if ( $gNodeSize == 4 ) { - $$accepting = ($node & 0x00008000) != 0; - $$last = ($node & 0x00004000) != 0; - $$chrIndex = ($node & 0x00003f00) >> 8; - $$nextEdge = ($node >> 16) + (($node & 0x000000FF) << 16); - } elsif( $gNodeSize == 3 ) { - $$accepting = ($node & 0x00000080) != 0; - $$last = ($node & 0x00000040) != 0; - $$chrIndex = $node & 0x0000001f; - $$nextEdge = ($node >> 8) + (($node & 0x00000020) << 11); - } - - # printf "%x: acpt=$$accepting; last=$$last; " - # . "next=$$nextEdge; ci=$$chrIndex\n", $node; -} # parseNode - -sub printStr($$) { - my ( $strRef, $facesRef ) = @_; - - print join( "", map {$$facesRef[$_]} @$strRef), "\n"; -} # printStr - -# Given an array of 4-byte nodes, a start index. and another array of -# two-byte faces, print out all of the words in the nodes array. -sub printDAWG($$$$) { - my ( $strRef, $arrRef, $start, $facesRef ) = @_; - - die "infinite recursion???" if @$strRef > 15; - - for ( ; ; ) { - my $node = $$arrRef[$start++]; - my $nextEdge; - my $chrIndex; - my $accepting; - my $lastEdge; - - parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge ); - - push( @$strRef, $chrIndex ); - if ( $accepting ) { - printStr( $strRef, $facesRef ); - } - - if ( $nextEdge != 0 ) { - printDAWG( $strRef, $arrRef, $nextEdge, $facesRef ); - } - - pop( @$strRef ); - - if ( $lastEdge ) { - last; - } - } -} # printDAWG - -sub printNodes($$) { - my ( $nr, $fr ) = @_; - - my $len = @$nr; - for ( my $i = 0; $i < $len; ++$i ) { - my $node = $$nr[$i]; - - my ( $chrIndex, $nextEdge, $accepting, $lastEdge ); - parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge ); - - printf "%.8x: (%.8x) %2d(%s) %.8x ", $i, $node, $chrIndex, - $$fr[$chrIndex], $nextEdge; - print ($accepting? "A":"a"); - print " "; - print ($lastEdge? "L":"l"); - print "\n"; - } -} - -sub printStartJson($) { - my ( $startIndex ) = @_; - printf( " start: 0x%.8x,\n", $startIndex ); -} - -sub printCharsJson($) { - my ( $fr ) = @_; - print " chars: [ "; - foreach my $char (@$fr) { - print "\"$char\", " - } - print "],\n" -} - -sub printNodesJson($) { - my ( $nr ) = @_; - print " dawg: [\n"; - - my $len = @$nr; - my $newLine = 1; - for ( my $ii = 0; $ii < $len; ++$ii ) { - my $node = $$nr[$ii]; - - if ( $newLine ) { - printf( " /*%.6x*/ ", $ii ); - $newLine = 0; - } - - printf "0x%.8x, ", $node; - - my ( $chrIndex, $nextEdge, $accepting, $lastEdge ); - parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge ); - if ( $lastEdge ) { - print "\n"; - $newLine = 1; - } - } - - print "\n ],\n" -} - -################################################################# -# main -################################################################# - -binmode( STDERR, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; - -parseARGV(); - -sysopen(INFILE, $gInFile, O_RDONLY) or die "couldn't open $gInFile: $!\n";; -binmode INFILE; - -my @faces; -my @nodes; -my $startIndex; -my $done; - -if ( $gFileType eq "xwd" ){ - $done = prepXWD( *INFILE, \@faces, \@nodes, \$startIndex ); -} elsif ( $gFileType eq "pdb" ) { - $done = prepPDB( *INFILE, \@faces, \@nodes, \$startIndex ); -} -close INFILE; - -die "no nodes!!!" if 0 == @nodes; - -if ( $done ) { - # we're done... -} elsif ( $gDoRaw ) { - printNodes( \@nodes, \@faces ); -} elsif ( $gDoJSON ) { - print "dict = {\n"; - printStartJson( $startIndex ); - printCharsJson( \@faces ); - printNodesJson( \@nodes ); - print "}\n"; -} else { - binmode( STDOUT, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; - printDAWG( [], \@nodes, $startIndex, \@faces ); -} - -exit 0;