From cec6a805e3e557b4fe490ad89502247243e284bb Mon Sep 17 00:00:00 2001 From: ehouse Date: Sat, 1 Nov 2003 06:19:20 +0000 Subject: [PATCH] first checkin from personal project --- dawg/English/Makefile | 40 ++ dawg/English/Makefile.BasEnglish | 35 ++ dawg/English/info.txt | 71 +++ dawg/Makefile.2to8 | 7 + dawg/Makefile.langcommon | 257 ++++++++++ dawg/dict2dawg.pl | 798 +++++++++++++++++++++++++++++++ dawg/frank_mkspecials.pl | 45 ++ dawg/palm_mkspecials.pl | 111 +++++ dawg/par.pl | 234 +++++++++ dawg/xloc.pl | 42 ++ dawg/xloc.pm | 180 +++++++ 11 files changed, 1820 insertions(+) create mode 100644 dawg/English/Makefile create mode 100644 dawg/English/Makefile.BasEnglish create mode 100644 dawg/English/info.txt create mode 100644 dawg/Makefile.2to8 create mode 100644 dawg/Makefile.langcommon create mode 100755 dawg/dict2dawg.pl create mode 100755 dawg/frank_mkspecials.pl create mode 100755 dawg/palm_mkspecials.pl create mode 100755 dawg/par.pl create mode 100755 dawg/xloc.pl create mode 100644 dawg/xloc.pm diff --git a/dawg/English/Makefile b/dawg/English/Makefile new file mode 100644 index 000000000..aa7edea4e --- /dev/null +++ b/dawg/English/Makefile @@ -0,0 +1,40 @@ +# -*-mode: Makefile -*- +# Copyright 2002 by Eric House (fixin@peak.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +TARGET_TYPE ?= FRANK + +# This represents the default -- for now +COMMAND = -f Makefile.BasEnglish TARGET_TYPE=FRANK + +alleng: + for mfile in Makefile.BasEnglish Makefile.OSW Makefile.TWL98 Makefile.CollegeEng; do \ + $(MAKE) -f $$mfile TARGET_TYPE=$(TARGET_TYPE); \ + done + +%: + $(MAKE) $(COMMAND) $@ + +all: + $(MAKE) $(COMMAND) + +clean: + $(MAKE) $(COMMAND) clean + +help: + @echo "try make -f Makefile.[BasEnglish|CollegeEng] \\" + @echo " TARGET_TYPE=[PALM|FRANK]" + diff --git a/dawg/English/Makefile.BasEnglish b/dawg/English/Makefile.BasEnglish new file mode 100644 index 000000000..d8151e1c0 --- /dev/null +++ b/dawg/English/Makefile.BasEnglish @@ -0,0 +1,35 @@ +# -*-mode: Makefile -*- +# Copyright 2002 by Eric House (fixin@peak.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +LANG=BasEnglish +LANGCODE=en_US +#NEWDAWG=1 + +TARGET_TYPE ?= FRANK + +include ../Makefile.2to8 + +include ../Makefile.langcommon + +$(LANG)Main.dict.gz: BasEnglish.dict.gz + ln -s $< $@ + +# Everything but creating of the Main.dict file is inherited from the +# "parent" Makefile.langcommon in the parent directory. + +clean: clean_common + rm -f $(LANG)Main.dict.gz *.bin $(LANG)*.pdb $(LANG)*.seb diff --git a/dawg/English/info.txt b/dawg/English/info.txt new file mode 100644 index 000000000..40cf89fb3 --- /dev/null +++ b/dawg/English/info.txt @@ -0,0 +1,71 @@ +# Copyright 2002 by Eric House (fixin@peak.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +LANGCODE:en_US + +# deal with DOS files +LANGFILTER_PRECLIP: tr -d '\r' | + +LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] +LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' +LANGFILTER_POSTCLIP: | tr -s '\n\r' '\000\000' +LANGFILTER_POSTCLIP: | sort -z + +# We can trust sort (above) to do the right thing since there's no +# high ascii. dict2dawg.pl is much faster if I can trust that its +# input is in sorted order. +NEEDSSORT:false + +LANGINFO:

English dictionaries can contain words with any of the 26 +LANGINFO: letters you think of as making up the alphabet: A-Z. At +LANGINFO: this point any word in your list containing anything else +LANGINFO: will simply be excluded from the dictionary.

+ +# High bit means "official". Next 7 bits are an enum where +# English==1. Low byte is padding +XLOC_HEADER:0x8100 + + +2 0 {"_"} +9 1 'A' +2 3 'B' +2 3 'C' +4 2 'D' +12 1 'E' +2 4 'F' +3 2 'G' +2 4 'H' +9 1 'I' +1 8 'J' +1 5 'K' +4 1 'L' +2 3 'M' +6 1 'N' +8 1 'O' +2 3 'P' +1 10 'Q' +6 1 'R' +4 1 'S' +6 1 'T' +4 1 'U' +2 4 'V' +2 4 'W' +1 8 'X' +2 4 'Y' +1 10 'Z' + + +# should ignore all after the above diff --git a/dawg/Makefile.2to8 b/dawg/Makefile.2to8 new file mode 100644 index 000000000..bb1777d27 --- /dev/null +++ b/dawg/Makefile.2to8 @@ -0,0 +1,7 @@ +# -*-mode: Makefile -*- + +# These are the targets that almost all language makefiles will want. + +SHORT_WORD = 2 +LONG_WORD = 8 + diff --git a/dawg/Makefile.langcommon b/dawg/Makefile.langcommon new file mode 100644 index 000000000..52d33a44a --- /dev/null +++ b/dawg/Makefile.langcommon @@ -0,0 +1,257 @@ +# -*-mode: Makefile -*- + +# Copyright 2000-2002 by Eric House (fixin@peak.org) +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +FRANK_EXT = xwd + +# this will make all dicts the new, larger type +#FORCE_4 = -force4 + +PALM_DICT_TYPE = DAWG +PAR = ../par.pl + +LANGUAGE = $(shell basename $$(pwd)) + +#all: target_all + +# let languages set this first, but we always add blank to it. +BLANK_INFO = "_" /dev/null /dev/null + +# Supply a default so don't have to type so much; feel free to change +TARGET_TYPE ?= FRANK + +ifdef NEWDAWG + TABLE_ARG = -mn +else + TABLE_ARG = -m +endif + +############################################################################## +# PalmOS rules +############################################################################## +ifeq ($(TARGET_TYPE),PALM) + +ifdef NEWDAWG + PDBTYPE = Xwr4 +else + PDBTYPE = Xwr3 +endif + +all: $(LANG)2to8.pdb + +empty: $(LANG)0to0.pdb + +# Those languages that have bitmap files for custom glyphs will need to +# define BMPBINFILES and perhaps provide a rule for building the files +binfiles.stamp: $(BMPBINFILES) + touch binfiles.stamp + +palmspecials.bin: ../palm_mkspecials.pl $(BMPFILES) + $< $(BLANK_INFO) $(LANG_SPECIAL_INFO) > $@ + +# can't just use values.bin because the specials bitmap info is +# platform-specific +palmvalues.bin: values.bin palmspecials.bin + cat $^ > $@ + +# values.bin: palmspecials.bin ../xloc binfiles.stamp +# cd ../ && $(MAKE) xloc +# binfileparms=""; \ +# if [ "$(BMPBINFILES)" != "" ]; then \ +# for f in $(BMPBINFILES)""; \ +# do binfileparms="$$binfileparms -i $$f"; \ +# done; \ +# fi; \ +# ../xloc -l $(LANGCODE) $$binfileparms -T $@ +# cat palmspecials.bin >> $@ + +# header (first record) is node count (long) and 4 chars: +# unsigned char firstEdgeRecNum; +# unsigned char charTableRecNum; +# unsigned char valTableRecNum; +# unsigned char reserved[3]; // worst case this points to a new resource + +# include "flags" as used on the other platforms +palmheader%.bin: $(LANG)%_wordcount.bin $(LANG)%_flags.bin + rm -f $@ + touch $@ +ifdef NEWDAWG + cat $(LANG)$*_flags.bin >> $@ +endif + cat $< >> $@ + perl -e "print pack(\"C\",3)" >> $@ # first edge + perl -e "print pack(\"C\",1)" >> $@ # char table rec number + perl -e "print pack(\"C\",2)" >> $@ # valTable rec number + perl -e "print pack(\"CCC\",0)" >> $@ # reserved 3 bytes + perl -e "print pack(\"CC\",0)" >> $@ # c code added two more... + + +# This works, but leaves out the header info that the current version +# has. I'm not sure anybody cares, though... +$(LANG)%.pdb: dawg$(LANG)%.stamp table.bin palmvalues.bin palmheader%.bin + $(PAR) c -a backup $@ \ + $(basename $(@F)) $(PALM_DICT_TYPE) $(PDBTYPE) \ + palmheader$*.bin table.bin palmvalues.bin dawg$(LANG)$*_*.bin + +# start=$$(echo $@ | sed -e 's/$(LANG)\([0-9]*\)to[0-9]*.pdb/\1/'); \ +# end=$$(echo $@ | sed -e 's/$(LANG)[0-9]*to\([0-9]*\).pdb/\1/'); \ +# zcat $< | grep "^.\{$${start},$${end}\}$$" | \ +# ../dict2pdb -t table.bin -v values.bin -n $(basename $(@F)) \ +# > $@ + +# the files to export for byod +byodbins: table.bin values.bin palmvalues.bin + +#endif # TARGET_TYPE==PALM + +############################################################################## +# Franklin ebook rules +############################################################################## +else +ifeq ($(TARGET_TYPE),FRANK) + +# If we're on a system that can build for Franklin, assume that's what +# we want to build (and the .xwd.saved [<-bug] file for other non-palm +# platforms is a by-product). But if the EBM tools aren't there, just +# build the .xwd file. +ifeq (x$(shell echo -n $$EBOOKMAN_SDK)x,xx) +all: $(LANG)2to8.xwd +else +all: $(LANG)2to8.seb +endif + +empty: $(LANG)0to0.seb + +ifneq (x$(shell echo -n $$EBOOKMAN_SDK)x,xx) +include ${EBOOKMAN_SDK}/ebsdk.uses +endif + +# a binary file (one byte) giving the number of tiles in the dict +charcount.bin: table.bin +ifdef NEWDAWG + siz=$$(wc -c $< | sed -e 's/$ $@ +else + siz=$$(wc -c $< | sed -e 's/$ $@ +endif + +# For each entry in the table whose face < 32, there needs to be a pair of +# pbitm files and a string giving the printing form +frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES) + $< $(BLANK_INFO) $(LANG_SPECIAL_INFO) > $@ + +$(LANG)%.$(FRANK_EXT): dawg$(LANG)%.stamp $(LANG)%_flags.bin charcount.bin table.bin values.bin frankspecials.bin + cat $(LANG)$*_flags.bin charcount.bin table.bin values.bin \ + frankspecials.bin $(LANG)StartLoc.bin $$(ls dawg$(LANG)$*_*.bin) > $@ + cp $@ saveme.bin + +$(LANG)%.seb: $(LANG)%.$(FRANK_EXT) $(LANG)%.atts + ${ESDK_CREATESEB_EXE} $< + cp $< $<.saved + +$(LANG)%.atts: #recreate it each time based on params + echo '_PUB|global+read-only|"Eric_House"' >> $@ + echo "_NAME|global+read-only|\"$(LANG)2to8\"" >> $@ + echo "_EXT|global+read-only|\"$(FRANK_EXT)\"" >> $@ + echo '_LCAT|nosign+global|"CONTENT"' >> $@ + echo '_PERM|global+read-only|"r"' >> $@ + +# values.bin: ../xloc +# cd ../ && $(MAKE) xloc +# ../xloc -l $(LANGCODE) -T $@ + +# the files to export for byod +byodbins: table.bin values.bin frankspecials.bin + + +else + (Need to define TARGET_TYPE if get error pointing to this line) +endif #ifeq ($(TARGET_TYPE),FRANK) +endif + +ifeq (s$(TARGET_TYPE),s) +It's an error not to specify a TARGET_TYPE +endif + +############################################################################## +# shared rules +############################################################################## + +# For some reason I can't fathom dawg$(LANG)% gets nuked every time +# the top-level rule fires (all: for whatever TARGET_TYPE.) It +# happens after the rule finishes.... + +# 16 bits worth of flags for the start of the eventual file. At this +# point, the flags mean this: +# 1: old-style DAWG. +# 2: new-style DAWG, three bytes per node. +# 3: new-style DAWG, four bytes per node +$(LANG)%_flags.bin: dawg$(LANG)%.stamp +ifdef NEWDAWG + if [ 3 == $$(cat $(LANG)$*_nodesize.bin) ] ; \ + then perl -e "print pack(\"n\",0x0002)" > $@; echo "flags=2"; \ + else perl -e "print pack(\"n\",0x0003)" > $@; echo "flags=3"; \ + fi +else + if [ 3 == $$(cat $(LANG)$*_nodesize.bin) ] ; \ + then perl -e "print pack(\"n\",0x0001)" > $@; echo "flags=1"; \ + else echo "ERROR: old format can't handle 4-byte"; exit 1; \ + fi +endif + +dawg$(LANG)%.stamp: $(LANG)Main.dict.gz ../dict2dawg.pl table.bin ../Makefile.langcommon + start=$$(echo $@ | sed -e 's/dawg$(LANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \ + end=$$(echo $@ | sed -e 's/dawg$(LANG)[0-9]*to\([0-9]*\).stamp/\1/'); \ + echo $${start} and $$end; \ + zcat $< | grep "^.\{$${start},$${end}\}$$" | tr '\n' '\0'| \ + sort -z | ../dict2dawg.pl $(TABLE_ARG) table.bin -b 28000 \ + -ob dawg$(LANG)$* \ + -sn $(LANG)StartLoc.bin -k -term 0 -wc $(LANG)$*_wordcount.bin \ + $(FORCE_4) -ns $(LANG)$*_nodesize.bin + touch $@ + +$(LANG)%_wordcount.bin: dawg$(LANG)%.stamp + @echo + +# the files to export for byod +allbins: + $(MAKE) TARGET_TYPE=PALM byodbins + $(MAKE) TARGET_TYPE=FRANK byodbins + +table.bin: ../xloc.pl +ifdef NEWDAWG + perl -I../ ../xloc.pl -tn > $@ +else + perl -I../ ../xloc.pl -t > $@ +endif + +values.bin: ../xloc.pl + perl -I../ ../xloc.pl -v > $@ + +%.dict: %.dict.gz + zcat $< > $@ + +clean_common: + rm -f $(LANG)Main.dict *.bin *.pdb *.seb dawg*.stamp *.$(FRANK_EXT) \ + $(LANG)*.pdb $(LANG)*.seb + +help: + @echo "make TARGET_TYPE=[FRANK|PALM]" + +test: + @echo $(LANGUAGE) diff --git a/dawg/dict2dawg.pl b/dawg/dict2dawg.pl new file mode 100755 index 000000000..eb1729be2 --- /dev/null +++ b/dawg/dict2dawg.pl @@ -0,0 +1,798 @@ +#!/usr/bin/perl + +############################################################################## +# adapted from C++ code Copyright (C) 2000 Falk Hueffner +# This version Copyright (C) 2002 Eric House (fixin@peak.org) +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA +############################################################################## + +# inputs: 0. Name of file mapping letters to 0..31 values. In English +# case just contains A..Z. This will be used to translate the tries +# on output. +# 1. Max number of bytes per binary output file. +# +# 2. Basename of binary files for output. + +# 3. Name of file to which to write the number of the +# startNode, since I'm not rewriting a bunch of code to expect Falk's +# '*' node at the start. +# + +# In STDIN, the text file to be compressed. It absolutely +# must be sorted. The sort doesn't have to follow the order in the +# map file, however. + +# This is meant eventually to be runnable as part of a cgi system for +# letting users generating Crosswords dicts online. + + + +use strict; +use POSIX; + +my $gFirstDiff; +my @gCurrentWord; +my $gCurrentWord; # save so can check for sortedness +my $gDone = 0; +my @gInputStrings; +my $gNeedsSort = 1; # read from cmd line eventually +my @gNodes; # final array of nodes +my $gNBytesPerOutfile = 0xFFFFFFFF; +my $gTableFile; +my $gOutFileBase; +my $gStartNodeOut; +my $gInFileName; +my $gKillIfMissing = 1; +my $gTermChar = '/n'; +my $gDumpText = 0; # dump the dict as text after? +my $gCountFile; +my $gBytesPerNodeFile; # where to write whether node size 3 or 4 +my $gWordCount = 0; +my %gTableHash; +my @gRevMap; +my $debug = 0; +my %gSubsHash; +my $gForceFour = 0; # use four bytes regardless of need? +my $gNBytesPerNode; +my $gUseUnicode; + +main(); + +exit; + +############################################################################## + +sub main() { + + if ( !parseARGV() ) { + usage(); + exit(); + } + + makeTableHash(); + + my $infile; + + if ( $gInFileName ) { + open $infile, "<$gInFileName"; + } else { + $infile = \*STDIN; + } + + @gInputStrings = parseAndSort( $infile ); + if ( $gInFileName ) { + close $infile; + } + + # Do I need this stupid thing? Better to move the first row to + # the front of the array and patch everything else. Or fix the + # non-palm dictionary format to include the offset of the first + # node. + + my $dummyNode = 0xFFFFFFFF; + @gNodes = ( $dummyNode ); + + readNextWord(); + + my $firstRootChildOffset = buildNode(0); + + moveTopToFront( \$firstRootChildOffset ); + + if ( $gStartNodeOut ) { + writeOutStartNode( $gStartNodeOut, $firstRootChildOffset ); + } + + print STDERR "\n... dumping table ...\n" if $debug; + printNodes( \@gNodes, "done with main" ) if $debug; + + # write out the number of nodes if requested + if ( $gCountFile ) { + open OFILE, "> $gCountFile"; + print OFILE pack( "N", $gWordCount ); + close OFILE; + print STDERR "wrote out: got $gWordCount words\n"; + } + + if ( $gOutFileBase ) { + emitNodes( $gNBytesPerOutfile, $gOutFileBase ); + } + + if ( $gDumpText && @gNodes > 0 ) { + printOneLevel( $firstRootChildOffset, "" ); + } + + if ( $gBytesPerNodeFile ) { + open OFILE, "> $gBytesPerNodeFile"; + print OFILE $gNBytesPerNode; + close OFILE; + } + print STDERR "Used $gNBytesPerNode per node.\n"; +} # main + +# We now have an array of nodes with the last subarray being the +# logical top of the tree. Move them to the start, fixing all fco +# refs, so that legacy code like Palm can assume top==0. +# +# Note: It'd probably be a bit faster to integrate this with emitNodes +# -- unless I need to have an in-memory list that can be used for +# lookups. But that's best for debugging, so keep it this way for now. +# +# Also Note: the first node is a dummy that can and should be tossed +# now. + +sub moveTopToFront($) { + my ( $firstRef ) = @_; + + my $firstChild = ${$firstRef}; + ${$firstRef} = 0; + my @lastSub; + + if ( $firstChild > 0 ) { + # remove the last (the root) subarray + @lastSub = splice( @gNodes, $firstChild ); + } else { + die "there should be no words!!" if $gWordCount != 0; + } + # remove the first (garbage) node + shift @gNodes; + + my $diff; + if ( $firstChild > 0 ) { + # -1 because all move down by 1; see prev line + $diff = @lastSub - 1; + die "something wrong with len\n" if $diff < 0; + } else { + $diff = 0; + } + + # stick it on the front + splice( @gNodes, 0, 0, @lastSub); + + # We add $diff to everything. There's no subtracting because + # nobody had any refs to the top list. + + for ( my $i = 0; $i < @gNodes; ++$i ) { + my $fco = TrieNodeGetFirstChildOffset( $gNodes[$i] ); + if ( $fco != 0 ) { # 0 means NONE, not 0th!! + TrieNodeSetFirstChildOffset( \$gNodes[$i], $fco+$diff ); + } + } +} # moveTopToFront + + +sub buildNode { + my ( $depth ) = @_; + + if ( @gCurrentWord == $depth ) { + # End of word reached. If the next word isn't a continuation + # of the current one, then we've reached the bottom of the + # recursion tree. + readNextWord(); + if ($gFirstDiff < $depth || $gDone) { + return 0; + } + } + + my @newedges; + + do { + my $letter = $gCurrentWord[$depth]; + my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0; + + my $nodeOffset = buildNode($depth+1); + my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset); + push( @newedges, $newNode ); + + } while ( ($gFirstDiff == $depth) && !$gDone); + + TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 ); + + return addNodes( \@newedges ); +} # buildNode + +sub addNodes { + my ( $newedgesR ) = @_; + + my $found = findSubArray( $newedgesR ); + + if ( $found >= 0 ) { + die "0 is an invalid match!!!" if $found == 0; + return $found; + } else { + + my $firstFreeIndex = @gNodes; + + print STDERR "adding...\n" if $debug; + printNodes( $newedgesR ) if $debug; + + push @gNodes, (@{$newedgesR}); + + registerSubArray( $newedgesR, $firstFreeIndex ); + return $firstFreeIndex; + } +} # addNodes + +sub printNode { + my ( $index, $node ) = @_; + + print STDERR "[$index] "; + + printf( STDERR + "letter=%d; isTerminal=%d; isLastSib=%d; fco=%d;\n", + TrieNodeGetLetter($node), + TrieNodeGetIsTerminal($node), + TrieNodeGetIsLastSibling($node), + TrieNodeGetFirstChildOffset($node)); +} # printNode + +sub printNodes { + my ( $nodesR, $name ) = @_; + + my $len = @{$nodesR}; + # print "printNodes($name): len = $len\n"; + + for ( my $i = 0; $i < $len; ++$i ) { + my $node = ${$nodesR}[$i]; + printNode( $i, $node ); + } + +} + + +# Hashing. We'll keep a hash of offsets into the existing nodes +# array, and as the key use a string that represents the entire sub +# array. Since the key is what we're matching for, there should never +# be more than one value per hash and so we don't need buckets. +# Return -1 if there's no match. + +sub findSubArray { + my ( $newedgesR ) = @_; + + my $key = join('', @{$newedgesR}); + + if ( exists( $gSubsHash{$key} ) ) { + return $gSubsHash{$key}; + } else { + return -1; + } +} # findSubArray + +# add to the hash +sub registerSubArray { + my ( $edgesR, $nodeLoc ) = @_; + + my $key = join( '', @{$edgesR} ); + + if ( exists $gSubsHash{$key} ) { + die "entry for key shouldn't exist!!"; + } else { + $gSubsHash{$key} = $nodeLoc; + } + +} # registerSubArray + +sub readNextWord() { + my @word; + + if ( !$gDone ) { + $gDone = @gInputStrings == 0; + if ( !$gDone ) { + @word = @{shift @gInputStrings}; + } else { + print STDERR "gDone set to true\n" if $debug; + } + + print STDERR "got word: ", join(',',@word), "\n" if $debug; + } + my $numCommonLetters = 0; + my $len = @word; + if ( @gCurrentWord < $len ) { + $len = @gCurrentWord; + } + + while ( @gCurrentWord[$numCommonLetters] eq @word[$numCommonLetters] + && $numCommonLetters < $len) { + ++$numCommonLetters; + } + + $gFirstDiff = $numCommonLetters; + die "words ", join(",",@gCurrentWord), " and ", join(",", @word), " out of order" if #$debug && + @gCurrentWord > 0 && @word > 0 + && !firstBeforeSecond( \@gCurrentWord, \@word ); + @gCurrentWord = @word; +} # readNextWord + +sub firstBeforeSecond { + my ( $firstR, $secondR ) = @_; + + for ( my $i = 0; ; ++$i ) { + + # if we reach the end of the first word/list, we're done. + if ( $i == @{$firstR} ) { + die "duplicate!!!" if $i == @{$secondR}; + return 1; + # but if we reach the second end first, we've failed + } elsif ( $i == @{$secondR} ) { + return 0; + } + + my $diff = ${$firstR}[$i] <=> ${$secondR}[$i]; + + if ( $diff == 0 ) { + next; + } else { + return $diff < 0; + } + } +} # firstBeforeSecond + +# passed to sort. Should remain unprototyped for effeciency's sake + +sub cmpWords { + + my $lenA = @{$a}; + my $lenB = @{$b}; + my $min = $lenA > $lenB? $lenB: $lenA; + + for ( my $i = 0; $i < $min; ++$i ) { + my $ac = ${$a}[$i]; + my $bc = ${$b}[$i]; + + my $res = $ac <=> $bc; + + if ( $res != 0 ) { + return $res; # we're done + } + } + + # If we got here, they match up to their common length. Longer is + # greater. + my $res = @{$a} <=> @{$b}; + return $res; # which is longer? +} # cmpWords + +sub parseAndSort() { + my ( $infile ) = @_; + + my @wordlist; + my @word; + + WORDLOOP: + for ( ; ; ) { + + my $dropWord = 0; + splice @word; # empty it + + # for each byte + for ( ; ; ) { + my $byt = getc($infile); + + if ( $byt eq undef ) { + last WORDLOOP; + } elsif ( $byt eq $gTermChar ) { + if ( !$dropWord ) { + push @wordlist, [ @word ]; + ++$gWordCount; + } + next WORDLOOP; + } elsif ( exists( $gTableHash{$byt} ) ) { + if ( !$dropWord ) { + push @word, $gTableHash{$byt}; + die "word too long" if @word > 15; + } + } elsif ($gKillIfMissing) { + die "char $byt (", $byt+0, ") not in map file $gTableFile\n"; + } else { + $dropWord = 1; + splice @word; # lose anything we already have + } + } + } + + if ( $gNeedsSort && ($gWordCount > 0) ) { + @wordlist = sort cmpWords @wordlist; + } + + print STDERR "length of list is ", @wordlist + 0, ".\n" if $debug; + + return @wordlist; +} # parseAndSort + +# Print binary representation of trie array. This isn't used yet, but +# eventually it'll want to dump to multiple files appropriate for Palm +# that can be catenated together on other platforms. There'll need to +# be a file giving the offset of the first node too. Also, might want +# to move to 4-byte representation when the input can't otherwise be +# handled. + +sub dumpNodes { + + for ( my $i = 0; $i < @gNodes; ++$i ) { + my $node = $gNodes[$i]; + my $bstr = pack( "I", $node ); + print STDOUT $bstr; + } +} + +############################################################################## +# Little node-field setters and getters to hide what bits represent +# what. +############################################################################## + +sub TrieNodeSetIsTerminal { + my ( $nodeR, $isTerminal ) = @_; + + if ( $isTerminal ) { + ${$nodeR} |= 1 << 31; + } else { + ${$nodeR} &= ~(1 << 31); + } +} + +sub TrieNodeGetIsTerminal { + my ( $node ) = @_; + return ($node & 1 << 31) != 0; +} + +sub TrieNodeSetIsLastSibling { + my ( $nodeR, $isLastSibling ) = @_; + if ( $isLastSibling ) { + ${$nodeR} |= 1 << 30; + } else { + ${$nodeR} &= ~(1 << 30); + } +} + +sub TrieNodeGetIsLastSibling { + my ( $node ) = @_; + return ($node & 1 << 30) != 0; +} + +sub TrieNodeSetLetter { + my ( $nodeR, $letter ) = @_; + + die "letter ", $letter, " too big" if $letter >= 32; + + my $mask = ~(0x1F << 25); + ${$nodeR} &= $mask; # clear all the bits + ${$nodeR} |= ($letter << 25); # set new ones +} + +sub TrieNodeGetLetter { + my ( $node ) = @_; + $node >>= 25; + $node &= 0x1F; + return $node; +} + +sub TrieNodeSetFirstChildOffset { + my ( $nodeR, $fco ) = @_; + + die "$fco larger than 25 bits" if ($fco & 0xFE000000) != 0; + + my $mask = ~0x01FFFFFF; + ${$nodeR} &= $mask; # clear all the bits + ${$nodeR} |= $fco; # set new ones +} + +sub TrieNodeGetFirstChildOffset { + my ( $node ) = @_; + $node &= 0x01FFFFFF; # 24 bits + return $node; +} + + +sub MakeTrieNode { + my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_; + my $result = 0; + + TrieNodeSetIsTerminal( \$result, $isTerminal ); + TrieNodeSetIsLastSibling( \$result, $isLastSibling ); + TrieNodeSetLetter( \$result, $letter ); + TrieNodeSetFirstChildOffset( \$result, $firstChildOffset ); + + return $result; +} # MakeTrieNode + +# Caller may need to know the offset of the first top-level node. +# Write it here. +sub writeOutStartNode { + my ( $startNodeOut, $firstRootChildOffset ) = @_; + + open NODEOUT, ">$startNodeOut"; + print NODEOUT pack( "N", $firstRootChildOffset ); + close NODEOUT; +} # writeOutStartNode + +# build the hash for translating. I'm using a hash assuming it'll be +# fast. Key is the letter; value is the 0..31 value to be output. +sub makeTableHash { + my $i; + open TABLEFILE, "< $gTableFile"; + + splice @gRevMap; # empty it + + for ( $i = 0; ; ++$i ) { + my $ch = getc(TABLEFILE); + if ( $ch eq undef ) { + last; + } + + if ( $gUseUnicode ) { # skip the first byte each time: tmp HACK!!! + $ch = getc(TABLEFILE); + } + if ( $ch eq undef ) { + last; + } + + push @gRevMap, $ch; + + if ( ord($ch) == 0 ) { # blank + next; # we want to increment i when blank seen since + # it is a tile value + } + + die "$gTableFile too large\n" if $i > 32; + die "only blank (0) can be 32nd char\n" if ($i == 32 && $ch != 0); + + $gTableHash{$ch} = $i; + } + + close TABLEFILE; +} # makeTableHash + +# emitNodes. "input" is $gNodes. From it we write up to +# $nBytesPerOutfile to files named $outFileBase0..n, mapping the +# letter field down to 5 bits with a hash built from $tableFile. If +# at any point we encounter a letter not in the hash we fail with an +# error. + +sub emitNodes($$) { + my ( $gNBytesPerOutfile, $outFileBase ) = @_; + + # now do the emit. + + # is 17 bits enough? + printf STDOUT ("There are %d (0x%x) nodes in this DAWG.\n", + 0 + @gNodes, 0 + @gNodes ); + if ( @gNodes > 0x1FFFF || $gForceFour ) { + $gNBytesPerNode = 4; + } else { + $gNBytesPerNode = 3; + } + + my $nextIndex = 0; + my $nextFileNum = 0; + + for ( $nextFileNum = 0; ; ++$nextFileNum ) { + + if ( $nextIndex >= @gNodes ) { + last; # we're done + } + + die "Too many outfiles; infinite loop?" if $nextFileNum > 99; + + my $outName = sprintf("${outFileBase}_%03d.bin", $nextFileNum); + open OUTFILE, "> $outName"; + my $curSize = 0; + + while ( $nextIndex < @gNodes ) { + + # scan to find the next terminal + my $i; + for ( $i = $nextIndex; + !TrieNodeGetIsLastSibling($gNodes[$i]); + ++$i ) { + + # do nothing but a sanity check + if ( $i >= @gNodes) { + die "bad trie format: last node not last sibling" ; + } + + } + ++$i; # move beyond the terminal + my $nextSize = ($i - $nextIndex) * $gNBytesPerNode; + if ($curSize + $nextSize > $gNBytesPerOutfile) { + last; + } else { + # emit the subarray + while ( $nextIndex < $i ) { + outputNode( $gNodes[$nextIndex], $gNBytesPerNode, + \*OUTFILE ); + ++$nextIndex; + } + $curSize += $nextSize; + } + } + + close OUTFILE; + } + +} # emitNodes + +sub printWord { + my ( $str ) = @_; + + print STDERR "$str\n"; +} + +# print out the entire dictionary, as text, to STDERR. + +sub printOneLevel { + + my ( $index, $str ) = @_; + + for ( ; ; ) { + + my $newStr = $str; + my $node = $gNodes[$index++]; + + my $lindx = $gRevMap[TrieNodeGetLetter($node)]; + + if ( ord($lindx) >= 0x20 ) { + $newStr .= "$lindx"; + } else { + print STDERR "sub space" if $debug; + $newStr .= "\\" . chr('0'+$lindx); + } + + if ( TrieNodeGetIsTerminal($node) ) { + printWord( $newStr ); + } + + my $fco = TrieNodeGetFirstChildOffset( $node ); + if ( $fco != 0 ) { + printOneLevel( $fco, $newStr ); + } + + if ( TrieNodeGetIsLastSibling($node) ) { + last; + } + } +} + +sub outputNode { + my ( $node, $nBytes, $outfile ) = @_; + + my $fco = TrieNodeGetFirstChildOffset($node); + my $fourthByte; + + if ( $nBytes == 4 ) { + $fourthByte = $fco >> 17; + die "fco too big" if $fourthByte > 0xFF; + $fco &= 0x1FFFF; + } + + # format according to dawg.h: + # typedef struct array_edge { + # unsigned char highByte; + # unsigned char lowByte; + # unsigned char bits; +#ifdef FOUR_BYTE + # unsigned char moreBits; +#endif + # } array_edge; + + # define LETTERMASK 0x1f + # define ACCEPTINGMASK 0x20 + # define LASTEDGEMASK 0x40 + # define LASTBITMASK 0x80 + + # write the fco (less that one bit). We want two bytes worth + # in three-byte mode, and three in four-byte mode (which is + # untested) + for ( my $i = 1; $i >= 0; --$i ) { + my $tmp = ($fco >> ($i * 8)) & 0xFF; + print $outfile pack( "C", $tmp ); + } + $fco >>= 16; # it should now be 1 or 0 + die "fco not 1 or 0" if $fco > 1; + + my $chIn5 = TrieNodeGetLetter($node); + my $bits = $chIn5; + + if ( TrieNodeGetIsLastSibling($node) ) { + $bits |= 0x40; + } + if ( TrieNodeGetIsTerminal($node) ) { + $bits |= 0x20; + } + if ( $fco != 0 ) { + $bits |= 0x80; + } + print $outfile pack( "C", $bits ); + + # the final byte, if in use + if ( $nBytes == 4 ) { + print $outfile pack( "C", $fourthByte ); + } +} # outputNode + +sub usage { + print STDERR "usage: $0 \n" + . "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n" + . "\t-m mapFile\n" + . "\t-ob outFileBase\n" + . "\t-sn start node out file\n" + . "\t[-if input file name] -- default = stdin\n" + . "\t[-term ch] (word terminator -- default = '\\0'\n" + . "\t[-nosort] (input already sorted in accord with -m; " . + " default=sort'\n" + . "\t[-dump] (write dictionary as text to STDERR for testing)\n" + . "\t[-force4](use 4 bytes per node regardless of need)\n" + . "\t[-r] (reject words with letters not in mapfile)\n" + . "\t[-k] (kill if any letters no in mapfile -- default)\n" + ; + +} # usage + +sub parseARGV { + + my $arg; + while ( my $arg = shift(@ARGV) ) { + + SWITCH: { + if ($arg =~ /-b/) {$gNBytesPerOutfile = shift(@ARGV), last SWITCH;} + if ($arg =~ /-mn/) {$gTableFile = shift(@ARGV); + $gUseUnicode = 1; + last SWITCH;} + if ($arg =~ /-m/) {$gTableFile = shift(@ARGV); last SWITCH;} + if ($arg =~ /-ob/) {$gOutFileBase = shift(@ARGV), last SWITCH;} + if ($arg =~ /-sn/) {$gStartNodeOut = shift(@ARGV), last SWITCH;} + if ($arg =~ /-if/) {$gInFileName = shift(@ARGV), last SWITCH;} + if ($arg =~ /-r/) {$gKillIfMissing = 0; last SWITCH;} + if ($arg =~ /-k/) {$gKillIfMissing = 1; last SWITCH;} + if ($arg =~ /-term/) {$gTermChar = chr(shift(@ARGV)); last SWITCH;} + if ($arg =~ /-dump/) {$gDumpText = 1; last SWITCH;} + if ($arg =~ /-nosort/) {$gNeedsSort = 0; last SWITCH;} + if ($arg =~ /-wc/) {$gCountFile = shift(@ARGV); last SWITCH;} + if ($arg =~ /-ns/) {$gBytesPerNodeFile = shift(@ARGV); last SWITCH;} + if ($arg =~ /-force4/) {$gForceFour = 1; last SWITCH;} + die "unexpected arg $arg\n"; + } + } + + + print STDERR "gNBytesPerOutfile=$gNBytesPerOutfile\n" if $debug; + print STDERR "gTableFile=$gTableFile\n" if $debug; + print STDERR "gOutFileBase=$gOutFileBase\n" if $debug; + print STDERR "gStartNodeOut=$gStartNodeOut\n" if $debug; + printf STDERR "gTermChar=%s(%d)\n", $gTermChar, ord($gTermChar) if $debug; + + return $gTableFile; + +} # parseARGV diff --git a/dawg/frank_mkspecials.pl b/dawg/frank_mkspecials.pl new file mode 100755 index 000000000..353727a74 --- /dev/null +++ b/dawg/frank_mkspecials.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# Copyright 2001 by Eric House (fixin@peak.org) +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Given arguments consisting of triples, first a string and then pbitm +# files representing bitmaps. For each triple, print out the string and +# then the converted bitmaps. + +use strict; + +while ( @ARGV ) { + my $str = shift(); + my $largebmp = shift(); + my $smallbmp = shift(); + + doOne( $str, $largebmp, $smallbmp ); +} + +sub doOne { + my ( $str, $largebmp, $smallbmp ) = @_; + + print pack( "C", length($str) ); + print $str; + + print STDERR "looking at $largebmp", "\n"; + + print `cat $largebmp | ../pbitm2bin.pl`; + print `cat $smallbmp | ../pbitm2bin.pl`; +} + + diff --git a/dawg/palm_mkspecials.pl b/dawg/palm_mkspecials.pl new file mode 100755 index 000000000..d30f413d5 --- /dev/null +++ b/dawg/palm_mkspecials.pl @@ -0,0 +1,111 @@ +#!/usr/bin/perl + +# Copyright 2002 by Eric House +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Given arguments consisting of triples, first a string and then pbitm +# files representing bitmaps. The format looks like this: + +# array [0-n] of { char len; +# char[3] alt txt; +# int16 offsetOfLarge; +# int16 offsetOfSmall; +# } +# array [0-n] of { +# bitmapLargeIfPresent; +# bitmapSmallIfPresent; +# } +# +# In addition, there's padding between bitmaps if needed to get the next +# one to a 2-byte boundary. And the input files are not in PalmOS bitmap +# format, so thay have to get converted into a tmp file before the sizes +# can be known and included in the eventual output. + +use strict; + +my $tmpfile = "/tmp/tmpout$$"; + +my $nSpecials = @ARGV / 3; +die "wrong number of args" if (@ARGV % 3) != 0; +my $gOffset = $nSpecials * 8; # sizeof(Xloc_specialEntry) + +open TMPFILE, "> $tmpfile"; + +for ( my $i = 0; $i < $nSpecials; ++$i ) { + + my $size; + + my $str = shift( @ARGV ); + my $len = length($str); + die "string $str too long" if $len > 3; + print $str; + while ( $len < 4 ) { + ++$len; + print pack("c", 0 ); + } + + doOneFile( shift( @ARGV ), \*TMPFILE, \$gOffset ); + doOneFile( shift( @ARGV ), \*TMPFILE, \$gOffset ); +} + +close TMPFILE; + +# now append the tempfile +open TMPFILE, "< $tmpfile"; +while ( read( TMPFILE, my $buffer, 128 ) ) { + print $buffer; +} +close TMPFILE; + +unlink $tmpfile; + +exit 0; + + +sub doOneFile($$) { + my ( $fil, $fh, $offsetR ) = @_; + + my $size = convertBmp($fil, $fh ); + if ( ($size % 2) != 0 ) { + ++$size; + print $fh pack( "c", 0 ); + } + + print pack( "n", $size > 0? ${$offsetR} : 0 ); + + ${$offsetR} += $size; +} # doOneFile + +sub convertBmp($$) { + my ( $pbitmfile, $fhandle ) = @_; + + if ( $pbitmfile eq "/dev/null" ) { + return 0; + } else { + + # for some reason I can't get quote marks to print into tmp.rcp using just `echo` + open TMP, "> tmp.rcp"; + print TMP "BITMAP ID 1000 \"$pbitmfile\" AUTOCOMPRESS"; + close TMP; + + `pilrc tmp.rcp`; + print $fhandle `cat Tbmp03e8.bin`; + my $siz = -s "Tbmp03e8.bin"; + `rm -f tmp.rcp Tbmp03e8.bin`; + + return $siz; + } +} diff --git a/dawg/par.pl b/dawg/par.pl new file mode 100755 index 000000000..11ca565d2 --- /dev/null +++ b/dawg/par.pl @@ -0,0 +1,234 @@ +#!/usr/bin/perl + +# Copyright 2002 by Eric House (fixin@peak.org) All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Only enough of par's features to support building a crosswords dict +# pdb + +use strict; + +my $debug = 0; + + +# stolen from par source +my $PRC_FLAGS_RESOURCE = (0x1<<0); +my $PRC_FLAGS_READONLY = (0x1<<1); +my $PRC_FLAGS_DIRTY = (0x1<<2); +my $PRC_FLAGS_BACKUP = (0x1<<3); +my $PRC_FLAGS_NEWER = (0x1<<4); +my $PRC_FLAGS_RESET = (0x1<<5); +my $PRC_FLAGS_COPYPREVENT = (0x1<<6); +my $PRC_FLAGS_STREAM = (0x1<<7); +my $PRC_FLAGS_HIDDEN = (0x1<<8); +my $PRC_FLAGS_LAUNCHABLE = (0x1<<9); +my $PRC_FLAGS_RECYCLABLE = (0x1<<10); +my $PRC_FLAGS_BUNDLE = (0x1<<11); +my $PRC_FLAGS_OPEN = (0x1<<15); + + +my $gAttrs = 0; +my $gVersion = 1; # par defaults this to 1 + +my $cmd = shift( @ARGV ); +die "only 'c' supported now" if $cmd ne "c" && $cmd ne "-c"; + +readHOptions( \@ARGV ); + +my $dbfile = shift( @ARGV ); +my $name = shift( @ARGV ); +die "name $name too long" if length($name) > 31; +my $type = shift( @ARGV ); +die "type $type must be of length 4" if length($type) != 4; +my $cid = shift( @ARGV ); +die "cid $cid must be of length 4" if length($cid) != 4; + +my @fileNames; +my @fileLengths; + +my $nFiles = 0; + +while ( @ARGV > 0 ) { + my $filename = shift( @ARGV ); + push @fileNames, $filename; + push @fileLengths, -s $filename; + ++$nFiles; +} + +# from par's prcp.h; thanks djw! +# typedef struct prc_file_t { +# prc_byte_t name[32]; +# prc_byte_t flags[2]; +# prc_byte_t version[2]; +# prc_byte_t ctime[4]; +# prc_byte_t mtime[4]; +# prc_byte_t btime[4]; +# prc_byte_t modnum[4]; +# prc_byte_t appinfo[4]; +# prc_byte_t sortinfo[4]; +# prc_byte_t type[4]; +# prc_byte_t cid[4]; +# prc_byte_t unique_id_seed[4]; +# prc_byte_t next_record_list[4]; +# prc_byte_t nrecords[2]; +# } prc_file_t; + +my $str; +my $offset = 0; + +open OUTFILE, "> $dbfile" or die "couldn't open outfile $dbfile for writing"; + +# print the string, then pad with 0s +$offset = length($name); +print OUTFILE $name; +while ( $offset < 32 ) { + print OUTFILE pack("c", 0); + ++$offset; +} + +$str = pack("n", $gAttrs); # flags +print OUTFILE $str; +$offset += length($str); + +$str = pack("n", $gVersion); # version +print OUTFILE $str; +$offset += length($str); + +my $time = time() + 2082844800; +$str = pack("NNN", $time, $time, 0); # ctime, mtime, btime +print OUTFILE $str; +$offset += length($str); + +$str = pack("N", 0 ); # mod num +print OUTFILE $str; +$offset += length($str); + +$str = pack("N", 0 ); # appinfo +print OUTFILE $str; +$offset += length($str); + +$str = pack("N", 0 ); # sortinfo +print OUTFILE $str; +$offset += length($str); + + +print OUTFILE $type; # type +print OUTFILE $cid; # cid +$offset += 8; + +$str = pack("NN", 0, 0 ); # unique_id_seed, next_record_list +print OUTFILE $str; +$offset += length($str); + +$str = pack("n", $nFiles ); # nrecords +print OUTFILE $str; +$offset += length($str); + +$offset += $nFiles * 8; +$offset += 2; # djw adds 2 bytes after size list; see below +foreach my $len ( @fileLengths ) { + print OUTFILE pack( "N", $offset ); + print OUTFILE pack( "N", 0 ); + $offset += $len; +} + +print OUTFILE pack( "n", 0 ); # djw does this sans comment: flush.c, line 87 + +foreach my $file ( @fileNames ) { + open INFILE, "<$file" or die "couldn't open infile $file\n"; + my $buffer; + while ( read INFILE, $buffer, 1024 ) { + print OUTFILE $buffer; + } + close INFILE; +} + + +close OUTFILE; + +exit 0; + +############################################################################## +# Subroutines +############################################################################## + +sub readHOptions { + + my ( $argvR ) = @_; + + for ( ; ; ) { + my $opt = ${$argvR}[0]; + + if ( $opt !~ /^-/ ) { + last; + } + + # it starts with a '-': use it; else don't consume anything + shift @{$argvR}; + + if ( $opt eq "-a" ) { + my $attrs = shift @{$argvR}; + processAttrString( $attrs ); + } elsif ( $opt eq "-v" ) { + $gVersion = shift @{$argvR}; + } else { + die "what's with \"$opt\": -a and -v are the only hattrs supported"; + } + } + +} # readHOptions + +sub processAttrString { + + my ( $attrs ) = @_; + + foreach my $flag ( split /\|/, $attrs ) { + + print STDERR "looking at flag $flag\n" if $debug; + + if ( $flag =~ /resource/ ) { + $gAttrs |= $PRC_FLAGS_RESOURCE; + die "resource attr not supported"; + } elsif ( $flag =~ /readonly/ ) { + $gAttrs |= $PRC_FLAGS_READONLY; + } elsif ( $flag =~ /dirty/ ) { + $gAttrs |= $PRC_FLAGS_DIRTY; + } elsif ( $flag =~ /backup/ ) { + $gAttrs |= $PRC_FLAGS_BACKUP; + } elsif ( $flag =~ /newer/ ) { + $gAttrs |= $PRC_FLAGS_NEWER; + } elsif ( $flag =~ /reset/ ) { + $gAttrs |= $PRC_FLAGS_RESET; + } elsif ( $flag =~ /copyprevent/ ) { + $gAttrs |= $PRC_FLAGS_COPYPREVENT; + } elsif ( $flag =~ /stream/ ) { + $gAttrs |= $PRC_FLAGS_STREAM; + die "stream attr not supported"; + } elsif ( $flag =~ /hidden/ ) { + $gAttrs |= $PRC_FLAGS_HIDDEN; + } elsif ( $flag =~ /launchable/ ) { + $gAttrs |= $PRC_FLAGS_LAUNCHABLE; + } elsif ( $flag =~ /recyclable/ ) { + $gAttrs |= $PRC_FLAGS_RECYCLABLE; + } elsif ( $flag =~ /bundle/ ) { + $gAttrs |= $PRC_FLAGS_BUNDLE; + } elsif ( $flag =~ /open/ ) { + $gAttrs |= $PRC_FLAGS_OPEN; + } else { + die "flag $flag not supportd"; + } + } +} # processAttrString diff --git a/dawg/xloc.pl b/dawg/xloc.pl new file mode 100755 index 000000000..d811b51a1 --- /dev/null +++ b/dawg/xloc.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +# Copyright 2002 by Eric House (fixin@peak.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# test and wrapper file for xloc.pm + +use strict; +use xloc; + + + +my $arg = shift(@ARGV); +my $lang = shift(@ARGV); +my $path = "./$lang"; +my $infoFile = "$path/info.txt"; + +die "info file $infoFile not found\n" if ! -s $infoFile; + + +my $xlocToken = xloc::ParseTileInfo($infoFile); + +if ( $arg eq "-t" ) { + xloc::WriteMapFile( $xlocToken, 0, \*STDOUT ); +} elsif ( $arg eq "-tn" ) { + xloc::WriteMapFile( $xlocToken, 1, \*STDOUT ); +} elsif ( $arg eq "-v" ) { + xloc::WriteValuesFile( $xlocToken, \*STDOUT ); +} diff --git a/dawg/xloc.pm b/dawg/xloc.pm new file mode 100644 index 000000000..729af26e4 --- /dev/null +++ b/dawg/xloc.pm @@ -0,0 +1,180 @@ +#!/usr/bin/perl + +# Copyright 2002 by Eric House (fixin@peak.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# The idea here is that all that matters about a language is stored in +# one file (possibly excepting rules for prepping a dictionary). +# There's a list of tile faces, counts and values, and also some +# name-value pairs as needed. The pairs come first, and then a list +# of tiles. + +package xloc; + +use strict; +use warnings; + +BEGIN { + use Exporter (); + our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); + + $VERSION = 1.00; + + @ISA = qw(Exporter); + @EXPORT = qw(&ParseTileInfo &GetNTiles &TileFace &TileValue + &TileCount &GetValue &WriteMapFile &WriteValuesFile); + %EXPORT_TAGS = ( ); +} + +# Returns what's meant to be an opaque object that can be passed back +# for queries. It's a hash with name-value pairs and an _INFO entry +# containing a list of tile info lists. + +sub ParseTileInfo($) { + my ( $filePath ) = @_; + my %result; + + open INPUT, "<$filePath" or die "couldn't open $filePath"; + + my $inTiles = 0; + my @tiles; + while ( ) { + + chomp; + s/\#.*$//; + s/^\s*$//; # nuke all-white-space lines + next if !length; + + if ( $inTiles ) { + if ( // ) { + last; + } else { + my ( $count, $val, $face ) = m/^\s*(\w+)\s+(\w+)\s+(.*)\s*$/; + push @tiles, [ $count, $val, $face ]; + } + } elsif ( /\w:/ ) { + my ( $nam, $val ) = split ':', $_, 2; + $result{$nam} .= $val; + } elsif ( // ) { + $inTiles = 1; + } + + } + + close INPUT; + + $result{"_TILES"} = [ @tiles ]; + + return \%result; +} + +sub GetNTiles($) { + my ( $hashR ) = @_; + + my $listR = ${$hashR}{"_TILES"}; + + return 0 + @{$listR}; +} + +sub GetValue($$) { + my ( $hashR, $name ) = @_; + return ${$hashR}{$name}; +} + +sub WriteMapFile($$$) { + my ( $hashR, $unicode, $fhr ) = @_; + + my $packStr; + if ( $unicode ) { + $packStr = "n"; + } else { + $packStr = "C"; + } + + my $count = GetNTiles($hashR); + my $specialCount = 0; + for ( my $i = 0; $i < $count; ++$i ) { + my $tileR = GetNthTile( $hashR, $i ); + my $str = ${$tileR}[2]; + + if ( $str =~ /\'(.)\'/ ) { + print $fhr pack($packStr, ord($1) ); + } elsif ( $str =~ /\"(.+)\"/ ) { + print $fhr pack($packStr, $specialCount++ ); + } elsif ( $str =~ /(\d+)/ ) { + print $fhr pack( $packStr, chr($1) ); + } else { + die "WriteMapFile: unrecognized face format $str"; + } + } +} # WriteMapFile + +sub WriteValuesFile($$) { + my ( $hashR, $fhr ) = @_; + + my $header = GetValue( $hashR,"XLOC_HEADER" ); + die "no XLOC_HEADER found" if ! $header; + + print STDERR "header is $header\n"; + + print $fhr pack( "n", hex($header) ); + + my $count = GetNTiles($hashR); + for ( my $i = 0; $i < $count; ++$i ) { + my $tileR = GetNthTile( $hashR, $i ); + + print $fhr pack( "c", TileValue($tileR) ); + print $fhr pack( "c", TileCount($tileR) ); + } + +} # WriteValuesFile + +sub GetNthTile($$) { + my ( $hashR, $n ) = @_; + my $listR = ${$hashR}{"_TILES"}; + + return ${$listR}[$n]; +} + +sub TileFace($) { + my ( $tileR ) = @_; + + my $str = ${$tileR}[2]; + + if ( $str =~ /\'(.)\'/ ) { + return $1; + } elsif ( $str =~ /\"(.+)\"/ ) { + return $1; + } elsif ( $str =~ /(\d+)/ ) { + return chr($1); + } else { + die "TileFace: unrecognized face format: $str"; + } +} + +sub TileValue($) { + my ( $tileR ) = @_; + + return ${$tileR}[0]; +} + +sub TileCount($) { + my ( $tileR ) = @_; + + return ${$tileR}[1]; +} + +1;