mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-01-04 23:02:02 +01:00
first cut at python version of dawg2dict
Perl version doesn't work and I don't remember enough of the language to fix it.
This commit is contained in:
parent
c5a8319fa8
commit
58c3ab4e4a
1 changed files with 250 additions and 0 deletions
250
xwords4/dawg/dawg2dict.py
Executable file
250
xwords4/dawg/dawg2dict.py
Executable file
|
@ -0,0 +1,250 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse, io, struct, sys
|
||||
|
||||
oneByteFmt = struct.Struct('B')
|
||||
SPACE = ' '
|
||||
|
||||
def getNullTermParam(fh):
|
||||
msg = ""
|
||||
while True:
|
||||
(oneChar) = oneByteFmt.unpack(fh.read(oneByteFmt.size))
|
||||
if int(oneChar[0]) == 0: break
|
||||
msg += chr(oneChar[0])
|
||||
return msg
|
||||
|
||||
def addFace( faces, face ):
|
||||
assert face
|
||||
faces.append( face )
|
||||
# print( 'addFace(): added:', face, ' now have', len(faces), 'faces' )
|
||||
|
||||
# Each face is one or two synonym strings (typically the upper- and
|
||||
# lower-case versions of a tile face), with a space as separator in the
|
||||
# two-string case.
|
||||
#
|
||||
# Each letter read is a space/delimiter or not. If it's a delimiter,
|
||||
# then we append what we had before. Otherwise if we have something
|
||||
# already then this is a new face starting. Otherwise it's a synonym
|
||||
# for something we're processing already.
|
||||
|
||||
def splitFaces( buf ):
|
||||
faces = []
|
||||
synonyms = None
|
||||
lastWasDelim = False
|
||||
for oneChar in buf:
|
||||
# print('read char', oneChar)
|
||||
|
||||
if oneChar == SPACE:
|
||||
assert synonyms # there's better be one already
|
||||
lastWasDelim = True
|
||||
else:
|
||||
# print( "read non-delim char:", oneChar )
|
||||
if lastWasDelim:
|
||||
assert len(synonyms) == 1
|
||||
synonyms.append( oneChar )
|
||||
addFace( faces, synonyms )
|
||||
synonyms = None
|
||||
lastWasDelim = False
|
||||
else:
|
||||
if synonyms:
|
||||
addFace( faces, synonyms )
|
||||
synonyms = [ oneChar ]
|
||||
|
||||
if synonyms: addFace( faces, synonyms )
|
||||
|
||||
return faces
|
||||
|
||||
def loadCountsAndValues( fh, numFaces, data ):
|
||||
twoBytesFmt = struct.Struct('BB')
|
||||
for ii in range(numFaces):
|
||||
pair = twoBytesFmt.unpack(fh.read(twoBytesFmt.size))
|
||||
data[ii]['count'] = int(pair[0])
|
||||
data[ii]['val'] = int(pair[1])
|
||||
|
||||
def eatBitmap( fh ):
|
||||
nCols = int(oneByteFmt.unpack(fh.read(oneByteFmt.size))[0])
|
||||
if nCols > 0:
|
||||
nRows = int(oneByteFmt.unpack(fh.read(oneByteFmt.size))[0])
|
||||
nBytes = ((nRows*nCols)+7) // 8
|
||||
print('eatBitmap(): skipping', nBytes, 'bytes; nCols:', nCols, 'nRows:', nRows)
|
||||
fh.read(nBytes)
|
||||
|
||||
def loadSpecialData( fh, data ):
|
||||
count = 0
|
||||
lastSpecial = ord(' ')
|
||||
for datum in data:
|
||||
# print('loadSpecialData: comparing', ord(datum['faces'][0]), 'with', lastSpecial)
|
||||
if len(datum['faces']) == 1 and ord(datum['faces'][0]) < lastSpecial:
|
||||
count += 1
|
||||
txtlen = int(oneByteFmt.unpack(fh.read(oneByteFmt.size))[0])
|
||||
txt = fh.read(txtlen).decode("UTF-8")
|
||||
# print('loadSpecialData(): found:', txt, 'of len', txtlen)
|
||||
datum['faces'] = txt.split( SPACE )
|
||||
eatBitmap( fh )
|
||||
eatBitmap( fh )
|
||||
|
||||
def loadNodes( dawg, nodeSize ):
|
||||
nodes = []
|
||||
fmtStr = ''
|
||||
for ii in range(nodeSize): fmtStr += 'B'
|
||||
fmt = struct.Struct(fmtStr)
|
||||
|
||||
while True:
|
||||
buf = dawg.read(nodeSize)
|
||||
if len(buf) == 0: break
|
||||
assert len(buf) == nodeSize
|
||||
arr = fmt.unpack(buf)
|
||||
val = 0
|
||||
for elem in arr:
|
||||
val = (val << 8) + elem
|
||||
nodes.append(val)
|
||||
# print('loaded node 0x{:x} of len {}'.format(val, nodeSize))
|
||||
return nodes
|
||||
|
||||
def parseNode( node, nodeSize ):
|
||||
|
||||
if nodeSize == 4:
|
||||
accepting = (node & 0x00008000) != 0
|
||||
isLast = (node & 0x00004000) != 0
|
||||
chrIndex = (node & 0x00003f00) >> 8
|
||||
nextEdge = (node >> 16) + ((node & 0x000000FF) << 16)
|
||||
elif nodeSize == 3:
|
||||
accepting = (node & 0x00000080) != 0
|
||||
isLast = (node & 0x00000040) != 0
|
||||
chrIndex = node & 0x0000001f
|
||||
nextEdge = (node >> 8) + ((node & 0x00000020) << 11)
|
||||
|
||||
return (nextEdge, chrIndex, accepting, isLast )
|
||||
|
||||
def expandDAWG( nodes, nodeSize, indx, data, words, letters = [] ):
|
||||
if len(letters) > 15: error( "infinite recursion???" )
|
||||
|
||||
while True:
|
||||
node = nodes[indx]
|
||||
indx += 1
|
||||
( nextEdge, chrIndex, accepting, lastEdge ) = parseNode( node, nodeSize )
|
||||
|
||||
letters.append( data[chrIndex]['faces'][0] )
|
||||
if accepting:
|
||||
words.append( ''.join(letters) )
|
||||
|
||||
if nextEdge != 0:
|
||||
expandDAWG( nodes, nodeSize, nextEdge, data, words, letters )
|
||||
|
||||
letters.pop()
|
||||
|
||||
if lastEdge: break
|
||||
|
||||
def process(args):
|
||||
DICT_SYNONYMS_MASK = 0x10
|
||||
DICT_HEADER_MASK = 0x08
|
||||
|
||||
with open(args.DAWG, "rb") as dawg:
|
||||
nWords = 0
|
||||
|
||||
headerFmt = struct.Struct('!HH')
|
||||
(flags, headerLen) = headerFmt.unpack(dawg.read(headerFmt.size))
|
||||
print( 'read flags: {:x}, header len: {}'.format(flags, headerLen ),
|
||||
file=sys.stderr )
|
||||
if not 0 == DICT_HEADER_MASK & flags:
|
||||
flags &= ~DICT_HEADER_MASK
|
||||
header = io.BytesIO(dawg.read(headerLen))
|
||||
nWordsFmt = struct.Struct('!L')
|
||||
nWords = nWordsFmt.unpack(header.read(nWordsFmt.size))[0]
|
||||
|
||||
print( 'header: read nWords: {}'.format(nWords ), file=sys.stderr )
|
||||
|
||||
msg = getNullTermParam(header)
|
||||
if args.DUMP_MSG:
|
||||
print( 'msg: {}'.format(msg))
|
||||
md5Sum = getNullTermParam(header)
|
||||
print( 'header: read sum: {}'.format(md5Sum), file=sys.stderr )
|
||||
|
||||
nodeSize = 0
|
||||
isUTF8 = False
|
||||
flags &= ~DICT_SYNONYMS_MASK
|
||||
if flags == 0x0002:
|
||||
nodeSize = 3
|
||||
elif flags == 0x0003:
|
||||
nodeSize = 4
|
||||
elif flags == 0x0004:
|
||||
isUTF8 = True
|
||||
nodeSize = 3
|
||||
elif flags == 0x0005:
|
||||
isUTF8 = True
|
||||
nodeSize = 4
|
||||
else:
|
||||
error("unexpected flags value")
|
||||
print( 'nodesize: {}, isUTF8: {}'.format(nodeSize, isUTF8), file=sys.stderr )
|
||||
|
||||
numFaceBytes = 0
|
||||
if isUTF8:
|
||||
numFaceBytes = oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0]
|
||||
numFaces = int(oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0])
|
||||
if not isUTF8:
|
||||
numFaceBytes = numFaces * 2
|
||||
if numFaces > 64:
|
||||
error("too many faces: " + numFaces)
|
||||
print( 'numFaceBytes: {}, numFaces: {}'.format(numFaceBytes, numFaces), file=sys.stderr )
|
||||
|
||||
print( 'TODO: confirm checksum', file=sys.stderr )
|
||||
|
||||
data = []
|
||||
if isUTF8:
|
||||
faceBytes = dawg.read(numFaceBytes).decode("UTF-8")
|
||||
faces = splitFaces( faceBytes )
|
||||
assert( len(faces) == numFaces )
|
||||
# print( 'loaded', len(faces), 'faces:', faces )
|
||||
for datum in faces:
|
||||
data.append({'faces' : datum })
|
||||
else:
|
||||
error('I don\'t handle obsolete ascii case')
|
||||
|
||||
langCode = 0x7F & oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0]
|
||||
dawg.read( oneByteFmt.size ) # skip byte
|
||||
|
||||
loadCountsAndValues( dawg, numFaces, data )
|
||||
loadSpecialData( dawg, data )
|
||||
|
||||
offsetStruct = struct.Struct('!L')
|
||||
assert offsetStruct.size == 4
|
||||
offset = int(offsetStruct.unpack(dawg.read(offsetStruct.size))[0])
|
||||
|
||||
if args.DUMP_TILES:
|
||||
for ii in range(len(data)):
|
||||
print( 'tile {:2d}: {}:'.format(ii, data[ii]) )
|
||||
|
||||
nodes = loadNodes( dawg, nodeSize )
|
||||
words = []
|
||||
expandDAWG( nodes, nodeSize, offset, data, words )
|
||||
assert len(words) == nWords
|
||||
if args.DUMP_WORDS:
|
||||
for word in words:
|
||||
print(word)
|
||||
|
||||
def mkParser():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--dict', dest = 'DAWG', type = str, required = True,
|
||||
help = 'the .xwd file to load')
|
||||
parser.add_argument('--dump-words', dest = 'DUMP_WORDS', default = False,
|
||||
action = 'store_true', help = 'write wordlist to stdout')
|
||||
parser.add_argument('--dump-tiles', dest = 'DUMP_TILES', default = False,
|
||||
action = 'store_true', help = 'write tile metadata to stdout')
|
||||
parser.add_argument('--dump-msg', dest = 'DUMP_MSG', default = False,
|
||||
action = 'store_true', help = 'write header user-visible message to stdout')
|
||||
|
||||
|
||||
# [-raw | -json] [-get-sum] [-get-desc] -dict <xwdORpdb>
|
||||
|
||||
return parser
|
||||
|
||||
def parseArgs():
|
||||
args = mkParser().parse_args()
|
||||
process( args )
|
||||
|
||||
def main():
|
||||
args = parseArgs()
|
||||
|
||||
##############################################################################
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Reference in a new issue