mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-01-03 23:04:08 +01:00
300 lines
11 KiB
Python
Executable file
300 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
import argparse, io, struct, sys
|
|
|
|
# Pre-compiled struct format used throughout to read one unsigned byte.
oneByteFmt = struct.Struct('B')
# Delimiter between the synonym strings that make up a tile face.
SPACE = ' '
|
|
|
|
def getNullTermParam(fh):
    """Read one NUL-terminated string from the binary stream fh.

    Bytes are consumed one at a time up to and including the terminating
    zero byte, which is not part of the result. The collected bytes are
    decoded as UTF-8 in a single step so that multi-byte characters come
    out intact (decoding byte-by-byte with chr() would garble them);
    undecodable bytes are replaced rather than raising.

    Args:
        fh: binary file-like object positioned at the start of the string.

    Returns:
        The decoded string, without the terminator ('' if the first byte
        read is the terminator).

    Raises:
        struct.error: if the stream ends before a zero byte is found.
            process() relies on an exception here to detect older,
            shorter headers.
    """
    raw = bytearray()
    while True:
        # unpack() raises struct.error when read() returns nothing (EOF).
        (oneByte,) = struct.unpack('B', fh.read(1))
        if oneByte == 0:
            break
        raw.append(oneByte)
    return raw.decode('UTF-8', errors='replace')
|
|
|
|
def addFace( faces, face ):
    """Append face to the faces list in place.

    face must be truthy (a non-empty list of synonyms); an empty or None
    face trips the assertion. Returns None.
    """
    assert face
    faces.append( face )
    # print( 'addFace(): added:', face, ' now have', len(faces), 'faces' )
|
|
|
|
# Each face is one or two synonym strings (typically the upper- and
# lower-case versions of a tile face), with a space as separator in the
# two-string case.
#
# Each character read is either the space delimiter or not. A delimiter
# means the next character is a synonym for the face currently being
# built, and that synonym completes the face. Any other character starts
# a new face, after first storing the pending one-string face, if any.
|
|
|
|
def splitFaces( buf ):
    """Split the decoded face string buf into a list of faces.

    Each returned face is a list of one or two single-character strings.
    A character that directly follows a SPACE delimiter is stored as the
    synonym of the face before the delimiter; every other character
    begins a new face.

    Asserts that a delimiter is preceded by a pending face and that the
    pending face has exactly one entry when its synonym arrives.
    """
    result = []
    pending = None          # face being built, not yet stored
    afterDelim = False      # True when the previous char was SPACE

    for ch in buf:
        if ch == SPACE:
            # A delimiter only makes sense after the first synonym.
            assert pending
            afterDelim = True
            continue

        if not afterDelim:
            # A new face starts here; store the previous one first.
            if pending:
                addFace( result, pending )
            pending = [ ch ]
            continue

        # ch is the second synonym of the pending face, which is now done.
        assert len(pending) == 1
        pending.append( ch )
        addFace( result, pending )
        pending, afterDelim = None, False

    # Store a trailing face that never got a synonym.
    if pending:
        addFace( result, pending )

    return result
|
|
|
|
def loadCountsAndValues( fh, numFaces, extraData, data ):
    """Fill in the 'counts' and 'val' entries of the first numFaces tiles.

    For each tile two bytes are read from fh: a count, stored as
    data[i]['counts'][15], and a value, stored as data[i]['val'].

    If extraData (a bytes object) is non-empty it holds further count
    tables: a one-byte key followed by numFaces one-byte counts, repeated
    until a zero key or the end of the data. Each count is stored as
    data[i]['counts'][key]. (The keys are presumably board sizes, with 15
    the default -- confirm against the file-format documentation.)

    data is modified in place; nothing is returned.
    """
    def readByte( stream ):
        # An empty read (EOF) yields 0, which also ends the extra tables.
        return int.from_bytes(stream.read(1), 'little')

    for idx in range(numFaces):
        baseCount = readByte( fh )
        data[idx]['counts'] = {15: baseCount}
        data[idx]['val'] = readByte( fh )

    if not extraData:
        return

    extra = io.BytesIO(extraData)
    key = readByte( extra )
    while key:
        for idx in range(numFaces):
            tileCount = readByte( extra )
            data[idx]['counts'][key] = tileCount
        key = readByte( extra )
|
|
|
|
def eatBitmap( fh ):
    """Skip over one bitmap record in the binary stream fh.

    The record starts with a one-byte column count. A zero count means
    there is no bitmap and nothing more is read. Otherwise a one-byte row
    count follows, then the pixel data at one bit per cell, rounded up to
    whole bytes. The skipped size is logged to stderr.
    """
    (nCols,) = oneByteFmt.unpack(fh.read(oneByteFmt.size))
    if nCols <= 0:
        return

    (nRows,) = oneByteFmt.unpack(fh.read(oneByteFmt.size))
    # One bit per cell, rounded up to a full byte.
    nBytes = (nRows * nCols + 7) // 8
    logLine = 'eatBitmap(): skipping {} bytes; nCols: {}, nRows: {}:'.format(nBytes,nCols, nRows)
    print(logLine, file=sys.stderr)
    fh.read(nBytes)
|
|
|
|
def loadSpecialData( fh, data ):
    """Replace placeholder faces of special tiles with their real text.

    A tile counts as special when it has a single face whose code point
    is below that of a space. For each such tile, in order, the stream
    holds a one-byte length, that many bytes of UTF-8 text, and two
    bitmap records. The text, split on SPACE, becomes the tile's
    'faces' list; the bitmaps are skipped.

    data is modified in place; nothing is returned.
    """
    lastSpecial = ord(' ')
    for tile in data:
        tileFaces = tile['faces']
        # print('loadSpecialData: comparing', ord(tileFaces[0]), 'with', lastSpecial)
        if len(tileFaces) != 1 or ord(tileFaces[0]) >= lastSpecial:
            continue

        (txtLen,) = oneByteFmt.unpack(fh.read(oneByteFmt.size))
        text = fh.read(txtLen).decode("UTF-8")
        # print('loadSpecialData(): found:', text, 'of len', txtLen)
        tile['faces'] = text.split( SPACE )
        # Two bitmap records follow each special's text; neither is used.
        eatBitmap( fh )
        eatBitmap( fh )
|
|
|
|
def loadNodes( dawg, nodeSize ):
    """Read the rest of the stream dawg as fixed-size DAWG nodes.

    Every nodeSize bytes are combined, most significant byte first, into
    one integer. Reading stops at end of stream; a trailing partial node
    trips the assertion.

    Returns:
        The list of node integers, in file order.
    """
    nodes = []
    chunk = dawg.read(nodeSize)
    while len(chunk) != 0:
        assert len(chunk) == nodeSize
        nodes.append(int.from_bytes(chunk, 'big'))
        # print('loaded node 0x{:x} of len {}'.format(nodes[-1], nodeSize))
        chunk = dawg.read(nodeSize)
    return nodes
|
|
|
|
def parseNode( node, nodeSize ):
    """Decode one DAWG node integer into its fields.

    Layout for nodeSize 4 (32-bit node):
        bit 15 (0x8000)    accepting flag
        bit 14 (0x4000)    last-edge flag
        bits 8-13 (0x3f00) tile index
        next edge          upper 16 bits of the node form its low 16 bits,
                           the low byte of the node forms bits 16-23

    Layout for nodeSize 3 (24-bit node):
        bit 7 (0x80)       accepting flag
        bit 6 (0x40)       last-edge flag
        bits 0-4 (0x1f)    tile index
        next edge          upper 16 bits of the node form its low 16 bits,
                           bit 5 (0x20) of the node forms bit 16

    Args:
        node: node value as an integer.
        nodeSize: size of a node in bytes; must be 3 or 4.

    Returns:
        Tuple (nextEdge, chrIndex, accepting, isLast).

    Raises:
        ValueError: if nodeSize is neither 3 nor 4.
    """
    if nodeSize == 4:
        accepting = (node & 0x00008000) != 0
        isLast = (node & 0x00004000) != 0
        chrIndex = (node & 0x00003f00) >> 8
        nextEdge = (node >> 16) + ((node & 0x000000FF) << 16)
    elif nodeSize == 3:
        accepting = (node & 0x00000080) != 0
        isLast = (node & 0x00000040) != 0
        chrIndex = node & 0x0000001f
        nextEdge = (node >> 8) + ((node & 0x00000020) << 11)
    else:
        # Without this branch the return below would fail with an opaque
        # UnboundLocalError.
        raise ValueError( 'unsupported node size: {}'.format(nodeSize) )

    return (nextEdge, chrIndex, accepting, isLast )
|
|
|
|
def expandDAWG( nodes, nodeSize, delim, indx, data, words, letters = None ):
    """Walk the DAWG depth-first from node index indx, collecting words.

    Starting at indx, sibling edges are visited in order until one is
    flagged as the last edge. Each edge contributes the first face of its
    tile to the current prefix. When an edge is accepting, the prefix,
    joined with delim, is appended to words. A non-zero next-edge index
    is followed recursively.

    Args:
        nodes: list of node integers, as produced by loadNodes().
        nodeSize: node size in bytes, passed through to parseNode().
        delim: string placed between tile faces in each emitted word.
        indx: index into nodes of the first edge to visit.
        data: list of tile dicts; data[i]['faces'][0] is the text used.
        words: output list, extended in place.
        letters: faces of the current prefix; callers normally omit it.

    Raises:
        RuntimeError: if the prefix grows beyond 15 tiles, which
            indicates a corrupt or cyclic graph.
    """
    # Use a fresh list per top-level call instead of a shared mutable
    # default, so an aborted walk cannot leak state into the next one.
    if letters is None:
        letters = []

    if len(letters) > 15:
        raise RuntimeError( "infinite recursion???" )

    while True:
        node = nodes[indx]
        indx += 1
        ( nextEdge, chrIndex, accepting, lastEdge ) = parseNode( node, nodeSize )

        letters.append( data[chrIndex]['faces'][0] )
        if accepting:
            words.append( delim.join(letters) )

        if nextEdge != 0:
            expandDAWG( nodes, nodeSize, delim, nextEdge, data, words, letters )

        # Drop this edge's face before moving on to its sibling.
        letters.pop()

        if lastEdge:
            break
|
|
|
|
def process(args):
    """Load the .xwd file named by args.DAWG and print what args requests.

    The file is read in this order:
      1. Two 16-bit network-order fields: flags and header length.
      2. If the header bit (0x08) is set in flags, a header of that
         length: a 32-bit network-order word count, then -- in newer
         files only -- a NUL-terminated message, a NUL-terminated md5
         sum, 16 bits of header flags, NUL-terminated language code and
         language name, and a one-byte-length-prefixed block of extra
         tile-count data.
      3. The remaining flags (synonyms bit 0x10 ignored) select the node
         size (3 or 4 bytes) and whether faces are UTF-8.
      4. Face table, language byte, one skipped byte, tile counts and
         values, special-tile data, a 32-bit network-order offset of the
         top edge, and finally the nodes themselves.

    Progress and diagnostics go to stderr; requested output goes to
    stdout. --dump-msg, --get-sum, --get-nwords and --get-lname exit the
    process (via sys.exit) as soon as their value has been printed.

    Args:
        args: argparse namespace as produced by mkParser().

    Raises:
        SystemExit: after a --dump-msg/--get-* request, or with an error
            message when the flags are unsupported, the file is in the
            obsolete ASCII format, or a requested header field is
            missing.
    """
    DICT_SYNONYMS_MASK = 0x10
    DICT_HEADER_MASK = 0x08

    with open(args.DAWG, "rb") as dawg:
        nWords = 0
        extraData = None
        # These stay None when the file has no header or an old, short
        # one, so the --get-* options below can report that cleanly
        # instead of failing on an unbound name.
        md5Sum = None
        langName = None

        headerFmt = struct.Struct('!HH')
        (flags, headerLen) = headerFmt.unpack(dawg.read(headerFmt.size))
        print( 'read flags: {:x}, header len: {}'.format(flags, headerLen ),
               file=sys.stderr )
        if not 0 == DICT_HEADER_MASK & flags:
            flags &= ~DICT_HEADER_MASK
            header = io.BytesIO(dawg.read(headerLen))
            nWordsFmt = struct.Struct('!L')
            nWords = nWordsFmt.unpack(header.read(nWordsFmt.size))[0]

            print( 'header: read nWords: {}'.format(nWords ), file=sys.stderr )

            try:               # older wordlists won't have these
                msg = getNullTermParam(header)
                if args.DUMP_MSG:
                    print(msg, file=sys.stdout)
                    sys.exit(0)
                md5Sum = getNullTermParam(header)
                print( 'header: read sum: {}'.format(md5Sum), file=sys.stderr )

                hdrFlags = int.from_bytes(header.read(2), 'little')
                print( 'header: flags: {:X}'.format(hdrFlags), file=sys.stderr)

                langCode = getNullTermParam(header)
                print('header: langCode: {}; '.format(langCode), file=sys.stderr, end = '')
                langName = getNullTermParam(header)
                print('langName: {}'.format(langName), file=sys.stderr)

                extraSize = int.from_bytes(header.read(1), 'little')
                print( 'header: extraSize: {}'.format(extraSize), file=sys.stderr )
                extraData = header.read(extraSize)

            except Exception as ex:
                # This will fire when header ends before langCode,
                # i.e. for older wordlists. SystemExit from --dump-msg is
                # not an Exception subclass, so it passes through.
                print( 'old/small header? exception!! {} '.format(ex), file=sys.stderr )

        if args.GET_SUM:
            if md5Sum is None:
                sys.exit( 'no md5sum found in header' )
            print( '{}'.format(md5Sum), file=sys.stdout )
            sys.exit(0)
        elif args.GET_NWORDS:
            print( '{}'.format(nWords), file=sys.stdout )
            sys.exit(0)
        elif args.GET_LNAME:
            if langName is None:
                sys.exit( 'no language name found in header' )
            print( '{}'.format(langName), file=sys.stdout )
            sys.exit(0)

        nodeSize = 0
        isUTF8 = False
        flags &= ~DICT_SYNONYMS_MASK
        if flags == 0x0002:
            nodeSize = 3
        elif flags == 0x0003:
            nodeSize = 4
        elif flags == 0x0004:
            isUTF8 = True
            nodeSize = 3
        elif flags == 0x0005:
            isUTF8 = True
            nodeSize = 4
        else:
            # sys.exit() with a string prints it to stderr and exits
            # with status 1.
            sys.exit( "unexpected flags value" )
        print( 'nodesize: {}, isUTF8: {}'.format(nodeSize, isUTF8), file=sys.stderr )

        numFaceBytes = 0
        if isUTF8:
            numFaceBytes = oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0]
        numFaces = int(oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0])
        if not isUTF8:
            numFaceBytes = numFaces * 2
        assert numFaces <= 64, 'too many faces: {}'.format(numFaces)
        print( 'numFaceBytes: {}, numFaces: {}'.format(numFaceBytes, numFaces), file=sys.stderr )

        print( 'TODO: confirm checksum', file=sys.stderr )

        data = []
        if isUTF8:
            faceBytes = dawg.read(numFaceBytes).decode("UTF-8")
            faces = splitFaces( faceBytes )
            assert( len(faces) == numFaces )
            # print( 'loaded', len(faces), 'faces:', faces )
            for datum in faces:
                data.append({'faces' : datum })
        else:
            sys.exit( 'I don\'t handle obsolete ascii case' )

        # High bit of the language byte marks an official list; the low
        # seven bits are the language code.
        langCode = oneByteFmt.unpack(dawg.read(oneByteFmt.size))[0]
        official = 0 != langCode & 0x80
        langCode = 0x7F & langCode
        print( 'langCode: {:02X}; official: {}'.format(langCode, official), file=sys.stderr )
        dawg.read( oneByteFmt.size ) # skip byte

        loadCountsAndValues( dawg, numFaces, extraData, data )
        loadSpecialData( dawg, data )

        offsetStruct = struct.Struct('!L')
        assert offsetStruct.size == 4
        offset = int(offsetStruct.unpack(dawg.read(offsetStruct.size))[0])

        if args.DUMP_TILES:
            for ii, tile in enumerate(data):
                print( 'tile {:2d}: {}:'.format(ii, tile) )

        nodes = loadNodes( dawg, nodeSize )
        words = []
        if nodes:
            expandDAWG( nodes, nodeSize, args.DELIM, offset, data, words )
            if not len(words) == nWords:
                print("loaded {} words but header claims {}".format(len(words), nWords), file=sys.stderr)
            # assert len(words) == nWords
        if args.DUMP_WORDS:
            for word in words:
                # if we're piped to head the reader goes away early, so
                # just stop writing
                try:
                    print(word)
                except BrokenPipeError:
                    break
|
|
|
|
def mkParser():
    """Build the command-line parser for this tool.

    Options: the required --dawg path, six boolean switches that select
    what to print, and --separator for the string placed between tiles.
    Values are stored under upper-case dest names (DAWG, DUMP_WORDS, ...).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dawg', dest = 'DAWG', type = str, required = True,
                        help = 'the .xwd file to load')

    # (option, dest, help) for each on/off switch, in help-output order.
    switches = (
        ('--dump-words', 'DUMP_WORDS', 'write wordlist to stdout'),
        ('--dump-tiles', 'DUMP_TILES', 'write tile metadata to stdout'),
        ('--dump-msg', 'DUMP_MSG', 'write header user-visible message to stdout'),
        ('--get-sum', 'GET_SUM', 'write md5sum to stdout'),
        ('--get-nwords', 'GET_NWORDS', 'write count of words to stdout'),
        ('--get-lname', 'GET_LNAME', 'write language name to stdout (if present)'),
    )
    for option, destName, helpText in switches:
        parser.add_argument(option, dest = destName, default = False,
                            action = 'store_true', help = helpText)

    parser.add_argument('--separator', dest = 'DELIM', default = '', help = 'printed between tiles')

    # [-raw | -json] [-get-sum] [-get-desc] -dict <xwdORpdb>

    return parser
|
|
|
|
def parseArgs():
    """Parse the command line and hand the result to process().

    Despite the name, this also runs the tool; it returns None.
    """
    parser = mkParser()
    process( parser.parse_args() )
|
|
|
|
def main():
    """Script entry point: parse the command line and run the tool."""
    parseArgs()
|
|
|
|
##############################################################################
|
|
# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|