mirror of
https://github.com/noDRM/DeDRM_tools
synced 2025-01-16 15:41:51 +01:00
afa4ac5716
THIS IS ON THE MASTER BRANCH. The Master branch will be Python 3.0 from now on. While Python 2.7 support will not be deliberately broken, all efforts should now focus on Python 3.0 compatibility. I can see a lot of work has been done. There's more to do. I've bumped the version number of everything I came across to the next major number for Python 3.0 compatibility indication. Thanks everyone. I hope to update here at least once a week until we have a stable 7.0 release for calibre 5.0
886 lines
30 KiB
Python
886 lines
30 KiB
Python
#! /usr/bin/python
|
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
|
# For use with Topaz Scripts Version 2.6
|
|
# Added Python 3 compatibility, September 2020
|
|
|
|
from __future__ import print_function
|
|
class Unbuffered:
    """Stream proxy that flushes the wrapped stream after every write.

    Any attribute other than ``write`` (and ``stream`` itself) is looked up
    on the wrapped stream, so the proxy can stand in for it.
    """

    def __init__(self, stream):
        # The underlying stream all calls are forwarded to.
        self.stream = stream

    def write(self, data):
        """Write *data* to the wrapped stream and flush it straight away."""
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Only reached for names not found on the proxy itself; delegate.
        return getattr(self.stream, attr)
|
|
|
|
import sys
|
|
# Route everything printed by this module through the flushing proxy.
sys.stdout = Unbuffered(sys.stdout)
|
|
|
|
import csv
|
|
import os
|
|
import getopt
|
|
from struct import pack
|
|
from struct import unpack
|
|
|
|
class TpzDRMError(Exception):
    """Error raised by this module, e.g. for an out-of-range string table index."""
|
|
|
|
# Read a 7-bit encoded number from a file. The most significant byte comes
# first; every byte except the last has the high (8th) bit set.
# A leading 0xFF byte marks the number as negative.
|
|
|
|
def readEncodedNumber(file):
    """Read one 7-bit encoded integer from *file*.

    *file* must provide ``read(1)`` returning a one-character binary string.
    A leading 0xFF byte flags the number as negative.  After that, bytes are
    consumed most-significant first, 7 payload bits each, for as long as the
    byte just read has its high bit set.

    Returns the decoded integer, or None if the file ends before a complete
    number has been read.
    """
    def next_byte():
        # One byte as an int, or None at end of file.
        raw = file.read(1)
        return ord(raw) if len(raw) else None

    value = next_byte()
    if value is None:
        return None

    negate = (value == 0xFF)
    if negate:
        # 0xFF is only a sign marker; the magnitude starts with the next byte.
        value = next_byte()
        if value is None:
            return None

    if value >= 0x80:
        total = value & 0x7F
        while value >= 0x80:
            value = next_byte()
            if value is None:
                return None
            total = (total << 7) + (value & 0x7F)
        value = total

    return -value if negate else value
|
|
|
|
|
|
# returns a binary string that encodes a number into 7 bits
|
|
# most significant byte first which has the high bit set
|
|
|
|
def encodeNumber(number):
    """Return *number* as a 7-bit encoded string, most significant byte first.

    Every byte except the last carries the 0x80 continuation flag.  If the
    leading byte of a non-negative number would come out as 0xFF, an extra
    0x80 byte is put in front of it so it is not mistaken for the sign
    marker.  Negative numbers are prefixed with a 0xFF byte.

    NOTE(review): negative input is encoded as ``-number + 1`` whereas
    readEncodedNumber() simply negates what follows the 0xFF marker, so the
    two do not round-trip for negatives - confirm whether that is intended.
    """
    is_negative = number < 0
    if is_negative:
        number = -number + 1

    # Built least-significant group first, reversed at the end.
    pieces = []
    continuation = 0
    while True:
        septet = (number & 0x7F) + continuation
        number >>= 7
        pieces.append(chr(septet))
        continuation = 0x80
        if number == 0:
            break

    if septet == 0xFF and not is_negative:
        pieces.append(chr(0x80))
    if is_negative:
        pieces.append(chr(0xFF))

    pieces.reverse()
    return "".join(pieces)
|
|
|
|
|
|
|
|
# create / read a length prefixed string from the file
|
|
|
|
def lengthPrefixString(data):
    """Return *data* preceded by its length as produced by encodeNumber()."""
    prefix = encodeNumber(len(data))
    return prefix + data
|
|
|
|
def readString(file):
    """Read a length-prefixed string from *file*.

    The length is read with readEncodedNumber(), then that many bytes are
    read.  Returns "" when the length cannot be read or when fewer bytes
    than announced are available.
    """
    expected = readEncodedNumber(file)
    if expected == None:
        return ""

    payload = file.read(expected)
    if len(payload) != expected:
        return ""

    # Unpacking with an "<n>s" format yields the payload as one string field.
    (text,) = unpack("%ds" % expected, payload)
    return text
|
|
|
|
|
|
# convert a binary string generated by encodeNumber (7 bit encoded number)
|
|
# to the value you would find inside the page*.dat files to be processed
|
|
|
|
def convert(i):
    """Return the two-digit lowercase hex dump of encodeNumber(i).

    This is the textual form of the value as it would appear inside the
    page*.dat files.
    """
    encoded = encodeNumber(i)
    return ''.join('%02x' % ord(ch) for ch in encoded)
|
|
|
|
|
|
|
|
# the complete string table used to store all book text content
|
|
# as well as the xml tokens and values that make sense out of it
|
|
|
|
class Dictionary(object):
    """The complete string table of a Topaz book.

    The table file starts with an encoded entry count followed by that many
    length-prefixed strings.  Entries are XML-escaped once while loading and
    are afterwards retrieved by index with lookup().
    """

    def __init__(self, dictFile):
        """Load every string of the table stored in *dictFile*.

        Raises TpzDRMError if the entry count cannot be read (empty or
        truncated file).
        """
        self.filename = dictFile
        self.size = 0
        self.stable = []
        self.pos = 0
        self.fo = open(dictFile, 'rb')
        try:
            count = readEncodedNumber(self.fo)
            if count is None:
                raise TpzDRMError('string table file is empty or truncated')
            for i in range(count):
                self.stable.append(self.escapestr(readString(self.fo)))
            self.size = count
        finally:
            # Nothing reads from the file after loading; do not leak the handle.
            self.fo.close()

    def escapestr(self, str):
        """Return *str* with ``&``, ``<``, ``>`` and ``=`` replaced by XML entities.

        On Python 3 the table entries arrive as bytes; they are decoded to
        text first so they can be compared and joined with the text tag
        names used by the parser.  On Python 2 the value is left as it is.
        """
        text_type = type('')  # the builtin name ``str`` is shadowed by the parameter
        if not isinstance(str, text_type) and isinstance(str, bytes):
            # NOTE(review): UTF-8 is assumed for the table; Latin-1 is the
            # lossless fallback for byte sequences that are not valid UTF-8.
            try:
                str = str.decode('utf-8')
            except UnicodeDecodeError:
                str = str.decode('latin-1')
        # '&' must be handled first so the entities added below stay intact.
        str = str.replace('&', '&amp;')
        str = str.replace('<', '&lt;')
        str = str.replace('>', '&gt;')
        str = str.replace('=', '&#61;')
        return str

    def lookup(self, val):
        """Return table entry *val* and remember it as the current position.

        Raises TpzDRMError when *val* is outside the table.
        """
        if 0 <= val < self.size:
            self.pos = val
            return self.stable[self.pos]
        print("Error - %d outside of string table limits" % val)
        raise TpzDRMError('outside of string table limits')

    def getSize(self):
        """Return the number of entries in the table."""
        return self.size

    def getPos(self):
        """Return the index passed to the last successful lookup()."""
        return self.pos

    def dumpDict(self):
        """Print every entry as: index, encoded index in hex, string."""
        for i in range(self.size):
            print("%d %s %s" % (i, convert(i), self.stable[i]))
        return
|
|
|
|
# parses the xml snippets that are represented by each page*.dat file.
|
|
# also parses the other0.dat file - the main stylesheet
|
|
# and information used to inject the xml snippets into page*.dat files
|
|
|
|
class PageParser(object):
    """Parse the xml snippets represented by one page*.dat / glyphs*.dat file.

    Also parses the other0.dat file - the main stylesheet and the
    information used to inject the xml snippets into page*.dat files.

    Parsed tags are lists of the form
    ``[full tag path, list of subtags, argument type, list of arguments]``;
    an unknown token is represented by an empty list.
    """

    def __init__(self, filename, dict, debug, flat_xml):
        """Open *filename* for parsing.

        *dict* is the string table (it must provide ``lookup`` and
        ``getSize``), *debug* turns on trace output and *flat_xml* selects
        the flattened output format in process().
        """
        self.fo = open(filename, 'rb')
        self.id = os.path.basename(filename).replace('.dat', '')
        self.dict = dict
        self.debug = debug
        self.first_unknown = True   # report only the first unknown token unless debugging
        self.flat_xml = flat_xml
        self.tagpath = []           # stack of the tokens currently being processed
        self.doc = []               # top-level tags of the document
        self.snippetList = []       # [snippet number, tag] pairs built by doLoop72

    # hash table used to enable the decoding process
    # This has all been developed by trial and error so it may still have omissions or
    # contain errors
    # Format:
    # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)

    token_tags = {
        'x'         : (1, 'scalar_number', 0, 0),
        'y'         : (1, 'scalar_number', 0, 0),
        'h'         : (1, 'scalar_number', 0, 0),
        'w'         : (1, 'scalar_number', 0, 0),
        'firstWord' : (1, 'scalar_number', 0, 0),
        'lastWord'  : (1, 'scalar_number', 0, 0),
        'rootID'    : (1, 'scalar_number', 0, 0),
        'stemID'    : (1, 'scalar_number', 0, 0),
        'type'      : (1, 'scalar_text', 0, 0),

        'info'      : (0, 'number', 1, 0),

        'info.word'            : (0, 'number', 1, 1),
        'info.word.ocrText'    : (1, 'text', 0, 0),
        'info.word.firstGlyph' : (1, 'raw', 0, 0),
        'info.word.lastGlyph'  : (1, 'raw', 0, 0),
        'info.word.bl'         : (1, 'raw', 0, 0),
        'info.word.link_id'    : (1, 'number', 0, 0),

        'glyph'         : (0, 'number', 1, 1),
        'glyph.x'       : (1, 'number', 0, 0),
        'glyph.y'       : (1, 'number', 0, 0),
        'glyph.glyphID' : (1, 'number', 0, 0),

        'dehyphen'          : (0, 'number', 1, 1),
        'dehyphen.rootID'   : (1, 'number', 0, 0),
        'dehyphen.stemID'   : (1, 'number', 0, 0),
        'dehyphen.stemPage' : (1, 'number', 0, 0),
        'dehyphen.sh'       : (1, 'number', 0, 0),

        'links'       : (0, 'number', 1, 1),
        'links.page'  : (1, 'number', 0, 0),
        'links.rel'   : (1, 'number', 0, 0),
        'links.row'   : (1, 'number', 0, 0),
        'links.title' : (1, 'text', 0, 0),
        'links.href'  : (1, 'text', 0, 0),
        'links.type'  : (1, 'text', 0, 0),
        'links.id'    : (1, 'number', 0, 0),

        'paraCont'          : (0, 'number', 1, 1),
        'paraCont.rootID'   : (1, 'number', 0, 0),
        'paraCont.stemID'   : (1, 'number', 0, 0),
        'paraCont.stemPage' : (1, 'number', 0, 0),

        'paraStems'        : (0, 'number', 1, 1),
        'paraStems.stemID' : (1, 'number', 0, 0),

        'wordStems'        : (0, 'number', 1, 1),
        'wordStems.stemID' : (1, 'number', 0, 0),

        'empty' : (1, 'snippets', 1, 0),

        'page'           : (1, 'snippets', 1, 0),
        'page.class'     : (1, 'scalar_text', 0, 0),
        'page.pageid'    : (1, 'scalar_text', 0, 0),
        'page.pagelabel' : (1, 'scalar_text', 0, 0),
        'page.type'      : (1, 'scalar_text', 0, 0),
        'page.h'         : (1, 'scalar_number', 0, 0),
        'page.w'         : (1, 'scalar_number', 0, 0),
        'page.startID'   : (1, 'scalar_number', 0, 0),

        'group'             : (1, 'snippets', 1, 0),
        'group.class'       : (1, 'scalar_text', 0, 0),
        'group.type'        : (1, 'scalar_text', 0, 0),
        'group._tag'        : (1, 'scalar_text', 0, 0),
        'group.orientation' : (1, 'scalar_text', 0, 0),

        'region'             : (1, 'snippets', 1, 0),
        'region.class'       : (1, 'scalar_text', 0, 0),
        'region.type'        : (1, 'scalar_text', 0, 0),
        'region.x'           : (1, 'scalar_number', 0, 0),
        'region.y'           : (1, 'scalar_number', 0, 0),
        'region.h'           : (1, 'scalar_number', 0, 0),
        'region.w'           : (1, 'scalar_number', 0, 0),
        'region.orientation' : (1, 'scalar_text', 0, 0),

        'empty_text_region' : (1, 'snippets', 1, 0),

        'img'                  : (1, 'snippets', 1, 0),
        'img.x'                : (1, 'scalar_number', 0, 0),
        'img.y'                : (1, 'scalar_number', 0, 0),
        'img.h'                : (1, 'scalar_number', 0, 0),
        'img.w'                : (1, 'scalar_number', 0, 0),
        'img.src'              : (1, 'scalar_number', 0, 0),
        'img.color_src'        : (1, 'scalar_number', 0, 0),
        'img.gridSize'         : (1, 'scalar_number', 0, 0),
        'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'img.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        'img.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        'img.gridEndCenter'    : (1, 'scalar_number', 0, 0),
        'img.image_type'       : (1, 'scalar_number', 0, 0),

        'paragraph'                  : (1, 'snippets', 1, 0),
        'paragraph.class'            : (1, 'scalar_text', 0, 0),
        'paragraph.firstWord'        : (1, 'scalar_number', 0, 0),
        'paragraph.lastWord'         : (1, 'scalar_number', 0, 0),
        'paragraph.gridSize'         : (1, 'scalar_number', 0, 0),
        'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'paragraph.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        'paragraph.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        'paragraph.gridEndCenter'    : (1, 'scalar_number', 0, 0),

        'word_semantic'                  : (1, 'snippets', 1, 1),
        'word_semantic.type'             : (1, 'scalar_text', 0, 0),
        'word_semantic.class'            : (1, 'scalar_text', 0, 0),
        'word_semantic.firstWord'        : (1, 'scalar_number', 0, 0),
        'word_semantic.lastWord'         : (1, 'scalar_number', 0, 0),
        'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'word_semantic.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        'word_semantic.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        'word_semantic.gridEndCenter'    : (1, 'scalar_number', 0, 0),

        'word'            : (1, 'snippets', 1, 0),
        'word.type'       : (1, 'scalar_text', 0, 0),
        'word.class'      : (1, 'scalar_text', 0, 0),
        'word.firstGlyph' : (1, 'scalar_number', 0, 0),
        'word.lastGlyph'  : (1, 'scalar_number', 0, 0),

        '_span'                  : (1, 'snippets', 1, 0),
        '_span.class'            : (1, 'scalar_text', 0, 0),
        '_span.firstWord'        : (1, 'scalar_number', 0, 0),
        '_span.lastWord'         : (1, 'scalar_number', 0, 0),
        '_span.gridSize'         : (1, 'scalar_number', 0, 0),
        '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        '_span.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        '_span.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        '_span.gridEndCenter'    : (1, 'scalar_number', 0, 0),

        'span'                  : (1, 'snippets', 1, 0),
        'span.firstWord'        : (1, 'scalar_number', 0, 0),
        'span.lastWord'         : (1, 'scalar_number', 0, 0),
        'span.gridSize'         : (1, 'scalar_number', 0, 0),
        'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'span.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        'span.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        'span.gridEndCenter'    : (1, 'scalar_number', 0, 0),

        'extratokens'                  : (1, 'snippets', 1, 0),
        'extratokens.class'            : (1, 'scalar_text', 0, 0),
        'extratokens.type'             : (1, 'scalar_text', 0, 0),
        'extratokens.firstGlyph'       : (1, 'scalar_number', 0, 0),
        'extratokens.lastGlyph'        : (1, 'scalar_number', 0, 0),
        'extratokens.gridSize'         : (1, 'scalar_number', 0, 0),
        'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'extratokens.gridTopCenter'    : (1, 'scalar_number', 0, 0),
        'extratokens.gridBeginCenter'  : (1, 'scalar_number', 0, 0),
        'extratokens.gridEndCenter'    : (1, 'scalar_number', 0, 0),

        'glyph.h'   : (1, 'number', 0, 0),
        'glyph.w'   : (1, 'number', 0, 0),
        'glyph.use' : (1, 'number', 0, 0),
        'glyph.vtx' : (1, 'number', 0, 1),
        'glyph.len' : (1, 'number', 0, 1),
        'glyph.dpi' : (1, 'number', 0, 0),
        'vtx'       : (0, 'number', 1, 1),
        'vtx.x'     : (1, 'number', 0, 0),
        'vtx.y'     : (1, 'number', 0, 0),
        'len'       : (0, 'number', 1, 1),
        'len.n'     : (1, 'number', 0, 0),

        'book'                            : (1, 'snippets', 1, 0),
        'version'                         : (1, 'snippets', 1, 0),
        'version.FlowEdit_1_id'           : (1, 'scalar_text', 0, 0),
        'version.FlowEdit_1_version'      : (1, 'scalar_text', 0, 0),
        'version.Schema_id'               : (1, 'scalar_text', 0, 0),
        'version.Schema_version'          : (1, 'scalar_text', 0, 0),
        'version.Topaz_version'           : (1, 'scalar_text', 0, 0),
        'version.WordDetailEdit_1_id'     : (1, 'scalar_text', 0, 0),
        'version.WordDetailEdit_1_version': (1, 'scalar_text', 0, 0),
        'version.ZoneEdit_1_id'           : (1, 'scalar_text', 0, 0),
        'version.ZoneEdit_1_version'      : (1, 'scalar_text', 0, 0),
        'version.chapterheaders'          : (1, 'scalar_text', 0, 0),
        'version.creation_date'           : (1, 'scalar_text', 0, 0),
        'version.header_footer'           : (1, 'scalar_text', 0, 0),
        'version.init_from_ocr'           : (1, 'scalar_text', 0, 0),
        'version.letter_insertion'        : (1, 'scalar_text', 0, 0),
        'version.xmlinj_convert'          : (1, 'scalar_text', 0, 0),
        'version.xmlinj_reflow'           : (1, 'scalar_text', 0, 0),
        'version.xmlinj_transform'        : (1, 'scalar_text', 0, 0),
        'version.findlists'               : (1, 'scalar_text', 0, 0),
        'version.page_num'                : (1, 'scalar_text', 0, 0),
        'version.page_type'               : (1, 'scalar_text', 0, 0),
        'version.bad_text'                : (1, 'scalar_text', 0, 0),
        'version.glyph_mismatch'          : (1, 'scalar_text', 0, 0),
        'version.margins'                 : (1, 'scalar_text', 0, 0),
        'version.staggered_lines'         : (1, 'scalar_text', 0, 0),
        'version.paragraph_continuation'  : (1, 'scalar_text', 0, 0),
        'version.toc'                     : (1, 'scalar_text', 0, 0),

        'stylesheet'               : (1, 'snippets', 1, 0),
        'style'                    : (1, 'snippets', 1, 0),
        'style._tag'               : (1, 'scalar_text', 0, 0),
        'style.type'               : (1, 'scalar_text', 0, 0),
        'style._after_type'        : (1, 'scalar_text', 0, 0),
        'style._parent_type'       : (1, 'scalar_text', 0, 0),
        'style._after_parent_type' : (1, 'scalar_text', 0, 0),
        'style.class'              : (1, 'scalar_text', 0, 0),
        'style._after_class'       : (1, 'scalar_text', 0, 0),
        'rule'                     : (1, 'snippets', 1, 0),
        'rule.attr'                : (1, 'scalar_text', 0, 0),
        'rule.value'               : (1, 'scalar_text', 0, 0),

        'original'      : (0, 'number', 1, 1),
        'original.pnum' : (1, 'number', 0, 0),
        'original.pid'  : (1, 'text', 0, 0),
        'pages'         : (0, 'number', 1, 1),
        'pages.ref'     : (1, 'number', 0, 0),
        'pages.id'      : (1, 'number', 0, 0),
        'startID'       : (0, 'number', 1, 1),
        'startID.page'  : (1, 'number', 0, 0),
        'startID.id'    : (1, 'number', 0, 0),

        'median_d'      : (1, 'number', 0, 0),
        'median_h'      : (1, 'number', 0, 0),
        'median_firsty' : (1, 'number', 0, 0),
        'median_lasty'  : (1, 'number', 0, 0),

        'num_footers_maybe' : (1, 'number', 0, 0),
        'num_footers_yes'   : (1, 'number', 0, 0),
        'num_headers_maybe' : (1, 'number', 0, 0),
        'num_headers_yes'   : (1, 'number', 0, 0),

        'tracking' : (1, 'number', 0, 0),
        'src'      : (1, 'text', 0, 0),
    }

    # full tag path record keeping routines
    def tag_push(self, token):
        """Push *token* onto the tag path stack."""
        self.tagpath.append(token)

    def tag_pop(self):
        """Drop the innermost token from the tag path, if there is one."""
        if len(self.tagpath) > 0:
            self.tagpath.pop()

    def tagpath_len(self):
        """Return the current depth of the tag path."""
        return len(self.tagpath)

    def get_tagpath(self, i):
        """Return the dotted tag path starting at depth *i*.

        Returns an empty string when *i* is at or beyond the current depth.
        """
        return '.'.join(self.tagpath[i:])

    # list of absolute command byte values that indicate
    # various types of loop mechanisms typically used to generate vectors

    cmd_list = (0x76, 0x76)

    # peek at and return 1 byte that is ahead by i bytes
    def peek(self, aheadi):
        """Return the byte *aheadi* bytes ahead as an int without consuming it.

        Returns None when fewer than *aheadi* bytes are left in the file.
        The file position is the same after the call as before it.
        """
        c = self.fo.read(aheadi)
        if len(c) == 0 or len(c) < aheadi:
            # Rewind exactly what was read; the requested byte does not exist.
            if len(c) > 0:
                self.fo.seek(-len(c), 1)
            return None
        self.fo.seek(-aheadi, 1)
        return ord(c[-1:])

    # get the next value from the file being processed
    def getNext(self):
        """Return the next encoded number from the file, or None at end of file."""
        if self.peek(1) is None:
            return None
        return readEncodedNumber(self.fo)

    # format an arg by argtype
    def formatArg(self, arg, argtype):
        """Return *arg* converted according to *argtype*.

        Text types are looked up in the string table; raw, number and
        snippet arguments are returned unchanged.  An unknown *argtype*
        prints an error and exits the process with status -2.
        """
        if argtype in ('text', 'scalar_text'):
            return self.dict.lookup(arg)
        if argtype in ('raw', 'number', 'scalar_number', 'snippets'):
            return arg
        print("Error Unknown argtype %s" % argtype)
        sys.exit(-2)

    # process the next tag token, recursively handling subtags,
    # arguments, and commands
    def procToken(self, token):
        """Parse the tag named *token* at the current file position.

        Returns ``[full tag path, subtags, argtype, args]``, or an empty
        list when no suffix of the current tag path is in token_tags.
        """
        self.tag_push(token)
        if self.debug:
            print('Processing: ', self.get_tagpath(0))

        # Try the full path first, then ever shorter suffixes of it.
        spec = None
        for j in range(self.tagpath_len()):
            spec = self.token_tags.get(self.get_tagpath(j))
            if spec is not None:
                break

        if spec is None:
            # all tokens that need to be processed should be in the hash
            # table; if not, it may indicate a problem, either a new token
            # or an out of sync condition
            if self.debug or self.first_unknown:
                print('Unknown Token:', token)
                self.first_unknown = False
            self.tag_pop()
            return []

        num_args, argtype, subtags, splcase = spec

        # handle subtags if present
        subtagres = []
        if splcase == 1 and self.peek(1) == 0x74:
            # this type of tag uses the escape marker 0x74 to indicate a subtag count
            readEncodedNumber(self.fo)  # consume the marker itself
            subtags = 1
            num_args = 0

        if subtags == 1:
            ntags = readEncodedNumber(self.fo)
            if self.debug:
                print('subtags: ' + token + ' has ' + str(ntags))
            for j in range(ntags):
                val = readEncodedNumber(self.fo)
                subtagres.append(self.procToken(self.dict.lookup(val)))

        # arguments can be scalars or vectors of text or numbers
        argres = []
        if num_args > 0:
            firstarg = self.peek(1)
            if firstarg in self.cmd_list and argtype not in ('scalar_number', 'scalar_text'):
                # single argument is a variable length vector of data
                arg = readEncodedNumber(self.fo)
                argres = self.decodeCMD(arg, argtype)
            else:
                # num_arg scalar arguments
                for i in range(num_args):
                    argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))

        # build the return tag
        result = [self.get_tagpath(0), subtagres, argtype, argres]
        self.tag_pop()
        return result

    # special loop used to process code snippets
    # it is NEVER used to format arguments.
    # builds the snippetList
    def doLoop72(self, argtype):
        """Read a counted set of snippets and append them to snippetList.

        *argtype* is accepted for symmetry with the other loop handlers and
        is not used.
        """
        cnt = readEncodedNumber(self.fo)
        if self.debug:
            result = 'Set of ' + str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print(result)
        for i in range(cnt):
            if self.debug:
                print('Snippet:', str(i))
            val = readEncodedNumber(self.fo)
            self.snippetList.append([i, self.procToken(self.dict.lookup(val))])
        return

    # general loop code graciously submitted by "skindle" - thank you!
    def doLoop76Mode(self, argtype, cnt, mode):
        """Read a vector of *cnt* numbers and return them formatted as *argtype*.

        Bit 0 of *mode* says an adjustment value precedes the data and is
        subtracted from every element; the remaining bits give the number of
        running-sum passes applied to the vector.
        """
        adj = 0
        if mode & 1:
            adj = readEncodedNumber(self.fo)
        mode = mode >> 1

        x = [readEncodedNumber(self.fo) - adj for i in range(cnt)]
        for i in range(mode):
            for j in range(1, cnt):
                x[j] = x[j] + x[j - 1]
        return [self.formatArg(v, argtype) for v in x]

    # dispatches loop commands bytes with various modes
    # The 0x76 style loops are used to build vectors

    # This was all derived by trial and error and
    # new loop types may exist that are not handled here
    # since they did not appear in the test cases

    def decodeCMD(self, cmd, argtype):
        """Run the loop command *cmd* and return the resulting argument list.

        Only command 0x76 is handled; any other value yields an empty list.
        """
        if cmd == 0x76:
            # loop with cnt, and mode to control loop styles
            cnt = readEncodedNumber(self.fo)
            mode = readEncodedNumber(self.fo)
            if self.debug:
                print('Loop for', cnt, 'with mode', mode, ': ')
            return self.doLoop76Mode(argtype, cnt, mode)

        if self.debug:
            print("Unknown command", cmd)
        return []

    # add full tag path to injected snippets
    def updateName(self, tag, prefix):
        """Return a copy of *tag* with *prefix* prepended to every tag name.

        An empty tag (the result of an unknown token) is returned as is.
        """
        if len(tag) == 0:
            return tag
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        nsubtaglist = [self.updateName(j, prefix) for j in subtagList]
        return [prefix + '.' + name, nsubtaglist, argtype, argList]

    # perform depth first injection of specified snippets into this one
    def injectSnippets(self, snippet):
        """Return *snippet* with the snippets named by its arguments injected as subtags.

        Every argument of the snippet's tag is used as an index into
        snippetList.  The stored snippets are not modified, so injecting the
        same snippet more than once gives the same result each time.
        """
        snipno, tag = snippet
        if len(tag) == 0:
            # unknown token: nothing to inject into
            return [snipno, tag]
        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
        if len(argList) > 0:
            nsubtagList = []
            for j in argList:
                aso, atag = self.injectSnippets(self.snippetList[j])
                nsubtagList.append(self.updateName(atag, name))
            # the references have been resolved into real subtags
            argtype = 'number'
            argList = []
            # build a new list instead of extending the stored one in place
            subtagList = subtagList + nsubtagList
        return [snipno, [name, subtagList, argtype, argList]]

    # format the tag for output
    def formatTag(self, node):
        """Return *node* and its subtags as indented XML-like text.

        The indent is three spaces per level of the dotted tag name.
        """
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        fullpathname = name.split('.')
        nodename = fullpathname.pop()
        indent = ' ' * (3 * len(fullpathname))
        rlst = [indent + '<' + nodename + '>']
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = ','.join([str(j) for j in argList])
            if argtype == 'snippets':
                rlst.append('snippets:' + argres)
            else:
                rlst.append(argres)
        if len(subtagList) > 0:
            rlst.append('\n')
            for j in subtagList:
                if len(j) > 0:
                    rlst.append(self.formatTag(j))
            rlst.append(indent + '</' + nodename + '>\n')
        else:
            rlst.append('</' + nodename + '>\n')
        return "".join(rlst)

    # flatten tag
    def flattenTag(self, node):
        """Return *node* and its subtags as ``name=arg|arg`` lines."""
        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
        rlst = [name]
        if len(argList) > 0:
            if argtype in ('text', 'scalar_text'):
                argres = '|'.join(argList)
            else:
                argres = '|'.join([str(j) for j in argList])
            if argtype == 'snippets':
                rlst.append('.snippets=' + argres)
            else:
                rlst.append('=' + argres)
        rlst.append('\n')
        for j in subtagList:
            if len(j) > 0:
                rlst.append(self.flattenTag(j))
        return "".join(rlst)

    # reduce create xml output
    def formatDoc(self, flat_xml):
        """Return the whole document as text, flattened if *flat_xml* is true."""
        formatter = self.flattenTag if flat_xml else self.formatTag
        result = "".join([formatter(j) for j in self.doc if len(j) > 0])
        if self.debug:
            print(result)
        return result

    def _first_token_from_magic(self):
        """Consume the file header and return the token to start parsing with.

        Returns 'info' for page and glyph files (with the header skipped)
        and None for anything else, in which case the file position is put
        back to where it was.
        """
        # The file is opened in binary mode, so compare against byte strings;
        # on Python 2 these are the same as plain strings.
        magic = self.fo.read(9)
        lead = magic[0:1]
        if lead == b'p' and magic[2:9] == b'marker_':
            return 'info'
        if lead == b'p' and magic[2:9] == b'__PAGE_':
            self.fo.read(2)
            return 'info'
        if lead == b'p' and magic[2:8] == b'_PAGE_':
            return 'info'
        if lead == b'g' and magic[2:9] == b'__GLYPH':
            self.fo.read(3)
            return 'info'
        # other0.dat file: no header, rewind whatever was actually read
        self.fo.seek(-len(magic), 1)
        return None

    # main loop - parse the page.dat files
    # to create structured document and snippets

    # FIXME: value at end of magic appears to be a subtags count
    # but for what? For now, inject an 'info' tag as it is in
    # every dictionary and seems close to what is meant
    # The alternative is to special case the last _ "0x5f" to mean something

    def process(self):
        """Parse the whole file and return its XML text.

        The output format is chosen by the flat_xml flag given to the
        constructor.  The input file is closed when parsing ends, whether
        it succeeded or not.
        """
        try:
            # peek at the first bytes to see what type of file it is
            first_token = self._first_token_from_magic()

            # main loop to read and build the document tree
            while True:
                if first_token is not None:
                    # use "inserted" first token 'info' for page and glyph files
                    tag = self.procToken(first_token)
                    if len(tag) > 0:
                        self.doc.append(tag)
                    first_token = None

                v = self.getNext()
                if v is None:
                    break

                if v == 0x72:
                    self.doLoop72('number')
                elif 0 < v < self.dict.getSize():
                    tag = self.procToken(self.dict.lookup(v))
                    if len(tag) > 0:
                        self.doc.append(tag)
                else:
                    if self.debug:
                        print("Main Loop: Unknown value: %x" % v)
                    if v == 0 and self.peek(1) == 0x5f:
                        self.fo.read(1)
                        first_token = 'info'
        finally:
            self.fo.close()

        # now do snippet injection
        if len(self.snippetList) > 0:
            if self.debug:
                print('Injecting Snippets:')
            tag_add = self.injectSnippets(self.snippetList[0])[1]
            if len(tag_add) > 0:
                if self.debug:
                    print(self.formatTag(tag_add))
                self.doc.append(tag_add)

        # handle generation of xml output
        return self.formatDoc(self.flat_xml)
|
|
|
|
|
|
def fromData(dict, fname):
    """Parse *fname* with string table *dict* and return the flattened xml text."""
    parser = PageParser(fname, dict, False, True)
    return parser.process()
|
|
|
|
def getXML(dict, fname):
    """Parse *fname* with string table *dict* and return the indented xml text."""
    parser = PageParser(fname, dict, False, False)
    return parser.process()
|
|
|
|
def usage():
    """Print the command line help text, one line per print call."""
    help_lines = (
        'Usage: ',
        ' convert2xml.py dict0000.dat infile.dat ',
        ' ',
        ' Options:',
        ' -h print this usage help message ',
        ' -d turn on debug output to check for potential errors ',
        ' --flat-xml output the flattened xml page description only ',
        ' ',
        ' This program will attempt to convert a page*.dat file or ',
        ' glyphs*.dat file, using the dict0000.dat file, to its xml description. ',
        ' ',
        ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ',
        ' the *.dat files from a Topaz format e-book.',
    )
    for line in help_lines:
        print(line)
|
|
|
|
#
|
|
# Main
|
|
#
|
|
|
|
def main(argv):
    """Command line entry point: convert one *.dat file to its xml description.

    *argv* is a full argument vector (program name first).  When it is
    empty, sys.argv is used instead, the result is printed and 0 is
    returned; otherwise the xml text is returned to the caller.

    Exits with status 2 on bad options or when the dictionary file and the
    page file are not both given, and with status 0 after ``-h``.
    """
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    if len(opts) == 0 and len(args) == 0:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o == "-d":
            debug = True
        if o == "-h":
            usage()
            sys.exit(0)
        if o == "--flat-xml":
            flat_xml = True

    # Both positional arguments are required; without this check a missing
    # file name would surface as a bare IndexError.
    if len(args) < 2:
        print("Error: both a dictionary file and a page file are required")
        usage()
        sys.exit(2)
    dictFile, pageFile = args[0], args[1]

    # read in the string table dictionary
    string_table = Dictionary(dictFile)

    # create a page parser
    pp = PageParser(pageFile, string_table, debug, flat_xml)

    xmlpage = pp.process()

    if printOutput:
        print(xmlpage)
        return 0

    return xmlpage
|
|
|
|
# Run as a script: an empty argument vector tells main() to read sys.argv
# and to print the result instead of returning it.
if __name__ == '__main__':
    exit_status = main('')
    sys.exit(exit_status)
|