topaz tools 1.0 (I think)

This commit is contained in:
some_updates 2010-01-17 12:10:35 +00:00 committed by Apprentice Alf
parent 1fc40376cf
commit 0a437510f6
9 changed files with 2931 additions and 0 deletions

View file

@ -0,0 +1,865 @@
#! /usr/bin/python
"""
Comprehensive Mazama Book DRM with Topaz Cryptography V2.0
-----BEGIN PUBLIC KEY-----
MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6
M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC
B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx
y2/pHuYme7U1TsgSjwIDAQAB
-----END PUBLIC KEY-----
"""
from __future__ import with_statement
import csv
import sys
import os
import getopt
import zlib
from struct import pack
from struct import unpack
from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
string_at, Structure, c_void_p, cast
import _winreg as winreg
import Tkinter
import Tkconstants
import tkMessageBox
import traceback
import hashlib
MAX_PATH = 255
kernel32 = windll.kernel32
advapi32 = windll.advapi32
crypt32 = windll.crypt32
global kindleDatabase
global bookFile
global bookPayloadOffset
global bookHeaderRecords
global bookMetadata
global bookKey
global command
#
# Various character maps used to decrypt books. Probably supposed to act as obfuscation
#
# Substitution alphabets used when encoding/decoding hashed record names
# and values (obfuscation, not real cryptography).
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"   # 32 chars - DSN material
charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"  # 64 chars - kindle.info records
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"  # base64-style - PID encoding
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"  # 34 chars (no 'O'/'0') - device PID alphabet
#
# Exceptions for all the problems that might happen during the script
#
class CMBDTCError(Exception):
    # Recoverable problem (e.g. one dkey record fails to decrypt with a PID).
    pass

class CMBDTCFatal(Exception):
    # Unrecoverable problem; aborts the current operation.
    pass
#
# Stolen stuff
#
class DataBlob(Structure):
    # ctypes mirror of the Win32 CRYPTOAPI DATA_BLOB structure.
    _fields_ = [('cbData', c_uint),    # byte count of the buffer
                ('pbData', c_void_p)]  # pointer to the buffer

# Pointer type taken by CryptUnprotectData below.
DataBlob_p = POINTER(DataBlob)
def GetSystemDirectory():
    """Factory: bind kernel32.GetSystemDirectoryW once and return a helper
    that yields the Windows system directory as a unicode string."""
    GetSystemDirectoryW = kernel32.GetSystemDirectoryW
    GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
    GetSystemDirectoryW.restype = c_uint
    def GetSystemDirectory():
        buffer = create_unicode_buffer(MAX_PATH + 1)
        GetSystemDirectoryW(buffer, len(buffer))
        return buffer.value
    return GetSystemDirectory
# Replace the factory with the bound helper.
GetSystemDirectory = GetSystemDirectory()
def GetVolumeSerialNumber():
    """Factory: bind kernel32.GetVolumeInformationW once and return a helper
    that yields the serial number of the volume at `path`."""
    GetVolumeInformationW = kernel32.GetVolumeInformationW
    GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
                                      POINTER(c_uint), POINTER(c_uint),
                                      POINTER(c_uint), c_wchar_p, c_uint]
    GetVolumeInformationW.restype = c_uint
    def GetVolumeSerialNumber(path):
        # Only the serial number output parameter is requested.
        vsn = c_uint(0)
        GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
        return vsn.value
    return GetVolumeSerialNumber
GetVolumeSerialNumber = GetVolumeSerialNumber()
def GetUserName():
    """Factory: bind advapi32.GetUserNameW once and return a helper that
    yields the current Windows user name as a byte string (high bytes of
    the UTF-16 encoding stripped)."""
    GetUserNameW = advapi32.GetUserNameW
    GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
    GetUserNameW.restype = c_uint
    def GetUserName():
        buffer = create_unicode_buffer(32)
        size = c_uint(len(buffer))
        # Grow the buffer until the API call succeeds.
        while not GetUserNameW(buffer, byref(size)):
            buffer = create_unicode_buffer(len(buffer) * 2)
            size.value = len(buffer)
        return buffer.value.encode('utf-16-le')[::2]
    return GetUserName
GetUserName = GetUserName()
def CryptUnprotectData():
    """Factory: bind crypt32.CryptUnprotectData once and return a helper
    that DPAPI-decrypts `indata` (with `entropy`) for the current user.

    Raises CMBDTCFatal if Windows refuses to decrypt the blob.
    """
    _CryptUnprotectData = crypt32.CryptUnprotectData
    _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
                                    c_void_p, c_void_p, c_uint, DataBlob_p]
    _CryptUnprotectData.restype = c_uint
    def CryptUnprotectData(indata, entropy):
        indatab = create_string_buffer(indata)
        indata = DataBlob(len(indata), cast(indatab, c_void_p))
        entropyb = create_string_buffer(entropy)
        entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
        outdata = DataBlob()
        if not _CryptUnprotectData(byref(indata), None, byref(entropy),
                                   None, None, 0, byref(outdata)):
            raise CMBDTCFatal("Failed to Unprotect Data")
        return string_at(outdata.pbData, outdata.cbData)
    return CryptUnprotectData
CryptUnprotectData = CryptUnprotectData()
#
# Returns the MD5 digest of "message"
#
def MD5(message):
    """Return the raw 16-byte MD5 digest of message."""
    return hashlib.md5(message).digest()
#
# Returns the SHA-1 digest of "message"
#
def SHA1(message):
    """Return the raw 20-byte SHA-1 digest of message."""
    return hashlib.sha1(message).digest()
#
# Open the book file at path
#
def openBook(path):
    """Open the book file at `path` for binary reading.

    Raises CMBDTCFatal naming the offending path when the file cannot be
    opened.  Bug fix: the original bare `except:` also swallowed unrelated
    errors (KeyboardInterrupt, typos); only I/O errors are translated now.
    """
    try:
        return open(path, 'rb')
    except IOError:
        raise CMBDTCFatal("Could not open book file: " + path)
#
# Encode the bytes in data with the characters in map
#
def encode(data, map):
    """Encode each byte of `data` as a pair of characters drawn from `map`.

    The first character indexes (byte ^ 0x80) // len(map), the second
    byte % len(map) - the scheme used by the Kindle software.
    """
    pieces = []
    for ch in data:
        b = ord(ch)
        pieces.append(map[(b ^ 0x80) // len(map)])
        pieces.append(map[b % len(map)])
    return "".join(pieces)
#
# Hash the bytes in data and then encode the digest with the characters in map
#
def encodeHash(data,map):
    # MD5-hash data, then encode the 16-byte digest with encode() above.
    return encode(MD5(data),map)
#
# Decode the string in data with the characters in map. Returns the decoded bytes
#
def decode(data, map):
    """Invert encode(): turn each pair of `map` characters back into one byte.

    NOTE(review): the result is a str extended with struct.pack output, so
    this is Python-2-only (pack returns bytes on Python 3).
    """
    result = ""
    for pos in range(0, len(data), 2):
        hi = map.find(data[pos])
        lo = map.find(data[pos + 1])
        byte = (((hi * 0x40) ^ 0x80) & 0xFF) + lo
        result += pack("B", byte)
    return result
#
# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
#
def openKindleInfo():
    # Look up the user's "Local AppData" folder in the registry - the same
    # way Kindle For PC does - and open its kindle.info store.
    regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
    path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
    return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
#
# Parse the Kindle.info file and return the records as a list of key-values
#
def parseKindleInfo():
    """Parse kindle.info and return its records as a {hashedKey: value} dict.

    The store is a '{'-separated list of "key:value" items; the very first
    byte of the file is skipped before splitting.
    """
    infoReader = openKindleInfo()
    infoReader.read(1)
    records = {}
    for item in infoReader.read().split('{'):
        parts = item.split(':')
        records[parts[0]] = parts[1]
    return records
#
# Find if the original string for a hashed/encoded string is known. If so return the original string otherwise return an empty string. (Totally not optimal)
#
def findNameForHash(hash):
    """Return the known plain-text record name whose hashed/encoded form
    equals `hash`, or "" when none matches.

    Bug fix: the original returned the loop variable `name`, so on a miss
    it returned the last candidate ("MazamaRandomNumber") instead of "".
    """
    names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
    result = ""
    for name in names:
        if hash == encodeHash(name, charMap2):
            result = name
            break
    return result
#
# Print all the records from the kindle.info file (option -i)
#
def printKindleInfo():
    # Dump every kindle.info record: known names are printed in clear,
    # unknown ones flagged; the decrypted value follows either way.
    for record in kindleDatabase:
        name = findNameForHash(record)
        if name != "" :
            print (name)
            print ("--------------------------\n")
        else :
            print ("Unknown Record")
        print getKindleInfoValueForHash(record)
        print "\n"
#
# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
#
def getKindleInfoValueForHash(hashedKey):
    # Decode the stored record with charMap2, then let Windows DPAPI
    # (CryptUnprotectData) decrypt it for the current user account.
    global kindleDatabase
    encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
    return CryptUnprotectData(encryptedValue,"")
#
# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
#
def getKindleInfoValueForKey(key):
    # Convenience: hash/encode the plaintext key, then fetch and decrypt.
    return getKindleInfoValueForHash(encodeHash(key,charMap2))
#
# Get a 7 bit encoded number from the book file
#
def bookReadEncodedNumber():
    """Read a 7-bit variable-length integer from the global bookFile.

    A leading 0xFF flags a negative value; bytes >= 0x80 are continuation
    bytes carrying 7 payload bits each, most significant first.
    """
    negative = False
    byte = ord(bookFile.read(1))
    if byte == 0xFF:
        negative = True
        byte = ord(bookFile.read(1))
    if byte >= 0x80:
        accum = byte & 0x7F
        while byte >= 0x80:
            byte = ord(bookFile.read(1))
            accum = (accum << 7) + (byte & 0x7F)
        byte = accum
    if negative:
        byte = -byte
    return byte
#
# Encode a number in 7 bit format
#
def encodeNumber(number):
    """Encode an integer in the 7-bit big-endian variable-length format.

    Negatives are stored as (-n + 1) prefixed with 0xFF, mirroring
    bookReadEncodedNumber().
    """
    negative = number < 0
    if negative:
        number = -number + 1
    chunks = []
    marker = 0
    while True:
        chunks.append(chr((number & 0x7F) + marker))
        number >>= 7
        marker = 0x80
        if number == 0:
            break
    if negative:
        chunks.append(chr(0xFF))
    chunks.reverse()
    return "".join(chunks)
#
# Get a length prefixed string from the file
#
def bookReadString():
    # A string is stored as a 7-bit encoded length followed by its bytes.
    stringLength = bookReadEncodedNumber()
    return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
#
# Returns a length prefixed string
#
def lengthPrefixString(data):
    # Prefix data with its 7-bit encoded length (inverse of bookReadString).
    return encodeNumber(len(data))+data
#
# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
#
def bookReadHeaderRecordData():
    """Read one header record's table at the current bookFile position.

    Returns [[offset, decompressedLength, compressedLength], ...].
    """
    count = bookReadEncodedNumber()
    return [[bookReadEncodedNumber(),
             bookReadEncodedNumber(),
             bookReadEncodedNumber()] for _ in range(count)]
#
# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
#
def parseTopazHeaderRecord():
    """Parse one header record: 0x63 marker, tag string, offset table.

    Returns [tag, [[offset, decompressedLength, compressedLength], ...]].
    """
    marker = ord(bookFile.read(1))
    if marker != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    tag = bookReadString()
    return [tag, bookReadHeaderRecordData()]
#
# Parse the header of a Topaz file, get all the header records and the offset for the payload
#
def parseTopazHeader():
    # Verify the TPZ0 magic, read every header record into the global
    # bookHeaderRecords dict, then remember where the payload begins.
    global bookHeaderRecords
    global bookPayloadOffset
    magic = unpack("4s",bookFile.read(4))[0]
    if magic != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
    nbRecords = bookReadEncodedNumber()
    bookHeaderRecords = {}
    for i in range (0,nbRecords):
        result = parseTopazHeaderRecord()
        print result[0], result[1]
        bookHeaderRecords[result[0]] = result[1]
    # A 0x64 byte terminates the header section.
    if ord(bookFile.read(1)) != 0x64 :
        raise CMBDTCFatal("Parse Error : Invalid Header")
    bookPayloadOffset = bookFile.tell()
#
# Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
# Correction, the record is correctly decompressed too
#
def getBookPayloadRecord(name, index):
    # Fetch record `index` of section `name` from the payload, decrypting
    # it with the global bookKey when the stored index is negative and
    # decompressing it when compressedLength is non-zero.
    encrypted = False
    compressed = False
    try:
        recordOffset = bookHeaderRecords[name][index][0]
    except:
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
    bookFile.seek(bookPayloadOffset + recordOffset)
    tag = bookReadString()
    if tag != name :
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
    recordIndex = bookReadEncodedNumber()
    # A negative stored index marks an encrypted record.
    if recordIndex < 0 :
        encrypted = True
        recordIndex = -recordIndex -1
    if recordIndex != index :
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
    # Read compressedLength bytes when compressed, else decompressedLength.
    if (bookHeaderRecords[name][index][2] > 0):
        compressed = True
        record = bookFile.read(bookHeaderRecords[name][index][2])
    else:
        record = bookFile.read(bookHeaderRecords[name][index][1])
    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record,ctx)
    if compressed:
        record = zlib.decompress(record)
    return record
#
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
#
def extractBookPayloadRecord(name, index, filename):
    """Extract one payload record and write it to `filename` ("" => print).

    Bug fix: when the record lookup failed the original printed a message
    but fell through and hit a NameError on the unbound `record`; it now
    returns after reporting.  The commented-out decompression block was
    removed - getBookPayloadRecord() already decompresses.
    """
    try:
        record = getBookPayloadRecord(name, index)
    except Exception:
        print("Could not find record")
        return
    if filename != "":
        try:
            with open(filename, "wb") as out:
                out.write(record)
        except IOError:
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
#
# return next record [key,value] from the book metadata from the current book position
#
def readMetadataRecord():
    # A metadata record is simply two length-prefixed strings: [key, value].
    return [bookReadString(),bookReadString()]
#
# Parse the metadata record from the book payload and return a list of [key,values]
#
def parseMetadata():
    # Seek to the "metadata" section of the payload and load every
    # key/value record into the global bookMetadata dict.
    global bookHeaderRecords
    global bookPayloadAddress
    # NOTE(review): bookPayloadAddress appears unused here; the offset
    # actually read is bookPayloadOffset.
    global bookMetadata
    bookMetadata = {}
    bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
    tag = bookReadString()
    if tag != "metadata" :
        raise CMBDTCFatal("Parse Error : Record Names Don't Match")
    flags = ord(bookFile.read(1))
    nbRecords = ord(bookFile.read(1))
    for i in range (0,nbRecords) :
        record =readMetadataRecord()
        bookMetadata[record[0]] = record[1]
#
# Returns the two bits at offset from a bit field
#
def getTwoBitsFromBitField(bitField, offset):
    """Return the 2-bit value at position `offset` in a packed bit field.

    Each byte of bitField holds four 2-bit values, most significant first.
    """
    shift = 6 - 2 * (offset % 4)
    return (ord(bitField[offset // 4]) >> shift) & 3
#
# Returns the six bits at offset from a bit field
#
def getSixBitsFromBitField(bitField, offset):
    """Return the offset-th 6-bit group, assembled from three 2-bit reads."""
    base = offset * 3
    hi = getTwoBitsFromBitField(bitField, base)
    mid = getTwoBitsFromBitField(bitField, base + 1)
    lo = getTwoBitsFromBitField(bitField, base + 2)
    return (hi << 4) + (mid << 2) + lo
#
# 8 bits to six bits encoding from hash to generate PID string
#
def encodePID(hash):
    """Map the first 48 bits of `hash` to an 8-character PID via charMap3."""
    global charMap3
    return "".join(charMap3[getSixBitsFromBitField(hash, i)] for i in range(8))
#
# Context initialisation for the Topaz Crypto
#
def topazCryptoInit(key):
    """Derive the two 32-bit state words of the Topaz stream cipher from key.

    NOTE(review): an empty key raises NameError in the original (the second
    word is never bound); this rewrite keeps that behaviour.
    """
    state = 0x0CAFFE19E
    for keyChar in key:
        k = ord(keyChar)
        prev = state
        state = ((((state >> 2) * (state >> 7)) & 0xFFFFFFFF)
                 ^ (k * k * 0x0F902007) & 0xFFFFFFFF)
    return [state, prev]
#
# decrypt data with the context prepared by topazCryptoInit()
#
def topazCryptoDecrypt(data, ctx):
    """Run the Topaz stream cipher over `data` using the state pair `ctx`.

    The keystream byte mixes both state words; the state then advances
    with the *plaintext* byte, which is why decryption is not symmetric
    with encryption.
    """
    s1, s2 = ctx[0], ctx[1]
    out = []
    for ch in data:
        c = ord(ch)
        plain = (c ^ ((s1 >> 3) & 0xFF) ^ ((s2 << 3) & 0xFF)) & 0xFF
        s2 = s1
        s1 = (((s1 >> 2) * (s1 >> 7)) & 0xFFFFFFFF) ^ ((plain * plain * 0x0F902007) & 0xFFFFFFFF)
        out.append(chr(plain))
    return "".join(out)
#
# Decrypt a payload record with the PID
#
def decryptRecord(data,PID):
    # Convenience wrapper: key the Topaz cipher with PID, then decrypt data.
    ctx = topazCryptoInit(PID)
    return topazCryptoDecrypt(data, ctx)
#
# Try to decrypt a dkey record (contains the book PID)
#
def decryptDkeyRecord(data,PID):
    # A dkey record decrypts to: "PID" <len=8> <pid bytes> <len=8> <book key> "pid".
    # Raises CMBDTCError when the magic strings, length bytes or embedded
    # PID don't match - i.e. this PID does not unlock this record.
    record = decryptRecord(data,PID)
    fields = unpack("3sB8sB8s3s",record)
    if fields[0] != "PID" or fields[5] != "pid" :
        raise CMBDTCError("Didn't find PID magic numbers in record")
    elif fields[1] != 8 or fields[3] != 8 :
        raise CMBDTCError("Record didn't contain correct length fields")
    elif fields[2] != PID :
        raise CMBDTCError("Record didn't contain PID")
    # fields[4] is the 8-byte book key.
    return fields[4]
#
# Decrypt all the book's dkey records (contain the book PID)
#
def decryptDkeyRecords(data, PID):
    """Walk the concatenated dkey records and return every book key the
    given PID successfully decrypts (possibly an empty list)."""
    nbKeyRecords = ord(data[0])
    remaining = data[1:]
    keys = []
    for _ in range(nbKeyRecords):
        length = ord(remaining[0])
        try:
            keys.append(decryptDkeyRecord(remaining[1:length + 1], PID))
        except CMBDTCError:
            # This PID doesn't unlock this record - try the next one.
            pass
        remaining = remaining[1 + length:]
    return keys
#
# Encryption table used to generate the device PID
#
def generatePidEncryptionTable():
    """Build the 256-entry CRC-32 lookup table (polynomial 0xEDB88320)."""
    table = []
    for seed in range(0x100):
        value = seed
        for _ in range(8):
            carry = value & 1
            value >>= 1
            if carry:
                value ^= 0xEDB88320
        table.append(value)
    return table
#
# Seed value used to generate the device PID
#
def generatePidSeed(table, dsn):
    """CRC-accumulate the first four DSN bytes (zero initial value)."""
    value = 0
    for counter in range(4):
        index = (ord(dsn[counter]) ^ value) & 0xFF
        value = (value >> 8) ^ table[index]
    return value
#
# Generate the device PID
#
def generateDevicePID(table,dsn,nbRoll):
    # Derive the 8-character device PID: seed from the DSN, duplicate the
    # seed bytes into an 8-byte state, XOR-roll in nbRoll DSN bytes, then
    # map each state byte into the charMap4 alphabet.
    seed = generatePidSeed(table,dsn)
    pidAscii = ""
    pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
    index = 0
    for counter in range (0,nbRoll):
        pid[index] = pid[index] ^ ord(dsn[counter])
        index = (index+1) %8
    # Fold each byte down to an index below len(charMap4) (34 symbols).
    for counter in range (0,8):
        index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
        pidAscii += charMap4[index]
    return pidAscii
#
# Create decrypted book payload
#
def createDecryptedPayload(payload):
    # Write every non-dkey record to disk under `payload`, using .jpg for
    # image records and .dat otherwise; img/page/glyphs records go into
    # their own subdirectories (created beforehand by createDecryptedBook).
    for headerRecord in bookHeaderRecords:
        name = headerRecord
        if name != "dkey" :
            ext = '.dat'
            if name == 'img' : ext = '.jpg'
            for index in range (0,len(bookHeaderRecords[name])) :
                fnum = "%04d" % index
                fname = name + fnum + ext
                destdir = payload
                if name == 'img':
                    destdir = os.path.join(payload,'img')
                if name == 'page':
                    destdir = os.path.join(payload,'page')
                if name == 'glyphs':
                    destdir = os.path.join(payload,'glyphs')
                outputFile = os.path.join(destdir,fname)
                file(outputFile, 'wb').write(getBookPayloadRecord(name, index))
# Create decrypted book
#
def createDecryptedBook(outdir):
    """Create `outdir` and its img/page/glyphs subdirectories, then dump
    the decrypted payload records into them."""
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for sub in ('img', 'page', 'glyphs'):
        destdir = os.path.join(outdir, sub)
        if not os.path.exists(destdir):
            os.makedirs(destdir)
    createDecryptedPayload(outdir)
#
# Set the command to execute by the program according to cmdLine parameters
#
def setCommand(name):
    """Record the single action requested on the command line; requesting
    a second action is a usage error."""
    global command
    if command != "":
        raise CMBDTCFatal("Invalid command line parameters")
    command = name
#
# Program usage
#
def usage():
    """Print the command line help text."""
    for line in (
        "\nUsage:",
        "\ncmbtc_dump.py [options] bookFileName\n",
        "-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)",
        "-d Dumps the unencrypted book as files to outdir",
        "-o Output directory to save book files to",
        "-v Verbose (can be used several times)",
        "-i Prints kindle.info database",
    ):
        print(line)
#
# Main
#
def main(argv=sys.argv):
    """Command line entry point: parse options, derive the device and book
    PIDs, recover the book key and run the requested command."""
    global kindleDatabase
    global bookMetadata
    global bookKey
    global bookFile
    global command
    progname = os.path.basename(argv[0])
    verbose = 0
    recordName = ""
    recordIndex = 0
    outdir = ""
    PIDs = []
    kindleDatabase = None
    command = ""
    try:
        opts, args = getopt.getopt(sys.argv[1:], "vi:o:p:d")
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)
    for o, a in opts:
        if o == "-v":
            verbose+=1
        if o == "-i":
            setCommand("printInfo")
        if o =="-o":
            if a == None :
                raise CMBDTCFatal("Invalid parameter for -o")
            outdir = a
        if o =="-p":
            PIDs.append(a)
        if o =="-d":
            setCommand("doit")
    if command == "" :
        raise CMBDTCFatal("No action supplied on command line")
    #
    # Read the encrypted database
    #
    try:
        kindleDatabase = parseKindleInfo()
    except Exception as message:
        if verbose>0:
            print(message)
    if kindleDatabase != None :
        if command == "printInfo" :
            printKindleInfo()
        #
        # Compute the DSN
        #
        # Get the Mazama Random number
        MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
        # Get the HDD serial
        encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1)
        # Get the current user name
        encodedUsername = encodeHash(GetUserName(),charMap1)
        # concat, hash and encode
        DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
        if verbose >1:
            print("DSN: " + DSN)
        #
        # Compute the device PID
        #
        table = generatePidEncryptionTable()
        devicePID = generateDevicePID(table,DSN,4)
        PIDs.append(devicePID)
        if verbose > 0:
            print("Device PID: " + devicePID)
    #
    # Open book and parse metadata
    #
    if len(args) == 1:
        bookFile = openBook(args[0])
        parseTopazHeader()
        parseMetadata()
        #
        # Compute book PID
        #
        # Get the account token
        if kindleDatabase != None:
            kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
            if verbose >1:
                print("Account Token: " + kindleAccountToken)
            keysRecord = bookMetadata["keys"]
            keysRecordRecord = bookMetadata[keysRecord]
            pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord)
            bookPID = encodePID(pidHash)
            PIDs.append(bookPID)
            if verbose > 0:
                print ("Book PID: " + bookPID )
        #
        # Decrypt book key
        #
        dkey = getBookPayloadRecord('dkey', 0)
        bookKeys = []
        for PID in PIDs :
            bookKeys+=decryptDkeyRecords(dkey,PID)
        if len(bookKeys) == 0 :
            if verbose > 0 :
                print ("Book key could not be found. Maybe this book is not registered with this device.")
        else :
            bookKey = bookKeys[0]
            if verbose > 0:
                print("Book key: " + bookKey.encode('hex'))
            # NOTE(review): no option ever installs "printRecord", and
            # recordName/outputFile are never assigned for this path, so
            # this branch would raise NameError if it were ever reached.
            if command == "printRecord" :
                extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
                if outputFile != "" and verbose>0 :
                    print("Wrote record to file: "+outputFile)
            elif command == "doit" :
                if outdir != "" :
                    createDecryptedBook(outdir)
                    if verbose >0 :
                        print ("Decrypted book saved. Don't pirate!")
                elif verbose > 0:
                    print("Output directory name was not supplied.")
    return 0
# Standard script entry point.
if __name__ == '__main__':
    sys.exit(main())

View file

@ -0,0 +1,821 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
# Get a 7 bit encoded number from string. The most
# significant byte comes first and has the high bit (8th) set
def readEncodedNumber(file):
    """Read a 7-bit big-endian variable-length integer from `file`.

    Returns None on end of file.  A leading 0xFF flags a negative value;
    bytes >= 0x80 are continuation bytes holding 7 payload bits each.
    """
    negative = False
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)
    if data == 0xFF:
        negative = True
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)
    if data >= 0x80:
        accum = data & 0x7F
        while data >= 0x80:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            accum = (accum << 7) + (data & 0x7F)
        data = accum
    if negative:
        data = -data
    return data
# returns a binary string that encodes a number into 7 bits
# most significant byte first which has the high bit set
def encodeNumber(number):
    """Encode an integer into the 7-bit big-endian variable-length format
    understood by readEncodedNumber (0xFF prefix marks negatives)."""
    negative = number < 0
    if negative:
        number = -number + 1
    pieces = []
    high_bit = 0
    while True:
        pieces.append(chr((number & 0x7F) | high_bit))
        number >>= 7
        high_bit = 0x80
        if number == 0:
            break
    if negative:
        pieces.append(chr(0xFF))
    return "".join(reversed(pieces))
# create / read a length prefixed string from the file
def lengthPrefixString(data):
    # Prefix data with its 7-bit encoded length.
    return encodeNumber(len(data))+data
def readString(file):
    """Read a length-prefixed string from `file`; "" on EOF or short read."""
    stringLength = readEncodedNumber(file)
    if stringLength == None:
        return ""
    raw = file.read(stringLength)
    if len(raw) != stringLength:
        return ""
    return unpack(str(stringLength) + "s", raw)[0]
# convert a binary string generated by encodeNumber (7 bit encoded number)
# to the value you would find inside the page*.dat files to be processed
def convert(i):
    """Hex-dump encodeNumber(i) - the byte form found in page*.dat files."""
    return ''.join('%02x' % ord(ch) for ch in encodeNumber(i))
# the complete string table used to store all book text content
# as well as the xml tokens and values that make sense out of it
class Dictionary(object):
    """The book's string table: all text content plus the xml tokens and
    attribute values used to make sense of it."""
    def __init__(self, dictFile):
        # Load and xml-escape every string from the dict file up front.
        self.filename = dictFile
        self.size = 0
        self.fo = file(dictFile,'rb')
        self.stable = []
        self.size = readEncodedNumber(self.fo)
        for i in xrange(self.size):
            self.stable.append(self.escapestr(readString(self.fo)))
        self.pos = 0
    def escapestr(self, str):
        # Escape xml-special characters so entries can be emitted verbatim.
        str = str.replace('&','&amp;')
        str = str.replace('<','&lt;')
        str = str.replace('>','&gt;')
        str = str.replace('=','&#61;')
        return str
    def lookup(self,val):
        # Return entry `val`, remembering it as the current position;
        # out-of-range indices abort the program.
        if ((val >= 0) and (val < self.size)) :
            self.pos = val
            return self.stable[self.pos]
        else:
            print "Error - %d outside of string table limits" % val
            sys.exit(-1)
    def getSize(self):
        # Number of entries in the table.
        return self.size
    def getPos(self):
        # Index of the most recently looked-up entry.
        return self.pos
    def dumpDict(self):
        # Debug helper: print index, encoded byte form and string.
        for i in xrange(self.size):
            print "%d %s %s" % (i, convert(i), self.stable[i])
        return
# parses the xml snippets that are represented by each page*.dat file.
# also parses the other0.dat file - the main stylesheet
# and information used to inject the xml snippets into page*.dat files
class PageParser(object):
    def __init__(self, filename, dict, debug, flat_xml):
        # One PageParser per page*.dat (or other0.dat) file.
        self.fo = file(filename,'rb')              # open data file
        self.id = os.path.basename(filename).replace('.dat','')
        self.dict = dict                           # shared Dictionary instance
        self.debug = debug
        self.flat_xml = flat_xml
        self.tagpath = []                          # stack of currently open tag names
        self.doc = []
        self.snippetList = []                      # filled by doLoop72
# hash table used to enable the decoding process
# This has all been developed by trial and error so it may still have omissions or
# contain errors
# Format:
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = {
'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'links' : (0, 'number', 0, 1),
'pages' : (0, 'number', 0, 1),
'page' : (1, 'snippets', 1, 0),
'group' : (1, 'snippets', 1, 0),
'region' : (1, 'snippets', 1, 0),
'reflow' : (1, 'number', 1, 0),
'img' : (1, 'snippets', 1, 0),
'paragraph' : (1, 'snippets', 1, 0),
'extratokens' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'rule' : (1, 'snippets', 1, 0),
'_span' : (1, 'snippets', 1, 0),
'word_semantic': (1, 'snippets', 1, 1),
'value' : (1, 'scalar_text', 0, 0),
'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0),
'x' : (1, 'number', 0, 0),
'y' : (1, 'number', 0, 0),
'links.page' : (1, 'number', 0, 0),
'link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1),
'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0),
'sh' : (1, 'number', 0, 0),
'word' : (0, 'number', 1, 1),
'src' : (1, 'scalar_number', 0, 0),
'rel' : (1, 'number', 0, 0),
'row' : (1, 'number', 0, 0),
'startID' : (1, 'number', 0, 1),
'startID.page' : (1, 'number', 0, 0),
'glyphID' : (1, 'number', 0, 0),
'rootID' : (1, 'number', 0, 0),
'stemID' : (1, 'number', 0, 0),
'margin-top' : (1, 'number', 0, 0),
'stemPage' : (1, 'number', 0, 0),
'dehyphen' : (1, 'number', 1, 1),
'rootID' : (1, 'number', 0, 0),
'paraCont' : (1, 'number', 1, 1),
'paraStems' : (1, 'number', 1, 1),
'wordStems' : (1, 'number', 1, 1),
'original' : (0, 'number', 0, 1),
'use' : (1, 'number', 0, 0),
'vtx' : (1, 'number', 0, 1),
'len' : (1, 'number', 0, 1),
'dpi' : (1, 'number', 0, 0),
'n' : (1, 'number', 0, 0),
'id' : (1, 'number', 0, 0),
'ref' : (1, 'number', 0, 0),
'pnum' : (1, 'number', 0, 0),
'pid' : (1, 'text', 0, 0),
'info' : (0, 'number', 1, 0),
'bl' : (1, 'raw', 0, 0),
'firstGlyph' : (1, 'raw', 0, 0),
'lastGlyph' : (1, 'raw', 0, 0),
'ocrText' : (1, 'text', 0, 0),
'title' : (1, 'text', 0, 0),
'href' : (1, 'text', 0, 0),
'_parent_type' : (1, 'text', 0, 0),
'attr' : (1, 'scalar_text', 0, 0),
'justify' : (1, 'scalar_text', 0, 0),
'align' : (1, 'scalar_text', 0, 0),
'layout' : (1, 'scalar_text', 0, 0),
'pageid' : (1, 'scalar_text', 0, 0),
'pagelabel' : (1, 'scalar_text', 0, 0),
'type' : (1, 'text', 0, 0),
'class' : (1, 'scalar_text', 0, 0),
'container' : (1, 'scalar_text', 0, 0),
'_after_class' : (1, 'scalar_text', 0, 0),
'_tag' : (1, 'scalar_text', 0, 0),
'pos' : (1, 'scalar_text', 0, 0),
'page_num' : (1, 'scalar_text', 0, 0),
'page_type' : (1, 'scalar_text', 0, 0),
'findlists' : (1, 'scalar_text', 0, 0),
'FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'Schema_id' : (1, 'scalar_text', 0, 0),
'Schema_version' : (1, 'scalar_text', 0, 0),
'Topaz_version' : (1, 'scalar_text', 0, 0),
'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'chapterheaders' : (1, 'scalar_text', 0, 0),
'creation_date' : (1, 'scalar_text', 0, 0),
'header_footer' : (1, 'scalar_text', 0, 0),
'init_from_ocr' : (1, 'scalar_text', 0, 0),
'letter_insertion' : (1, 'scalar_text', 0, 0),
'xmlinj_convert' : (1, 'scalar_text', 0, 0),
'xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'xmlinj_transform' : (1, 'scalar_text', 0, 0),
}
# full tag path record keeping routines
    def tag_push(self, token):
        # Push token onto the full-tag-path stack.
        self.tagpath.append(token)
def tag_pop(self):
if len(self.tagpath) > 0 :
self.tagpath.pop()
    def tagpath_len(self):
        # Current depth of the tag-path stack.
        return len(self.tagpath)
def get_tagpath(self, i):
cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i]
for j in xrange(i+1, cnt) :
result += '.' + self.tagpath[j]
return result
# list of absolute command byte values values that indicate
# various types of loop meachanisms typically used to generate vectors
cmd_list = (0x76, 0x76)
# peek at and return 1 byte that is ahead by i bytes
def peek(self, aheadi):
c = self.fo.read(aheadi)
if (len(c) == 0):
return None
self.fo.seek(-aheadi,1)
c = c[-1:]
return ord(c)
# get the next value from the file being processed
def getNext(self):
nbyte = self.peek(1);
if (nbyte == None):
return None
val = readEncodedNumber(self.fo)
return val
# format an arg by argtype
    def formatArg(self, arg, argtype):
        # Text arguments are indices into the string table; numeric and
        # snippet arguments pass through unchanged.  Unknown types abort.
        if (argtype == 'text') or (argtype == 'scalar_text') :
            result = self.dict.lookup(arg)
        elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') :
            result = arg
        elif (argtype == 'snippets') :
            result = arg
        else :
            print "Error Unknown argtype %s" % argtype
            sys.exit(-2)
        return result
# process the next tag token, recursively handling subtags,
# arguments, and commands
    def procToken(self, token):
        # Process one tag token: resolve it (or a suffix of the current tag
        # path, so qualified names like 'links.page' can match) against
        # token_tags, recursively read any subtags, then read and format
        # its arguments.  Returns [tagpath, subtags, argtype, args]; an
        # unknown token yields [].
        known_token = False
        self.tag_push(token)
        if self.debug : print 'Processing: ', self.get_tagpath(0)
        cnt = self.tagpath_len()
        for j in xrange(cnt):
            tkn = self.get_tagpath(j)
            if tkn in self.token_tags :
                num_args = self.token_tags[tkn][0]
                argtype = self.token_tags[tkn][1]
                subtags = self.token_tags[tkn][2]
                splcase = self.token_tags[tkn][3]
                ntags = -1
                known_token = True
                break
        if known_token :
            # handle subtags if present
            subtagres = []
            if (splcase == 1):
                # this type of tag uses of escape marker 0x74 indicate subtag count
                if self.peek(1) == 0x74:
                    skip = readEncodedNumber(self.fo)
                    subtags = 1
                    num_args = 0
            if (subtags == 1):
                ntags = readEncodedNumber(self.fo)
                if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
                for j in xrange(ntags):
                    val = readEncodedNumber(self.fo)
                    subtagres.append(self.procToken(self.dict.lookup(val)))
            # arguments can be scalars or vectors of text or numbers
            argres = []
            if num_args > 0 :
                firstarg = self.peek(1)
                # A loop opcode means a single variable-length vector
                # (scalar argtypes are never vectors).
                if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
                    # single argument is a variable length vector of data
                    arg = readEncodedNumber(self.fo)
                    argres = self.decodeCMD(arg,argtype)
                else :
                    # num_arg scalar arguments
                    for i in xrange(num_args):
                        argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))
            # build the return tag
            result = []
            tkn = self.get_tagpath(0)
            result.append(tkn)
            result.append(subtagres)
            result.append(argtype)
            result.append(argres)
            self.tag_pop()
            return result
        # all tokens that need to be processed should be in the hash
        # table if it may indicate a problem, either new token
        # or an out of sync condition
        else:
            result = []
            if (self.debug):
                print 'Unknown Token:', token
            self.tag_pop()
            return result
# special loop used to process code snippets
# it is NEVER used to format arguments.
# builds the snippetList
    def doLoop72(self, argtype):
        # Top-level snippet loop (command 0x72): read cnt snippets, each a
        # full token tree, and collect [index, tree] pairs in snippetList.
        cnt = readEncodedNumber(self.fo)
        if self.debug :
            result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print result
        for i in xrange(cnt):
            if self.debug: print 'Snippet:',str(i)
            snippet = []
            snippet.append(i)
            val = readEncodedNumber(self.fo)
            snippet.append(self.procToken(self.dict.lookup(val)))
            self.snippetList.append(snippet)
        return
# loop: pass though values unchanged
# DO NOT CHANGE - this has proven to be correct
def doLoop76Mode0(self, argtype, cnt):
result = []
for i in xrange(cnt):
result.append(self.formatArg(readEncodedNumber(self.fo), argtype))
return result
# loop generating values relative to the *negative*
# of the offset - don't ask why - it just is
# DO NOT CHANGE - this has proven to be correct
def doLoop76Mode1(self, argtype, cnt):
result = []
offset = -readEncodedNumber(self.fo)
for i in xrange(cnt):
val = readEncodedNumber(self.fo) + offset
result.append(self.formatArg(val, argtype))
return result
# loop generating values with starting value and accumulation
# DO NOT CHANGE - this has proven to be the correct
def doLoop76Mode2(self, argtype, cnt):
result = []
ptr = readEncodedNumber(self.fo)
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
ptr = ptr + readEncodedNumber(self.fo)
result.append(self.formatArg(ptr, argtype))
return result
# loop generating values with starting value and accumulation
# **after** subtracting adjustment value from each
# DO NOT CHANGE - this has been proven to be correct
def doLoop76Mode3(self, argtype, cnt):
result = []
adj = readEncodedNumber(self.fo)
ptr = readEncodedNumber(self.fo)
ptr = ptr - adj
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
ptr = ptr + readEncodedNumber(self.fo) - adj
result.append(self.formatArg(ptr,argtype))
return result
# loop using running sum of data values and starting value
# with accumulation to get new value
# Again, don't ask it took me forever to figure this out
# DO NOT CHANGE - this has been proven to be correct
def doLoop76Mode4(self, argtype, cnt):
result = []
val = readEncodedNumber(self.fo)
runsum = val
ptr = val
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
runsum += readEncodedNumber(self.fo)
ptr = ptr + runsum
result.append(self.formatArg(ptr,argtype))
return result
    # loop using an extra value as an adjustment
    # and a running sum of the values after subtracting
    # the adjustment, added to a ptr to get a new pointer
    def doLoop76Mode5(self, argtype, cnt):
        # Mode 5: like mode 4 but with a leading adjustment subtracted from
        # every raw value before it enters the running sum; ptr starts at 0.
        result = []
        adj = readEncodedNumber(self.fo)
        ptr = 0
        runsum = 0
        for i in xrange(cnt):
            val = readEncodedNumber(self.fo)
            runsum += (val - adj)
            ptr = ptr + runsum
            result.append(self.formatArg(ptr,argtype))
        return result
    # FIXME: I have only 4 points to work this out with inside my book
    # So may be wrong but it is correct for my 4 points
    def doLoop76Mode6(self, argtype, cnt):
        # Mode 6: ptr = 3*previous_raw_value + value + 1.  NOTE(review): the
        # formula was reverse-engineered from very few samples (see FIXME) -
        # treat the recurrence as tentative, not proven.
        result = []
        oldval = 0
        for i in xrange(cnt):
            val = readEncodedNumber(self.fo)
            ptr = (3 * oldval) + val + 1
            result.append(self.formatArg(ptr,argtype))
            oldval = val
        return result
# dispatches loop commands bytes with various modes
# The 0x76 style loops are used to build vectors
# This was all derived by trial and error and
# new loop types may exist that are not handled here
# since they did not appear in the test cases
def decodeCMD(self, cmd, argtype):
# if (cmd == 0x72):
# self.doLoop72(argtype)
# result =[]
# return result
if (cmd == 0x76):
# loop with cnt, and mode to control loop styles
cnt = readEncodedNumber(self.fo)
mode = readEncodedNumber(self.fo)
if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
if (mode == 0x00):
return self.doLoop76Mode0(argtype, cnt)
elif (mode == 0x01):
return self.doLoop76Mode1(argtype, cnt)
elif (mode == 0x02):
return self.doLoop76Mode2(argtype, cnt)
elif (mode == 0x03):
return self.doLoop76Mode3(argtype, cnt)
elif (mode == 0x04):
return self.doLoop76Mode4(argtype, cnt)
elif (mode == 0x05):
return self.doLoop76Mode5(argtype, cnt)
elif (mode == 0x06):
return self.doLoop76Mode6(argtype, cnt)
else:
if self.debug :
# try to mark any unknown loop comands
# if they exist, unless they are used to process
# text or some other known list, we won't be able to prove them correct
print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode)
for i in xrange(cnt):
val = readEncodedNumber(self.fo)
print ' 0x%x' % val,
print ' '
result = []
return result
if self.dbug: print "Unknown command", cmd
result = []
return result
# add full tag path to injected snippets
def updateName(self, tag, prefix):
name = tag[0]
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nname = prefix + '.' + name
nsubtaglist = []
for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix))
ntag = []
ntag.append(nname)
ntag.append(nsubtaglist)
ntag.append(argtype)
ntag.append(argList)
return ntag
# perform depth first injection of specified snippets into this one
def injectSnippets(self, snippet):
snipno, tag = snippet
name = tag[0]
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nsubtagList = []
if len(argList) > 0 :
for j in argList:
asnip = self.snippetList[j]
aso, atag = self.injectSnippets(asnip)
atag = self.updateName(atag, name)
nsubtagList.append(atag)
argtype='number'
argList=[]
if len(nsubtagList) > 0 :
subtagList.extend(nsubtagList)
tag = []
tag.append(name)
tag.append(subtagList)
tag.append(argtype)
tag.append(argList)
snippet = []
snippet.append(snipno)
snippet.append(tag)
return snippet
# format the tag for output
def formatTag(self, node):
name = node[0]
subtagList = node[1]
argtype = node[2]
argList = node[3]
fullpathname = name.split('.')
nodename = fullpathname.pop()
ilvl = len(fullpathname)
indent = ' ' * (3 * ilvl)
result = indent + '<' + nodename + '>'
if len(argList) > 0:
argres = ''
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
argres += j + '|'
else :
argres += str(j) + ','
argres = argres[0:-1]
if argtype == 'snippets' :
result += 'snippets:' + argres
else :
result += argres
if len(subtagList) > 0 :
result += '\n'
for j in subtagList:
if len(j) > 0 :
result += self.formatTag(j)
result += indent + '</' + nodename + '>\n'
else:
result += '</' + nodename + '>\n'
return result
# flatten tag
def flattenTag(self, node):
name = node[0]
subtagList = node[1]
argtype = node[2]
argList = node[3]
result = name
if (len(argList) > 0):
argres = ''
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
argres += j + '|'
else :
argres += str(j) + '|'
argres = argres[0:-1]
if argtype == 'snippets' :
result += '.snippets=' + argres
else :
result += '=' + argres
result += '\n'
for j in subtagList:
if len(j) > 0 :
result += self.flattenTag(j)
return result
# reduce create xml output
def formatDoc(self, flat_xml):
result = ''
for j in self.doc :
if len(j) > 0:
if flat_xml:
result += self.flattenTag(j)
else:
result += self.formatTag(j)
if self.debug : print result
return result
# main loop - parse the page.dat files
# to create structured document and snippets
# FIXME: value at end of magic appears to be a subtags count
# but for what? For now, inject an 'info" tag as it is in
# every dictionary and seems close to what is meant
# The alternative is to special case the last _ "0x5f" to mean something
def process(self):
# peek at the first bytes to see what type of file it is
magic = self.fo.read(11)
if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
skip = self.fo.read(1)
first_token = 'info'
else :
# other0.dat file
first_token = None
self.fo.seek(-11,1)
# main loop to read and build the document tree
while True:
if first_token != None :
# use "inserted" first token 'info' for page and glyph files
tag = self.procToken(first_token)
if len(tag) > 0 :
self.doc.append(tag)
first_token = None
v = self.getNext()
if (v == None):
break
if (v == 0x72):
self.doLoop72('number')
elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 :
self.doc.append(tag)
else:
if self.debug:
print "Mina Loop: Unknown value: %x" % v
# now do snippet injection
if len(self.snippetList) > 0 :
if self.debug : print 'Injecting Snippets:'
snippet = self.injectSnippets(self.snippetList[0])
snipno = snippet[0]
tag_add = snippet[1]
if self.debug : print self.formatTag(tag_add)
if len(tag_add) > 0:
self.doc.append(tag_add)
# handle generation of xml output
xmlpage = self.formatDoc(self.flat_xml)
return xmlpage
def usage():
    # print command-line help for convert2xml.py
    print 'Usage: '
    print ' convert2xml.py dict0000.dat infile.dat '
    print ' '
    print ' Options:'
    print ' -h print this usage help message '
    print ' -d turn on debug output to check for potential errors '
    print ' --flat-xml output the flattened xml page description only '
    print ' '
    print ' This program will attempt to convert a page*.dat file or '
    print ' glyphs*.dat file, using the dict0000.dat file, to its xml description. '
    print ' '
    print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump '
    print ' the *.dat files from a Topaz format e-book.'
#
# Main
#
def main(argv):
    """Convert one page/glyphs/other .dat file to its xml description.

    Called with '' it behaves as a script (parses sys.argv, prints the
    result); called with an argument string it returns the xml instead -
    this is how genxml.py/genhtml.py/gensvg.py reuse it as a library.
    """
    dictFile = ""
    pageFile = ""
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv
    else :
        argv = argv.split()
    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)
    for o, a in opts:
        if o =="-d":
            debug=True
        if o =="-h":
            usage()
            sys.exit(0)
        if o =="--flat-xml":
            flat_xml = True
    dictFile, pageFile = args[0], args[1]
    # read in the string table dictionary
    # (note: name shadows the builtin 'dict')
    dict = Dictionary(dictFile)
    # create a page parser
    pp = PageParser(pageFile, dict, debug, flat_xml)
    xmlpage = pp.process()
    if printOutput:
        print xmlpage
        return 0
    return xmlpage

if __name__ == '__main__':
    sys.exit(main(''))

View file

@ -0,0 +1,109 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
#
# Get a 7 bit encoded number from the file
#
def readEncodedNumber(file):
    """Decode one variable-length integer from *file*.

    Format: big-endian groups of 7 bits; every byte except the last has
    its high bit set.  A leading 0xFF byte marks a negative value.
    Returns None if the stream ends mid-number.
    """
    first = file.read(1)
    if not first:
        return None
    byte = ord(first)
    negative = (byte == 0xFF)
    if negative:
        nxt = file.read(1)
        if not nxt:
            return None
        byte = ord(nxt)
    value = byte & 0x7F
    while byte >= 0x80:
        nxt = file.read(1)
        if not nxt:
            return None
        byte = ord(nxt)
        value = (value << 7) | (byte & 0x7F)
    return -value if negative else value
#
# Encode a number in 7 bit format
#
def encodeNumber(number):
    """Encode an integer in the big-endian 7-bits-per-byte format.

    Each byte except the last carries a 0x80 continuation bit; negative
    values are marked with a leading 0xFF byte.  NOTE(review): negatives
    are encoded as -(n)+1, which is asymmetric with readEncodedNumber -
    preserved as-is since the on-disk format depends on it.
    """
    negative = number < 0
    if negative:
        number = -number + 1
    chunks = []
    continuation = 0
    while True:
        chunks.append(chr((number & 0x7F) + continuation))
        number >>= 7
        continuation = 0x80
        if number == 0:
            break
    if negative:
        chunks.append(chr(0xFF))
    return ''.join(reversed(chunks))
#
# Build a length prefixed string
#
def lengthPrefixString(data):
    # prefix data with its 7-bit-encoded length
    return encodeNumber(len(data))+data
def readString(file):
    """Read one length-prefixed string from *file*.

    Returns None at end of stream, "" if the payload is truncated,
    otherwise the raw string bytes.
    """
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return None
    sv = file.read(stringLength)
    if len(sv) != stringLength:
        return ""
    # previously round-tripped through struct.unpack("Ns"), which returns
    # the input unchanged - return the bytes directly
    return sv
def getMetaArray(metaFile):
    """Parse a Topaz metadata0000.dat file into a dict of name -> value.

    Layout: an encoded count, then that many (name, value) string pairs.
    """
    result = {}
    # 'with open(...)' replaces the leaked py2 'file(...)' handle
    # (from __future__ import with_statement is at the top of this module)
    with open(metaFile, 'rb') as fo:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            temp = readString(fo)
            result[temp] = readString(fo)
    return result
def getMetaData(metaFile):
    """Render a Topaz metadata0000.dat file as 'name|value' text lines."""
    result = ''
    # 'with open(...)' replaces the leaked py2 'file(...)' handle
    with open(metaFile, 'rb') as fo:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            result += readString(fo) + '|'
            result += readString(fo) + '\n'
    return result

View file

@ -0,0 +1,299 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
    """Converts one page's flattened xml description (from convert2xml
    --flat-xml) into an html fragment, using the embedded ocr text."""

    def __init__(self, flatxml, fileid):
        # page id derived from the source file name; used as an html anchor
        self.id = os.path.basename(fileid).replace('.dat','')
        # one flattened 'dotted.path=value' entry per line
        self.flatdoc = flatxml.split('\n')
        # per-word ocr text for the page
        self.ocrtext = []
        # per-word link ids plus link target pages/titles
        self.link_id = []
        self.link_title = []
        self.link_page = []
        # word indices that are hyphenation roots (join with next word)
        self.dehyphen_rootid = []
        # stem ids of paragraphs continuing onto / from adjacent pages
        self.paracont_stemid = []
        self.parastems_stemid = []

    # find tag if within pos to end inclusive
    def findinDoc(self, tagpath, pos, end) :
        # returns (index, value) of the first entry in flatdoc[pos:end]
        # whose name ends with tagpath, or (-1, None) if absent
        result = None
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1 :
            end = cnt
        else:
            end = min(cnt,end)
        foundat = -1
        for j in xrange(pos, end):
            item = docList[j]
            if item.find('=') >= 0:
                (name, argres) = item.split('=')
            else :
                name = item
                argres = ''
            if name.endswith(tagpath) :
                result = argres
                foundat = j
                break
        return foundat, result

    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        startpos = []
        pos = 0
        res = ""
        while res != None :
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res != None :
                startpos.append(foundpos)
            pos = foundpos + 1
        return startpos

    # get a description of the paragraph
    def getParaDescription(self, start, end):
        # returns (css class, first word index, last word index)
        # normal paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
        # class names are an issue given topaz starts them with numerals (not allowed)
        # use a mix of cases, (which cause some browsers problems), and actually
        # attach numbers after "reclustered*" to the end to deal with reflow issues
        # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered
        pclass = pclass.lower()
        pclass = 'cl_' + pclass
        p = pclass.find('reclustered')
        if p > 0 : pclass = pclass[0:p+11]
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            return pclass, int(sfirst), int(slast)
        # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well
        # so walk through this region keeping track of the first firstWord, and the last lastWord
        # on any items that have it
        (pos, sfirst) = self.findinDoc('firstWord',start, end)
        first = int(sfirst)
        last = -1
        for i in xrange(pos+1,end):
            (pos, slast) = self.findinDoc('lastWord',i,i+1)
            if slast != None:
                last = int(slast)
        return pclass, first, last

    def buildParagraph(self, cname, first, last, type, regtype) :
        # assemble words [first:last) into html; *type* is one of
        # 'full', 'begin', 'middle', 'end' and controls the <p> wrapper;
        # _lb_ markers become line/word breaks, _link_ markers anchors
        parares = ''
        sep =''
        br_lb = False
        if (regtype == 'fixed') or (regtype == 'chapterheading') :
            br_lb = True
        handle_links = False
        if len(self.link_id) > 0:
            handle_links = True
        if (type == 'full') or (type == 'begin') :
            parares += '<p class="' + cname + '">'
        if (type == 'end'):
            parares += ' '
        for j in xrange(first, last) :
            word = self.ocrtext[j]
            sep = ' '
            if handle_links:
                link = self.link_id[j]
                if (link > 0):
                    title = self.link_title[link-1]
                    if title == "": title='_link_'
                    ptarget = self.link_page[link-1] - 1
                    linkhtml = '<a href="#page%04d">' % ptarget
                    linkhtml += title + '</a>'
                    # splice the anchor around the already-emitted title text
                    pos = parares.rfind(title)
                    if pos >= 0:
                        parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                    else :
                        parares += linkhtml
                    if word == '_link_' : word = ''
                elif (link < 0) :
                    if word == '_link_' : word = ''
            if word == '_lb_':
                if (j-1) in self.dehyphen_rootid :
                    word = ''
                    sep = ''
                elif handle_links :
                    word = ''
                    sep = ''
                elif br_lb :
                    word = '<br />\n'
                    sep = ''
                else :
                    word = '\n'
                    sep = ''
            if j in self.dehyphen_rootid :
                # drop the trailing hyphen and join with the next word
                word = word[0:-1]
                sep = ''
            parares += word + sep
        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '</p>'
        return parares

    # walk the document tree collecting the information needed
    # to build an html page using the ocrText
    def process(self):
        htmlpage = ''
        # first collect information from the xml doc that describes this page
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres : self.ocrtext = argres.split('|')
        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
        if argres:
            argList = argres.split('|')
            self.dehyphen_rootid = [ int(strval) for strval in argList]
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
        if self.parastems_stemid == None : self.parastems_stemid = []
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
        if self.paracont_stemid == None : self.paracont_stemid = []
        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
        if argres:
            argList = argres.split('|')
            self.link_id = [ int(strval) for strval in argList]
        (pos, argres) = self.findinDoc('info.links.page',0,-1)
        if argres :
            argList = argres.split('|')
            self.link_page = [ int(strval) for strval in argList]
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')
        (pos, pagetype) = self.findinDoc('page.type',0,-1)
        # generate a list of each region starting point
        # each region has one paragraph, or one image, or one chapterheading
        regionList= self.posinDoc('region')
        regcnt = len(regionList)
        regionList.append(-1)
        anchorSet = False
        breakSet = False
        # process each region tag and convert what you can to html
        for j in xrange(regcnt):
            start = regionList[j]
            end = regionList[j+1]
            (pos, regtype) = self.findinDoc('region.type',start,end)
            if regtype == 'graphic' :
                if not anchorSet:
                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
            elif regtype == 'chapterheading' :
                (pclass, first, last) = self.getParaDescription(start,end)
                if not breakSet:
                    htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
                    breakSet = True
                if not anchorSet:
                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                    anchorSet = True
                # map chapter-level classes to heading tags
                tag = 'h1'
                if pclass[3:7] == 'ch1-' : tag = 'h1'
                if pclass[3:7] == 'ch2-' : tag = 'h2'
                if pclass[3:7] == 'ch3-' : tag = 'h3'
                htmlpage += '<' + tag + ' class="' + pclass + '">'
                htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype)
                htmlpage += '</' + tag + '>'
            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
                ptype = 'full'
                # check to see if this is a continuation from the previous page
                if (len(self.parastems_stemid) > 0):
                    ptype = 'end'
                    self.parastems_stemid=[]
                else:
                    if not anchorSet:
                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                        anchorSet = True
                (pclass, first, last) = self.getParaDescription(start,end)
                if ptype == 'full' :
                    tag = 'p'
                    if pclass[3:6] == 'h1-' : tag = 'h4'
                    if pclass[3:6] == 'h2-' : tag = 'h5'
                    if pclass[3:6] == 'h3-' : tag = 'h6'
                    htmlpage += '<' + tag + ' class="' + pclass + '">'
                    htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype)
                    htmlpage += '</' + tag + '>'
                else :
                    htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
            elif (regtype == 'tocentry') :
                ptype = 'full'
                # check to see if this is a continuation from the previous page
                if (len(self.parastems_stemid) > 0) and (j == 0):
                    # process the first paragraph as a continuation from the last page
                    ptype = 'end'
                    self.parastems_stemid = []
                else:
                    if not anchorSet:
                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                        anchorSet = True
                (pclass, first, last) = self.getParaDescription(start,end)
                htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
            else :
                print 'Unknown region type', regtype
                print 'Warning: skipping this region'
        if len(self.paracont_stemid) > 0 :
            # paragraph continues onto the next page - leave the <p> open
            if htmlpage[-4:] == '</p>':
                htmlpage = htmlpage[0:-4]
        return htmlpage
        # NOTE(review): unreachable - follows the return above and DocParser
        # defines no convert2HTML method; looks like leftover dead code
        return self.convert2HTML()
def convert2HTML(flatxml, fileid):
    """Module entry point: parse *flatxml* for page *fileid* and return
    the generated html fragment."""
    return DocParser(flatxml, fileid).process()

125
Topaz_Tools/lib/genhtml.py Normal file
View file

@ -0,0 +1,125 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import os, sys, getopt
# local routines
import convert2xml
import flatxml2html
import decode_meta
import stylexml2css
def usage():
    # print command-line help for genhtml.py
    print 'Usage: '
    print ' '
    print ' genhtml.py unencryptedBookDir'
    print ' '
def main(argv):
bookDir = ''
if len(argv) == 0:
argv = sys.argv
else :
argv = argv.split()
try:
opts, args = getopt.getopt(argv[1:], "h:")
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(2)
for o, a in opts:
if o =="-h":
usage()
sys.exit(0)
bookDir = args[0]
if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book"
sys.exit(-1)
dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file"
sys.exit(-1)
pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book"
sys.exit(-1)
imgDir = os.path.join(bookDir,'img')
if not os.path.exists(imgDir) :
print "Can not find image directory in unencrypted book"
sys.exit(-1)
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
sys.exit(-1)
metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1)
htmlFileName = "book.html"
htmlstr = '<html>\n'
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
print 'Processing ... '
htmlstr += '<head>\n'
print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat')
xname = os.path.join(bookDir, 'metadata.txt')
metastr = decode_meta.getMetaData(fname)
file(xname, 'wb').write(metastr)
meta_array = decode_meta.getMetaArray(fname)
htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(bookDir, 'style.css')
xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
cssstr = '<style>\n'
cssstr += stylexml2css.convert2CSS(xmlstr)
cssstr += '</style>\n'
file(xname, 'wb').write(cssstr)
htmlstr += cssstr
htmlstr += '</head>\n<body>\n'
for filename in filenames:
print ' ', filename
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += flatxml2html.convert2HTML(flat_xml, fname)
htmlstr += '</body>\n</html>\n'
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
print 'Processing Complete'
return 0
if __name__ == '__main__':
sys.exit(main(''))

295
Topaz_Tools/lib/gensvg.py Normal file
View file

@ -0,0 +1,295 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import os, sys, getopt
# local routines
import convert2xml
import flatxml2html
import decode_meta
class GParser(object):
    """Parses a flattened glyphs*.dat description and renders each glyph
    as an svg path string."""

    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
        # output resolution the glyph coordinates are scaled to
        self.dpi = 1440
        # per-glyph tables: height, width, usage count
        self.gh = self.getData('info.glyph.h')
        self.gw = self.getData('info.glyph.w')
        self.guse = self.getData('info.glyph.use')
        self.count = len(self.guse)
        # per-glyph start offsets into the vertex / contour-length tables,
        # and the dpi each glyph was digitized at
        self.gvtx = self.getData('info.glyph.vtx')
        self.glen = self.getData('info.glyph.len')
        self.gdpi = self.getData('info.glyph.dpi')
        # vertex coordinates and contour end indices
        self.vx = self.getData('info.vtx.x')
        self.vy = self.getData('info.vtx.y')
        self.vlen = self.getData('info.len.n')
        # append sentinels so slicing works for the last glyph
        self.glen.append(len(self.vlen))
        self.gvtx.append(len(self.vx))

    def getData(self, path):
        # return the '|'-separated values of the entry whose name equals
        # *path*, converted to ints (None if the entry is absent)
        result = None
        cnt = len(self.flatdoc)
        for j in xrange(cnt):
            item = self.flatdoc[j]
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if (name == path):
                result = argres
                break
        if (len(argres) > 0) :
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result

    def getPath(self, gly):
        # build the svg path ('M/L ... z' per contour) for glyph index
        # *gly*, scaling coordinates from the glyph's dpi to self.dpi
        path = ''
        if (gly < 0) or (gly >= self.count):
            return path
        tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
        ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
        p = 0
        for k in xrange(self.glen[gly], self.glen[gly+1]):
            # vlen[k] is the index of this contour's last vertex
            if (p == 0):
                zx = tx[0:self.vlen[k]+1]
                zy = ty[0:self.vlen[k]+1]
            else:
                zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
                zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
            p += 1
            for j in xrange(0, len(zx)):
                if (j == 0):
                    path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
                else:
                    path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
            path += 'z'
        return path
class PParser(object):
    """Parses a flattened page*.dat description to extract page geometry,
    glyph placements, and image regions for svg generation."""

    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
        # scratch copy of flatdoc consumed by getDataTemp/getImages
        self.temp = []
        # page height and width
        self.ph = self.getData('page.h')[0]
        self.pw = self.getData('page.w')[0]
        # per-glyph-occurrence position and glyph id
        self.gx = self.getData('info.glyph.x')
        self.gy = self.getData('info.glyph.y')
        self.gid = self.getData('info.glyph.glyphID')

    def getData(self, path):
        # return the '|'-separated values of the first entry whose name
        # ends with *path*, converted to ints (None if absent)
        result = None
        cnt = len(self.flatdoc)
        for j in xrange(cnt):
            item = self.flatdoc[j]
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if (name.endswith(path)):
                result = argres
                break
        if (len(argres) > 0) :
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result

    def getDataTemp(self, path):
        # like getData, but searches self.temp and *removes* the matched
        # entry, so repeated calls walk through successive regions
        result = None
        cnt = len(self.temp)
        for j in xrange(cnt):
            item = self.temp[j]
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if (name.endswith(path)):
                result = argres
                self.temp.pop(j)
                break
        if (len(argres) > 0) :
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result

    def getImages(self):
        # consume every region.img entry and emit an svg <image> per image
        result = []
        self.temp = self.flatdoc
        while (self.getDataTemp('region.img') != None):
            h = self.getDataTemp('region.img.h')[0]
            w = self.getDataTemp('region.img.w')[0]
            x = self.getDataTemp('region.img.x')[0]
            y = self.getDataTemp('region.img.y')[0]
            src = self.getDataTemp('region.img.src')[0]
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result

    def getGlyphs(self,glyfname):
        # pull the <path id="gl..."> definitions used on this page out of
        # the shared glyphs.svg file; relies on glyphs.svg listing ids in
        # ascending order, matching the sorted id list built here
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
            glyphs = []
            for j in set(self.gid):
                glyphs.append(j)
            glyphs.sort()
            gfile = open(glyfname, 'r')
            j = 0
            while True :
                inp = gfile.readline()
                if (inp == ''):
                    break
                id='id="gl%d"' % glyphs[j]
                if (inp.find(id) > 0):
                    result.append(inp)
                    j += 1
                    if (j == len(glyphs)):
                        break
            gfile.close()
        return result
def usage():
    # print command-line help for gensvg.py
    print 'Usage: '
    print ' '
    print ' gensvg.py unencryptedBookDir'
    print ' '
def main(argv):
bookDir = ''
if len(argv) == 0:
argv = sys.argv
else :
argv = argv.split()
try:
opts, args = getopt.getopt(argv[1:], "h:")
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(2)
for o, a in opts:
if o =="-h":
usage()
sys.exit(0)
bookDir = args[0]
if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book"
sys.exit(-1)
dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file"
sys.exit(-1)
pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book"
sys.exit(-1)
imgDir = os.path.join(bookDir,'img')
if not os.path.exists(imgDir) :
print "Can not find image directory in unencrypted book"
sys.exit(-1)
glyphsDir = os.path.join(bookDir,'glyphs')
if not os.path.exists(glyphsDir) :
print "Can not find glyphs directory in unencrypted book"
sys.exit(-1)
metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1)
svgDir = os.path.join(bookDir,'svg')
if not os.path.exists(svgDir) :
os.makedirs(svgDir)
print 'Processing Meta Data ... '
print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat')
metadata = decode_meta.getMetaArray(fname)
print 'Processing Glyphs ... '
filenames = os.listdir(glyphsDir)
filenames = sorted(filenames)
glyfname = os.path.join(svgDir,'glyphs.svg')
glyfile = open(glyfname, 'w')
glyfile.write('<?xml version="1.0" standalone="no"?>\n')
glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
glyfile.write('<defs>\n')
counter = 0
for filename in filenames:
print ' ', filename
fname = os.path.join(glyphsDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
gp = GParser(flat_xml)
for i in xrange(0, gp.count):
path = gp.getPath(i)
glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
counter += 1
glyfile.write('</defs>\n')
glyfile.write('</svg>\n')
glyfile.close()
print 'Processing Pages ... '
scaledpi = 720
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
counter = 0
for filename in filenames:
print ' ', filename
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
pp = PParser(flat_xml)
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
pfile.write('<?xml version="1.0" standalone="no"?>\n')
pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
if (pp.gid != None):
pfile.write('<defs>\n')
gdefs = pp.getGlyphs(glyfname)
for j in xrange(0,len(gdefs)):
pfile.write(gdefs[j])
pfile.write('</defs>\n')
for j in xrange(0,len(pp.gid)):
pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
img = pp.getImages()
if (img != None):
for j in xrange(0,len(img)):
pfile.write(img[j])
pfile.write('</svg>')
pfile.close()
counter += 1
print 'Processing Complete'
return 0
if __name__ == '__main__':
sys.exit(main(''))

121
Topaz_Tools/lib/genxml.py Normal file
View file

@ -0,0 +1,121 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import os, sys, getopt
# local routines
import convert2xml
import flatxml2html
import decode_meta
def usage():
    # print command-line help for genxml.py
    print 'Usage: '
    print ' '
    print ' genxml.py dict0000.dat unencryptedBookDir'
    print ' '
def main(argv):
bookDir = ''
if len(argv) == 0:
argv = sys.argv
else :
argv = argv.split()
try:
opts, args = getopt.getopt(argv[1:], "h:")
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(2)
for o, a in opts:
if o =="-h":
usage()
sys.exit(0)
bookDir = args[0]
if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book"
sys.exit(-1)
dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file"
sys.exit(-1)
pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book"
sys.exit(-1)
glyphsDir = os.path.join(bookDir,'glyphs')
if not os.path.exists(glyphsDir) :
print "Can not find glyphs directory in unencrypted book"
sys.exit(-1)
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
sys.exit(-1)
metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1)
xmlDir = os.path.join(bookDir,'xml')
if not os.path.exists(xmlDir):
os.makedirs(xmlDir)
print 'Processing ... '
print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat')
xname = os.path.join(xmlDir, 'metadata.txt')
metastr = decode_meta.getMetaData(fname)
file(xname, 'wb').write(metastr)
print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(xmlDir, 'stylesheet.xml')
xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
file(xname, 'wb').write(xmlstr)
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
for filename in filenames:
print ' ', filename
fname = os.path.join(pageDir,filename)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
file(xname, 'wb').write(xmlstr)
filenames = os.listdir(glyphsDir)
filenames = sorted(filenames)
for filename in filenames:
print ' ', filename
fname = os.path.join(glyphsDir,filename)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
file(xname, 'wb').write(xmlstr)
print 'Processing Complete'
return 0
if __name__ == '__main__':
sys.exit(main(''))

View file

@ -0,0 +1,75 @@
This is experimental and it will probably not work for you but...
ALSO: Please do not use any of this to steal. Theft is wrong.
This is meant to allow conversion of Topaz books for other book readers you own
Here are the steps:
1. Unzip the topazscripts.zip file to get the full set of python scripts.
The files you should have after unzipping are:
cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg graphic of each page
Please note, gensvg.py, genhtml.py, and genxml.py import and use
decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py
so please keep all of these python scripts together in the same place.
2. Remove the DRM from the Topaz book and build a directory
of its contents as files
All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else
would be possible
cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE
This should create a directory called "TARGETDIR" in your current directory.
It should have the following files in it:
metadata0000.dat - metadata info
other0000.dat - information used to create a style sheet
dict0000.dat - dictionary of words used to build page descriptions
page - directory filled with page*.dat files
glyphs - directory filled with glyphs*.dat files
3. Convert the files in "TARGETDIR" to their xml descriptions
which can be found in TARGETDIR/xml/ upon completion.
genxml.py TARGETDIR
4. Create book.html which can be found in "TARGETDIR" after
completion. This html conversion can not fully capture
all of the layouts actually used in the book and needs to
be edited to include special font handling such as bold
or italics that can not be determined from the ocrText
information or the style information. If you want to
see things exactly as they were, see step 5 below.
genhtml.py TARGETDIR
5. Create an svg description of each page which can
be found in TARGETDIR/svg/ upon completion.
All thanks go to CLARKNOVA for this program. This program is
needed to actually see the true image of each page so that hand
editing of the html created by step 4 can be done.
Or use the resulting svg files to read each page of the book
exactly as it has been laid out originally.
gensvg.py TARGETDIR

View file

@ -0,0 +1,221 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
    """Parse a "flattened" Topaz stylesheet xml document and convert it,
    as best it can, into a css stylesheet (one rule per line)."""

    def __init__(self, flatxml):
        # each line of the flattened document is one "tag.path=value" entry
        self.flatdoc = flatxml.split('\n')

    # style tags (from "style._tag" or "style.type") we know how to map
    # to a css selector
    stags = {
        'paragraph' : 'p',
        'graphic' : '.graphic'
    }

    # value based attributes: (css property prefix, divisor used to scale the
    # raw integer value into em units — presumably Topaz fixed-point units,
    # TODO confirm against the Topaz format docs)
    attr_val_map = {
        'hang'           : ('text-indent: ', 135),
        'indent'         : ('text-indent: ', 135),
        'line-space'     : ('line-height: ', 190),
        'margin-bottom'  : ('margin-bottom: ', 135),
        'margin-left'    : ('margin-left: ', 135),
        'margin-right'   : ('margin-right: ', 135),
        'margin-top'     : ('margin-top: ', 135),
        'space-after'    : ('padding-bottom: ', 135),
    }

    # string based attributes: one fixed css declaration per "attr-value" pair
    attr_str_map = {
        'align-center'   : 'text-align: center; margin-left: auto; margin-right: auto;',
        'align-left'     : 'text-align: left;',
        'align-right'    : 'text-align: right;',
        'align-justify'  : 'text-align: justify;',
        'display-inline' : 'display: inline;',
        'pos-left'       : 'text-align: left;',
        'pos-right'      : 'text-align: right;',
        'pos-center'     : 'text-align: center; margin-left: auto; margin-right: auto;',
    }

    # find tag if within pos to end
    def findinDoc(self, tagpath, pos, end):
        """Return (position, value) of the first entry in [pos, end) whose
        tag path ends with tagpath; (-1, None) when there is no match.
        end == -1 means "to the end of the document"."""
        result = None
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1:
            end = cnt
        else:
            end = min(cnt, end)
        foundat = -1
        for j in range(pos, end):
            item = docList[j]
            if item.find('=') >= 0:
                # BUGFIX: split only on the first '=' so a value that itself
                # contains '=' no longer raises a ValueError on unpacking
                (name, argres) = item.split('=', 1)
            else:
                name = item
                argres = ''
            if name.endswith(tagpath):
                result = argres
                foundat = j
                break
        return foundat, result

    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        """Return the positions of every entry matching tagpath, in order."""
        startpos = []
        pos = 0
        res = ""
        while res is not None:
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res is not None:
                startpos.append(foundpos)
                pos = foundpos + 1
        return startpos

    def process(self):
        """Convert every <style> entry in the stylesheet into a css rule.

        Returns the generated css as a single string, one rule per line.
        """
        csspage = ''

        # generate a list of each <style> starting point in the stylesheet
        styleList = self.posinDoc('book.stylesheet.style')
        stylecnt = len(styleList)
        styleList.append(-1)

        # process each style converting what we can
        for j in range(stylecnt):
            start = styleList[j]
            end = styleList[j + 1]

            (pos, tag) = self.findinDoc('style._tag', start, end)
            if tag is None:
                (pos, tag) = self.findinDoc('style.type', start, end)

            # is this something we know how to convert to css?
            if tag in self.stags:

                # get the style class
                (pos, sclass) = self.findinDoc('style.class', start, end)
                if sclass is not None:
                    sclass = '.cl_' + sclass.lower()
                else:
                    sclass = ''

                # check for any "after class" specifiers
                (pos, aftclass) = self.findinDoc('style._after_class', start, end)
                if aftclass is not None:
                    aftclass = '.cl_' + aftclass.lower()
                else:
                    aftclass = ''

                cssargs = {}
                # BUGFIX: keep must be initialised before the rule loop,
                # otherwise a style with zero rules raised UnboundLocalError
                # at "if keep:" below
                keep = True

                while True:
                    (pos, attr) = self.findinDoc('style.rule.attr', start, end)
                    (pos, val) = self.findinDoc('style.rule.value', start, end)

                    # stop on a missing attribute, or on a dangling attribute
                    # that has no matching value entry
                    if (attr is None) or (val is None):
                        break

                    if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
                        # handle text based attributes
                        attr = attr + '-' + val
                        if attr in self.attr_str_map:
                            cssargs[attr] = (self.attr_str_map[attr], '')
                    else:
                        # handle value based attributes
                        if attr in self.attr_val_map:
                            (name, scale) = self.attr_val_map[attr]
                            # a hanging indent of 0 carries no information
                            if not ((attr == 'hang') and (int(val) == 0)):
                                # BUGFIX: use float division — integer division
                                # truncated every fractional em value even
                                # though the output is formatted with %.1fem
                                ems = int(val) / float(scale)
                                cssargs[attr] = (self.attr_val_map[attr][0], ems)
                    start = pos + 1

                # disable all of the after class tags until we figure out
                # how to handle them
                if aftclass != "":
                    keep = False

                # remove all numerals after the "reclustered" in the class name
                p = sclass.find('reclustered')
                if p >= 0:
                    sclass = sclass[0:p + 11]

                if keep:
                    # make sure line-space does not go below 1em
                    if 'line-space' in cssargs:
                        val = cssargs['line-space'][1]
                        if val < 1.0:
                            val = 1.0
                        # delete and re-insert to keep the original (end of
                        # dict) output position for the declaration
                        del cssargs['line-space']
                        cssargs['line-space'] = (self.attr_val_map['line-space'][0], val)

                    # handle modifications for css style hanging indents:
                    # negative text-indent balanced by extra left margin
                    if 'hang' in cssargs:
                        hval = cssargs['hang'][1]
                        del cssargs['hang']
                        cssargs['hang'] = (self.attr_val_map['hang'][0], -hval)
                        mval = 0
                        mseg = 'margin-left: '
                        if 'margin-left' in cssargs:
                            mseg = cssargs['margin-left'][0]
                            mval = cssargs['margin-left'][1]
                        mval = hval + mval
                        cssargs['margin-left'] = (mseg, mval)
                        if 'indent' in cssargs:
                            del cssargs['indent']

                    # build the css rule body
                    cssline = sclass + ' { '
                    for key in iter(cssargs):
                        mseg = cssargs[key][0]
                        mval = cssargs[key][1]
                        if mval == '':
                            cssline += mseg + ' '
                        else:
                            aseg = mseg + '%.1fem;' % mval
                            cssline += aseg + ' '
                    cssline += '}'

                    # handle special case of paragraph class used inside chapter
                    # heading and non-chapter headings: also emit h1..h6 rules
                    if sclass != '':
                        ctype = sclass[4:7]
                        if ctype == 'ch1':
                            csspage += 'h1' + cssline + '\n'
                        if ctype == 'ch2':
                            csspage += 'h2' + cssline + '\n'
                        if ctype == 'ch3':
                            csspage += 'h3' + cssline + '\n'
                        if ctype == 'h1-':
                            csspage += 'h4' + cssline + '\n'
                        if ctype == 'h2-':
                            csspage += 'h5' + cssline + '\n'
                        if ctype == 'h3_':
                            csspage += 'h6' + cssline + '\n'

                    csspage += self.stags[tag] + cssline + '\n'

        return csspage
def convert2CSS(flatxml):
    """Convert a flattened Topaz stylesheet document into a css string."""
    # hand the whole document to a parser and let it build the stylesheet
    parser = DocParser(flatxml)
    return parser.process()