topazscripts 1.6

This commit is contained in:
some_updates 2010-01-21 12:14:31 +00:00 committed by Apprentice Alf
parent a1fec0b54d
commit 58e9c973ab
4 changed files with 598 additions and 90 deletions

View file

@ -1,3 +1,8 @@
Changes in version 1.6
- support for books whose paragraphs have no styles
- support for running cmbtc_dump on Linux and Mac OSX, provided you know the PID of your iPod or standalone Kindle
(contributed by DiapDealer)
Changes in version 1.5
- completely reworked generation of styles to use actual page heights and widths
- added new script getpagedim.py to support the above

View file

@ -0,0 +1,504 @@
#! /usr/bin/python
from __future__ import with_statement
import csv
import sys
import os
import getopt
import zlib
from struct import pack
from struct import unpack
# NOTE(review): MAX_PATH is not referenced by any function visible in this script.
MAX_PATH = 255
# Put the first 8 characters of your Kindle PID here
# or supply it with the -p option in the command line
####################################################
kindlePID = "12345678"
####################################################
# Names of the module-level state shared by the functions below.
# A `global` statement at module scope does nothing by itself: each name only
# comes into existence when a function assigns it (bookFile, bookKey and
# command in main(); bookHeaderRecords and bookPayloadOffset in
# parseTopazHeader(); bookMetadata in parseMetadata()).
global bookFile
global bookPayloadOffset
global bookHeaderRecords
global bookMetadata
global bookKey
global command
#
# Exceptions for all the problems that might happen during the script
#
class CMBDTCError(Exception):
    """Recoverable failure, e.g. a dkey record that does not match the PID being tried."""
class CMBDTCFatal(Exception):
    """Failure that aborts processing: unreadable file, malformed book, bad command line."""
#
# Open the book file at path
#
def openBook(path):
    """Open the book file at *path* for binary reading.

    Returns the open file object.
    Raises CMBDTCFatal when the operating system refuses to open the file.
    """
    try:
        return open(path, 'rb')
    except (IOError, OSError):
        # Only translate real I/O failures; a bare `except:` would also have
        # swallowed KeyboardInterrupt and SystemExit.
        raise CMBDTCFatal("Could not open book file: " + path)
#
# Get a 7 bit encoded number from the book file
#
def bookReadEncodedNumber():
    """Read one 7-bit encoded integer from the current position of bookFile.

    A leading 0xFF byte marks the value as negative. Every byte with the high
    bit set contributes its low 7 bits and announces another byte; the first
    byte below 0x80 ends the number.
    """
    first = ord(bookFile.read(1))
    negative = (first == 0xFF)
    if negative:
        # The sign marker is not part of the magnitude; fetch the real first byte.
        first = ord(bookFile.read(1))

    value = first
    if first >= 0x80:
        value = first & 0x7F
        current = first
        while current >= 0x80:
            current = ord(bookFile.read(1))
            value = (value << 7) + (current & 0x7F)

    if negative:
        return -value
    return value
#
# Encode a number in 7 bit format
#
def encodeNumber(number):
    """Return *number* as a 7-bit encoded string, most significant group first.

    Every group except the last carries the 0x80 continuation bit. Negative
    numbers are prefixed with a 0xFF marker byte.
    """
    print("Using encodeNumber routine")
    negative = number < 0
    if negative:
        # NOTE(review): the magnitude written is -number + 1, whereas
        # bookReadEncodedNumber() simply negates what it reads, so negative
        # values do not round-trip. Confirm the intended format before changing.
        number = -number + 1

    # Groups are collected least significant first and reversed at the end.
    groups = [chr(number & 0x7F)]
    number = number >> 7
    while number != 0:
        groups.append(chr((number & 0x7F) + 0x80))
        number = number >> 7
    if negative:
        groups.append(chr(0xFF))
    groups.reverse()
    return "".join(groups)
#
# Get a length prefixed string from the file
#
def bookReadString():
    """Read a length-prefixed string from the current position of bookFile.

    The length is a 7-bit encoded number. unpack() rejects a short read, so a
    truncated file raises struct.error instead of yielding a partial string.
    """
    size = bookReadEncodedNumber()
    raw = bookFile.read(size)
    (text,) = unpack("%ds" % size, raw)
    return text
#
# Returns a length prefixed string
#
def lengthPrefixString(data):
    """Return *data* preceded by its length as a 7-bit encoded number."""
    prefix = encodeNumber(len(data))
    return prefix + data
#
# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
#
def bookReadHeaderRecordData():
    """Read the entries of one header record from the current file position.

    Returns a list of [offset, decompressedLength, compressedLength] triples;
    the number of triples is itself read first as an encoded number.
    """
    count = bookReadEncodedNumber()
    return [
        [bookReadEncodedNumber(), bookReadEncodedNumber(), bookReadEncodedNumber()]
        for _ in range(count)
    ]
#
# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
#
def parseTopazHeaderRecord():
    """Parse one header record at the current file position.

    Returns [tag, entries], where entries is the list produced by
    bookReadHeaderRecordData(). Raises CMBDTCFatal when the record does not
    start with the 0x63 marker byte.
    """
    marker = ord(bookFile.read(1))
    if marker != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    name = bookReadString()
    entries = bookReadHeaderRecordData()
    return [name, entries]
#
# Parse the header of a Topaz file, get all the header records and the offset for the payload
#
def parseTopazHeader():
    """Parse the Topaz file header from the current position of bookFile.

    Fills the global bookHeaderRecords (tag -> entries) and records in the
    global bookPayloadOffset the file position just past the header. Every
    header record is echoed to stdout as it is read.
    Raises CMBDTCFatal when the 'TPZ0' magic or the 0x64 end marker is missing.
    """
    global bookHeaderRecords
    global bookPayloadOffset

    (signature,) = unpack("4s", bookFile.read(4))
    if signature != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")

    recordCount = bookReadEncodedNumber()
    bookHeaderRecords = {}
    for _ in range(recordCount):
        tag, entries = parseTopazHeaderRecord()
        print("%s %s" % (tag, entries))
        bookHeaderRecords[tag] = entries

    terminator = ord(bookFile.read(1))
    if terminator != 0x64:
        raise CMBDTCFatal("Parse Error : Invalid Header")
    # The payload starts immediately after the end-of-header marker.
    bookPayloadOffset = bookFile.tell()
#
# Get a record in the book payload, given its name and index.
# If necessary the record is decrypted and decompressed.
#
def getBookPayloadRecord(name, index):
    """Return payload record *name*[*index*], decrypted and decompressed as needed.

    name  -- record tag as it appears in the header (e.g. 'dkey', 'img').
    index -- position of the record within that tag's header entries.

    The stored record index is negative, -(index + 1), when the record is
    encrypted; such records are decrypted with the global bookKey. A header
    entry with a compressed length greater than zero is inflated with zlib
    after decryption.

    Raises CMBDTCFatal when the header has no such record or when the tag or
    index stored in the payload disagrees with the request.
    """
    try:
        entry = bookHeaderRecords[name][index]
        recordOffset = entry[0]
    except (KeyError, IndexError, TypeError):
        # Only a failed lookup means "no such record"; anything else
        # (KeyboardInterrupt, programming errors) must not be masked.
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")

    bookFile.seek(bookPayloadOffset + recordOffset)
    tag = bookReadString()
    if tag != name:
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")

    recordIndex = bookReadEncodedNumber()
    encrypted = recordIndex < 0
    if encrypted:
        recordIndex = -recordIndex - 1
    if recordIndex != index:
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")

    compressedLength = entry[2]
    compressed = compressedLength > 0
    if compressed:
        record = bookFile.read(compressedLength)
    else:
        record = bookFile.read(entry[1])

    # Order matters: the stored bytes are the encrypted form of the
    # (possibly compressed) data, so decrypt first and inflate second.
    if encrypted:
        record = topazCryptoDecrypt(record, topazCryptoInit(bookKey))
    if compressed:
        record = zlib.decompress(record)
    return record
#
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
#
def extractBookPayloadRecord(name, index, filename):
    """Fetch record *name*[*index*] and save it to *filename* or print it.

    name     -- record tag, as for getBookPayloadRecord().
    index    -- record index within that tag.
    filename -- destination path; an empty string prints the record instead.

    The record arrives already decrypted and decompressed from
    getBookPayloadRecord(). If it cannot be fetched, "Could not find record"
    is printed and the function returns without writing anything.
    Raises CMBDTCFatal when the destination file cannot be written.
    """
    try:
        record = getBookPayloadRecord(name, index)
    except Exception:
        # Deliberately best-effort. The early return is the fix: execution
        # used to fall through and fail on the unbound `record`.
        print("Could not find record")
        return

    if filename != "":
        try:
            with open(filename, "wb") as output:
                output.write(record)
        except (IOError, OSError):
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
#
# return next record [key,value] from the book metadata from the current book position
#
def readMetadataRecord():
    """Read the next metadata entry at the current file position as [key, value]."""
    key = bookReadString()
    value = bookReadString()
    return [key, value]
#
# Parse the metadata record from the book payload and return a list of [key,values]
#
def parseMetadata():
    """Load the book's metadata record into the global bookMetadata dict.

    Seeks to the first 'metadata' header entry, checks the tag stored there,
    then reads a flags byte, a one-byte entry count and that many key/value
    string pairs.
    Raises CMBDTCFatal when the stored tag is not "metadata".
    """
    global bookMetadata
    bookMetadata = {}

    bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
    if bookReadString() != "metadata":
        raise CMBDTCFatal("Parse Error : Record Names Don't Match")

    # The flags byte must be consumed to stay aligned; its value is not used.
    unusedFlags = ord(bookFile.read(1))
    entryCount = ord(bookFile.read(1))
    for _ in range(entryCount):
        key, value = readMetadataRecord()
        bookMetadata[key] = value
#
# Context initialisation for the Topaz Crypto
#
def topazCryptoInit(key):
    """Build the two-word cipher context for *key*.

    Returns [ctx1, ctx2]: ctx1 is the state after mixing in every key
    character and ctx2 is the state just before the last character was mixed
    in. An empty key leaves ctx2 unassigned and fails with UnboundLocalError.
    """
    state = 0x0CAFFE19E
    for ch in key:
        code = ord(ch)
        previous = state
        mixed = ((state >> 2) * (state >> 7)) & 0xFFFFFFFF
        salt = (code * code * 0x0F902007) & 0xFFFFFFFF
        state = mixed ^ salt
    return [state, previous]
#
# decrypt data with the context prepared by topazCryptoInit()
#
def topazCryptoDecrypt(data, ctx):
    """Decrypt *data* with a context produced by topazCryptoInit().

    data -- encrypted string, processed one character at a time.
    ctx  -- [ctx1, ctx2] pair; it is read but not modified.

    Returns the plaintext string. Each decrypted byte is fed back into the
    running state, so the characters must be processed in order.
    """
    state, previous = ctx[0], ctx[1]
    decoded = []
    for ch in data:
        cipherByte = ord(ch)
        plainByte = (cipherByte ^ ((state >> 3) & 0xFF) ^ ((previous << 3) & 0xFF)) & 0xFF
        previous = state
        mixed = ((state >> 2) * (state >> 7)) & 0xFFFFFFFF
        salt = (plainByte * plainByte * 0x0F902007) & 0xFFFFFFFF
        state = mixed ^ salt
        decoded.append(chr(plainByte))
    return "".join(decoded)
#
# Decrypt a payload record with the PID
#
def decryptRecord(data, PID):
    """Decrypt *data* using *PID* as the cipher key and return the plaintext."""
    return topazCryptoDecrypt(data, topazCryptoInit(PID))
#
# Try to decrypt a dkey record (contains the book PID)
#
def decryptDkeyRecord(data, PID):
    """Decrypt one dkey record with *PID* and return its 8-byte payload field.

    The decrypted record is unpacked as "3sB8sB8s3s": the literal "PID", a
    length byte (8), the PID, a second length byte (8), the payload, and the
    literal "pid". main() uses the returned payload as the book key.

    Raises CMBDTCError when the magic strings, the length bytes or the
    embedded PID do not match, which is what happens with the wrong PID.
    NOTE(review): a record whose length does not fit the format makes unpack()
    raise struct.error, which is not a CMBDTCError - confirm whether the
    caller should tolerate that.
    """
    plain = decryptRecord(data, PID)
    head, pidLength, storedPID, keyLength, payload, tail = unpack("3sB8sB8s3s", plain)
    if head != "PID" or tail != "pid":
        raise CMBDTCError("Didn't find PID magic numbers in record")
    if pidLength != 8 or keyLength != 8:
        raise CMBDTCError("Record didn't contain correct length fields")
    if storedPID != PID:
        raise CMBDTCError("Record didn't contain PID")
    return payload
#
# Decrypt all the book's dkey records (contain the book PID)
#
def decryptDkeyRecords(data, PID):
    """Try *PID* against every dkey record in *data*; return the payloads that decrypt.

    data -- the raw 'dkey' payload: a count byte followed by that many records,
            each a length byte and then that many bytes of encrypted data.
    PID  -- candidate PID, used as the cipher key.

    Records that decryptDkeyRecord() rejects with CMBDTCError are skipped.
    """
    count = ord(data[0])
    found = []
    cursor = 1  # index of the current record's length byte
    for _ in range(count):
        size = ord(data[cursor])
        chunk = data[cursor + 1:cursor + 1 + size]
        try:
            found.append(decryptDkeyRecord(chunk, PID))
        except CMBDTCError:
            # Wrong PID for this record; try the next one.
            pass
        cursor += 1 + size
    return found
#
# Create decrypted book payload
#
def createDecryptedPayload(payload):
    """Write every payload record except 'dkey' as a file below *payload*.

    payload -- output directory; it and its 'img', 'page' and 'glyphs'
               sub-directories must already exist.

    Files are named <tag><4-digit index><ext>, where ext is '.jpg' for 'img'
    records and '.dat' for everything else. 'img', 'page' and 'glyphs' records
    go into the sub-directory of the same name, all others into *payload*.
    """
    ownDirectory = ('img', 'page', 'glyphs')
    for name in bookHeaderRecords:
        if name == "dkey":
            continue
        ext = '.dat'
        if name == 'img':
            ext = '.jpg'
        destdir = payload
        if name in ownDirectory:
            destdir = os.path.join(payload, name)
        for index, _ in enumerate(bookHeaderRecords[name]):
            outputFile = os.path.join(destdir, "%s%04d%s" % (name, index, ext))
            # Fetch first so that a failed read does not leave an empty file
            # behind, then write through a handle that is closed explicitly.
            record = getBookPayloadRecord(name, index)
            with open(outputFile, 'wb') as output:
                output.write(record)
# Create decrypted book
#
def createDecryptedBook(outdir):
    """Create *outdir* with its img/page/glyphs sub-directories and dump the book into it.

    Directories that already exist are left alone.
    """
    wanted = [outdir]
    for sub in ('img', 'page', 'glyphs'):
        wanted.append(os.path.join(outdir, sub))
    for directory in wanted:
        if not os.path.exists(directory):
            os.makedirs(directory)
    createDecryptedPayload(outdir)
#
# Set the command to be executed by the program according to the command line parameters
#
def setCommand(name):
    """Record *name* in the global `command`.

    Raises CMBDTCFatal if a command has already been chosen, so only one
    action option may be given per run.
    """
    global command
    if command == "":
        command = name
        return
    raise CMBDTCFatal("Invalid command line parameters")
#
# Program usage
#
def usage():
    """Print the command-line help text to stdout."""
    helpLines = (
        "\nUsage:",
        "\ncmbtc_dump_linux.py [options] bookFileName\n",
        "-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)",
        "-d Dumps the unencrypted book as files to outdir",
        "-o Output directory to save book files to",
        "-v Verbose (can be used several times)",
    )
    for line in helpLines:
        print(line)
#
# Main
#
def main(argv=sys.argv):
    """Command-line entry point: recover the book key and dump the book.

    argv -- full argument vector with the program name first.

    Options: -v (verbose, repeatable), -o DIR (output directory),
    -p PID (extra PID to try, repeatable), -d (dump the decrypted book).

    Returns 0 on success, 1 when none of the PIDs yields a book key and
    2 when the command line does not name exactly one book file. Option
    errors and an empty command line print the usage text and exit with
    status 2.
    Raises CMBDTCFatal for a missing action, an unreadable book file or a
    malformed book.
    """
    global bookMetadata
    global bookKey
    global bookFile
    global command

    verbose = 0
    outdir = ""
    command = ""
    # Preload the Kindle PID configured at the top of the program.
    PIDs = [kindlePID]

    try:
        # Parse the vector that was passed in; reading sys.argv here made the
        # argv parameter meaningless.
        opts, args = getopt.getopt(argv[1:], "vo:p:d")
    except getopt.GetoptError:
        # sys.exc_info() keeps this line valid on every Python version.
        err = sys.exc_info()[1]
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    if len(opts) == 0 and len(args) == 0:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o == "-v":
            verbose += 1
        elif o == "-o":
            if a is None:
                raise CMBDTCFatal("Invalid parameter for -o")
            outdir = a
        elif o == "-p":
            PIDs.append(a)
        elif o == "-d":
            setCommand("doit")

    if command == "":
        raise CMBDTCFatal("No action supplied on command line")

    if len(args) != 1:
        # Without exactly one book file there is nothing to do; say so
        # instead of silently reporting success.
        usage()
        return 2

    #
    # Open book and parse metadata
    #
    bookFile = openBook(args[0])
    try:
        parseTopazHeader()
        parseMetadata()

        #
        # Decrypt book key
        #
        dkey = getBookPayloadRecord('dkey', 0)
        bookKeys = []
        for PID in PIDs:
            bookKeys += decryptDkeyRecords(dkey, PID)

        if len(bookKeys) == 0:
            # Stop here: carrying on would leave bookKey unset and fail with
            # a NameError on the first encrypted record.
            print("Book key could not be found. Maybe this book is not registered with this device.")
            return 1

        bookKey = bookKeys[0]
        if verbose > 0:
            print("Book key: " + bookKey.encode('hex'))

        if command == "doit":
            if outdir != "":
                createDecryptedBook(outdir)
                if verbose > 0:
                    print("Decrypted book saved. Don't pirate!")
            elif verbose > 0:
                print("Output directory name was not supplied.")
    finally:
        bookFile.close()
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status (sys.exit works by raising SystemExit).
    raise SystemExit(main())

View file

@ -13,7 +13,8 @@ from struct import unpack
class DocParser(object):
def __init__(self, flatxml, classlst, fileid):
self.id = os.path.basename(fileid).replace('.dat','')
self.flatdoc = flatxml.split('\n')
self.docList = flatxml.split('\n')
self.docSize = len(self.docList)
self.classList = {}
tmpList = classlst.split('\n')
for pclass in tmpList:
@ -29,12 +30,10 @@ class DocParser(object):
self.paracont_stemid = []
self.parastems_stemid = []
# find tag if within pos to end inclusive
# return tag at line pos in document
def lineinDoc(self, pos) :
docList = self.flatdoc
cnt = len(docList)
if (pos >= 0) and (pos < cnt) :
item = docList[pos]
if (pos >= 0) and (pos < self.docSize) :
item = self.docList[pos]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
else :
@ -43,20 +42,18 @@ class DocParser(object):
return name, argres
# find tag if within pos to end inclusive
# find tag in doc if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) :
result = None
docList = self.flatdoc
cnt = len(docList)
if end == -1 :
end = cnt
end = self.docSize
else:
end = min(cnt,end)
end = min(self.docSize, end)
foundat = -1
for j in xrange(pos, end):
item = docList[j]
item = self.docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=')
(name, argres) = item.split('=',1)
else :
name = item
argres = ''
@ -85,7 +82,7 @@ class DocParser(object):
result = []
# normal paragraph
# paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
# class names are an issue given topaz may start them with numerals (not allowed),
@ -94,19 +91,20 @@ class DocParser(object):
# from a base class (but then not actually provide all of these _reclustereed
# classes in the stylesheet!
# so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
# that exists in the stylesheet first, and then adding this specific class
# after
classres = ''
pclass = pclass.lower()
pclass = 'cl-' + pclass
p = pclass.find('_')
if p > 0 :
baseclass = pclass[0:p]
if baseclass in self.classList:
classres += baseclass + ' '
classres += pclass
pclass = classres
if pclass != None :
classres = ''
pclass = pclass.lower()
pclass = 'cl-' + pclass
p = pclass.find('_')
if p > 0 :
baseclass = pclass[0:p]
if baseclass in self.classList:
classres += baseclass + ' '
classres += pclass
pclass = classres
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
@ -128,9 +126,7 @@ class DocParser(object):
# if end is -1 then we must search to end of document
if end == -1 :
docList = self.flatdoc
cnt = len(docList)
end = cnt
end = self.docSize
while (line < end) :
@ -171,20 +167,20 @@ class DocParser(object):
return pclass, result
def buildParagraph(self, cname, pdesc, type, regtype) :
def buildParagraph(self, pclass, pdesc, type, regtype) :
parares = ''
sep =''
br_lb = False
if (regtype == 'fixed') or (regtype == 'chapterheading'):
br_lb = True
classres = ''
if pclass :
classres = ' class="' + pclass + '"'
handle_links = False
if len(self.link_id) > 0:
handle_links = True
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading')
handle_links = len(self.link_id) > 0
if (type == 'full') or (type == 'begin') :
parares += '<p class="' + cname + '">'
parares += '<p' + classres + '>'
if (type == 'end'):
parares += ' '
@ -218,10 +214,7 @@ class DocParser(object):
if word == '_link_' : word = ''
if word == '_lb_':
if (num-1) in self.dehyphen_rootid :
word = ''
sep = ''
elif handle_links :
if ((num-1) in self.dehyphen_rootid ) or handle_links:
word = ''
sep = ''
elif br_lb :
@ -261,43 +254,51 @@ class DocParser(object):
htmlpage = ''
# first collect information from the xml doc that describes this page
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
if argres : self.ocrtext = argres.split('|')
# get information to dehyphenate the text
(pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
if argres:
argList = argres.split('|')
self.dehyphen_rootid = [ int(strval) for strval in argList]
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
if self.parastems_stemid == None : self.parastems_stemid = []
first_para_continued = (self.parastems_stemid != None)
# determine if last paragraph is continued onto the next page
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
if self.paracont_stemid == None : self.paracont_stemid = []
last_para_continued = (self.paracont_stemid != None)
# collect link ids
(pos, argres) = self.findinDoc('info.word.link_id',0,-1)
if argres:
argList = argres.split('|')
self.link_id = [ int(strval) for strval in argList]
# collect link destination page numbers
(pos, argres) = self.findinDoc('info.links.page',0,-1)
if argres :
argList = argres.split('|')
self.link_page = [ int(strval) for strval in argList]
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
if argres :
self.link_title = argres.split('|')
else:
self.link_title.append('')
# get page type
(pos, pagetype) = self.findinDoc('page.type',0,-1)
# generate a list of each region starting point
# each region has one paragraph, or one image, or one chapterheading
regionList= self.posinDoc('region')
regcnt = len(regionList)
regionList.append(-1)
@ -308,47 +309,48 @@ class DocParser(object):
# process each region tag and convert what you can to html
for j in xrange(regcnt):
start = regionList[j]
end = regionList[j+1]
(pos, regtype) = self.findinDoc('region.type',start,end)
# set anchor for link target on this page
if not anchorSet and not first_para_continued:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
if regtype == 'graphic' :
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
elif regtype == 'chapterheading' :
(pclass, pdesc) = self.getParaDescription(start,end)
if not breakSet:
htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
breakSet = True
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
tag = 'h1'
if pclass[3:7] == 'ch1-' : tag = 'h1'
if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3'
htmlpage += '<' + tag + ' class="' + pclass + '">'
if pclass and (len(pclass) >= 7):
if pclass[3:7] == 'ch1-' : tag = 'h1'
if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3'
htmlpage += '<' + tag + ' class="' + pclass + '">'
else:
htmlpage += '<' + tag + '>'
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
ptype = 'full'
# check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0):
if first_para_continued :
ptype = 'end'
self.parastems_stemid=[]
else:
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
if ptype == 'full' :
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
@ -359,28 +361,22 @@ class DocParser(object):
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') :
ptype = 'full'
# check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0) and (j == 0):
# process the first paragraph as a continuation from the last page
if first_para_continued :
ptype = 'end'
self.parastems_stemid = []
else:
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else :
print 'Warning: Unknown region type', regtype
(pos, temp) = self.findinDoc('paragraph',start,end)
@ -389,15 +385,11 @@ class DocParser(object):
regtype = 'fixed'
ptype = 'full'
# check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0):
if first_para_continued :
ptype = 'end'
self.parastems_stemid=[]
else:
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
if ptype == 'full' :
if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
@ -408,24 +400,20 @@ class DocParser(object):
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
else :
print 'Treating this like a "image" region'
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
print 'Treating this like a "graphic" region'
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
if len(self.paracont_stemid) > 0 :
if last_para_continued :
if htmlpage[-4:] == '</p>':
htmlpage = htmlpage[0:-4]
htmlpage = htmlpage[0:-4]
last_para_continued = False
return htmlpage
return self.convert2HTML()
def convert2HTML(flatxml, classlst, fileid):

View file

@ -3,7 +3,7 @@ Contributors:
clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
skindle - for figuring out the general case for the mode loops
some updates - for conversion to xml, basic html
DiapDealer - for extensive testing and feedback
DiapDealer - for extensive testing and feedback, and standalone linux/macosx version of cmbtc_dump
stewball - for extensive testing and feedback
and others for posting, feedback and testing
@ -29,6 +29,17 @@ genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg graphic of each page
In addition there is now a new file:
cmbtc_dump_mac_linux.py
If you know the pid of your ipod and/or your standalone Kindle and your book
was meant for that device, you can use this program to dump the proper sections
on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed).
Thank DiapDealer for creating it!
Please note, gensvg.py, genhtml.py, and genxml.py import and use
decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py
so please keep all of these python scripts together in the same place.