More fixes for Amazon books, fixing identity checks, started on Topaz.

This commit is contained in:
Apprentice Harper 2020-10-16 13:58:59 +01:00
parent dc27c36761
commit 939cdbb0c9
8 changed files with 530 additions and 512 deletions

View file

@ -56,7 +56,7 @@ def readEncodedNumber(file):
c = file.read(1) c = file.read(1)
if (len(c) == 0): if (len(c) == 0):
return None return None
data = ord(c) data = c[0]
datax = (datax <<7) + (data & 0x7F) datax = (datax <<7) + (data & 0x7F)
data = datax data = datax
@ -188,232 +188,232 @@ class PageParser(object):
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = { token_tags = {
'x' : (1, 'scalar_number', 0, 0), b'x' : (1, 'scalar_number', 0, 0),
'y' : (1, 'scalar_number', 0, 0), b'y' : (1, 'scalar_number', 0, 0),
'h' : (1, 'scalar_number', 0, 0), b'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0), b'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0), b'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0), b'lastWord' : (1, 'scalar_number', 0, 0),
'rootID' : (1, 'scalar_number', 0, 0), b'rootID' : (1, 'scalar_number', 0, 0),
'stemID' : (1, 'scalar_number', 0, 0), b'stemID' : (1, 'scalar_number', 0, 0),
'type' : (1, 'scalar_text', 0, 0), b'type' : (1, 'scalar_text', 0, 0),
'info' : (0, 'number', 1, 0), b'info' : (0, 'number', 1, 0),
'info.word' : (0, 'number', 1, 1), b'info.word' : (0, 'number', 1, 1),
'info.word.ocrText' : (1, 'text', 0, 0), b'info.word.ocrText' : (1, 'text', 0, 0),
'info.word.firstGlyph' : (1, 'raw', 0, 0), b'info.word.firstGlyph' : (1, 'raw', 0, 0),
'info.word.lastGlyph' : (1, 'raw', 0, 0), b'info.word.lastGlyph' : (1, 'raw', 0, 0),
'info.word.bl' : (1, 'raw', 0, 0), b'info.word.bl' : (1, 'raw', 0, 0),
'info.word.link_id' : (1, 'number', 0, 0), b'info.word.link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1), b'glyph' : (0, 'number', 1, 1),
'glyph.x' : (1, 'number', 0, 0), b'glyph.x' : (1, 'number', 0, 0),
'glyph.y' : (1, 'number', 0, 0), b'glyph.y' : (1, 'number', 0, 0),
'glyph.glyphID' : (1, 'number', 0, 0), b'glyph.glyphID' : (1, 'number', 0, 0),
'dehyphen' : (0, 'number', 1, 1), b'dehyphen' : (0, 'number', 1, 1),
'dehyphen.rootID' : (1, 'number', 0, 0), b'dehyphen.rootID' : (1, 'number', 0, 0),
'dehyphen.stemID' : (1, 'number', 0, 0), b'dehyphen.stemID' : (1, 'number', 0, 0),
'dehyphen.stemPage' : (1, 'number', 0, 0), b'dehyphen.stemPage' : (1, 'number', 0, 0),
'dehyphen.sh' : (1, 'number', 0, 0), b'dehyphen.sh' : (1, 'number', 0, 0),
'links' : (0, 'number', 1, 1), b'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0), b'links.page' : (1, 'number', 0, 0),
'links.rel' : (1, 'number', 0, 0), b'links.rel' : (1, 'number', 0, 0),
'links.row' : (1, 'number', 0, 0), b'links.row' : (1, 'number', 0, 0),
'links.title' : (1, 'text', 0, 0), b'links.title' : (1, 'text', 0, 0),
'links.href' : (1, 'text', 0, 0), b'links.href' : (1, 'text', 0, 0),
'links.type' : (1, 'text', 0, 0), b'links.type' : (1, 'text', 0, 0),
'links.id' : (1, 'number', 0, 0), b'links.id' : (1, 'number', 0, 0),
'paraCont' : (0, 'number', 1, 1), b'paraCont' : (0, 'number', 1, 1),
'paraCont.rootID' : (1, 'number', 0, 0), b'paraCont.rootID' : (1, 'number', 0, 0),
'paraCont.stemID' : (1, 'number', 0, 0), b'paraCont.stemID' : (1, 'number', 0, 0),
'paraCont.stemPage' : (1, 'number', 0, 0), b'paraCont.stemPage' : (1, 'number', 0, 0),
'paraStems' : (0, 'number', 1, 1), b'paraStems' : (0, 'number', 1, 1),
'paraStems.stemID' : (1, 'number', 0, 0), b'paraStems.stemID' : (1, 'number', 0, 0),
'wordStems' : (0, 'number', 1, 1), b'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0), b'wordStems.stemID' : (1, 'number', 0, 0),
'empty' : (1, 'snippets', 1, 0), b'empty' : (1, 'snippets', 1, 0),
'page' : (1, 'snippets', 1, 0), b'page' : (1, 'snippets', 1, 0),
'page.class' : (1, 'scalar_text', 0, 0), b'page.class' : (1, 'scalar_text', 0, 0),
'page.pageid' : (1, 'scalar_text', 0, 0), b'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0), b'page.pagelabel' : (1, 'scalar_text', 0, 0),
'page.type' : (1, 'scalar_text', 0, 0), b'page.type' : (1, 'scalar_text', 0, 0),
'page.h' : (1, 'scalar_number', 0, 0), b'page.h' : (1, 'scalar_number', 0, 0),
'page.w' : (1, 'scalar_number', 0, 0), b'page.w' : (1, 'scalar_number', 0, 0),
'page.startID' : (1, 'scalar_number', 0, 0), b'page.startID' : (1, 'scalar_number', 0, 0),
'group' : (1, 'snippets', 1, 0), b'group' : (1, 'snippets', 1, 0),
'group.class' : (1, 'scalar_text', 0, 0), b'group.class' : (1, 'scalar_text', 0, 0),
'group.type' : (1, 'scalar_text', 0, 0), b'group.type' : (1, 'scalar_text', 0, 0),
'group._tag' : (1, 'scalar_text', 0, 0), b'group._tag' : (1, 'scalar_text', 0, 0),
'group.orientation': (1, 'scalar_text', 0, 0), b'group.orientation': (1, 'scalar_text', 0, 0),
'region' : (1, 'snippets', 1, 0), b'region' : (1, 'snippets', 1, 0),
'region.class' : (1, 'scalar_text', 0, 0), b'region.class' : (1, 'scalar_text', 0, 0),
'region.type' : (1, 'scalar_text', 0, 0), b'region.type' : (1, 'scalar_text', 0, 0),
'region.x' : (1, 'scalar_number', 0, 0), b'region.x' : (1, 'scalar_number', 0, 0),
'region.y' : (1, 'scalar_number', 0, 0), b'region.y' : (1, 'scalar_number', 0, 0),
'region.h' : (1, 'scalar_number', 0, 0), b'region.h' : (1, 'scalar_number', 0, 0),
'region.w' : (1, 'scalar_number', 0, 0), b'region.w' : (1, 'scalar_number', 0, 0),
'region.orientation' : (1, 'scalar_text', 0, 0), b'region.orientation' : (1, 'scalar_text', 0, 0),
'empty_text_region' : (1, 'snippets', 1, 0), b'empty_text_region' : (1, 'snippets', 1, 0),
'img' : (1, 'snippets', 1, 0), b'img' : (1, 'snippets', 1, 0),
'img.x' : (1, 'scalar_number', 0, 0), b'img.x' : (1, 'scalar_number', 0, 0),
'img.y' : (1, 'scalar_number', 0, 0), b'img.y' : (1, 'scalar_number', 0, 0),
'img.h' : (1, 'scalar_number', 0, 0), b'img.h' : (1, 'scalar_number', 0, 0),
'img.w' : (1, 'scalar_number', 0, 0), b'img.w' : (1, 'scalar_number', 0, 0),
'img.src' : (1, 'scalar_number', 0, 0), b'img.src' : (1, 'scalar_number', 0, 0),
'img.color_src' : (1, 'scalar_number', 0, 0), b'img.color_src' : (1, 'scalar_number', 0, 0),
'img.gridSize' : (1, 'scalar_number', 0, 0), b'img.gridSize' : (1, 'scalar_number', 0, 0),
'img.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'img.gridTopCenter' : (1, 'scalar_number', 0, 0), b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
'img.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'img.gridEndCenter' : (1, 'scalar_number', 0, 0), b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
'img.image_type' : (1, 'scalar_number', 0, 0), b'img.image_type' : (1, 'scalar_number', 0, 0),
'paragraph' : (1, 'snippets', 1, 0), b'paragraph' : (1, 'snippets', 1, 0),
'paragraph.class' : (1, 'scalar_text', 0, 0), b'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0), b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0), b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0), b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.gridSize' : (1, 'scalar_number', 0, 0), b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0), b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0), b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word_semantic' : (1, 'snippets', 1, 1), b'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0), b'word_semantic.type' : (1, 'scalar_text', 0, 0),
'word_semantic.class' : (1, 'scalar_text', 0, 0), b'word_semantic.class' : (1, 'scalar_text', 0, 0),
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0), b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0), b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0), b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word' : (1, 'snippets', 1, 0), b'word' : (1, 'snippets', 1, 0),
'word.type' : (1, 'scalar_text', 0, 0), b'word.type' : (1, 'scalar_text', 0, 0),
'word.class' : (1, 'scalar_text', 0, 0), b'word.class' : (1, 'scalar_text', 0, 0),
'word.firstGlyph' : (1, 'scalar_number', 0, 0), b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
'word.lastGlyph' : (1, 'scalar_number', 0, 0), b'word.lastGlyph' : (1, 'scalar_number', 0, 0),
'_span' : (1, 'snippets', 1, 0), b'_span' : (1, 'snippets', 1, 0),
'_span.class' : (1, 'scalar_text', 0, 0), b'_span.class' : (1, 'scalar_text', 0, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0), b'_span.firstWord' : (1, 'scalar_number', 0, 0),
'_span.lastWord' : (1, 'scalar_number', 0, 0), b'_span.lastWord' : (1, 'scalar_number', 0, 0),
'_span.gridSize' : (1, 'scalar_number', 0, 0), b'_span.gridSize' : (1, 'scalar_number', 0, 0),
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0), b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'_span.gridEndCenter' : (1, 'scalar_number', 0, 0), b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0), b'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0), b'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0), b'span.lastWord' : (1, 'scalar_number', 0, 0),
'span.gridSize' : (1, 'scalar_number', 0, 0), b'span.gridSize' : (1, 'scalar_number', 0, 0),
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'span.gridTopCenter' : (1, 'scalar_number', 0, 0), b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'span.gridEndCenter' : (1, 'scalar_number', 0, 0), b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0), b'extratokens' : (1, 'snippets', 1, 0),
'extratokens.class' : (1, 'scalar_text', 0, 0), b'extratokens.class' : (1, 'scalar_text', 0, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0), b'extratokens.type' : (1, 'scalar_text', 0, 0),
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0), b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0), b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.gridSize' : (1, 'scalar_number', 0, 0), b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0), b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0), b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0), b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0), b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
'glyph.h' : (1, 'number', 0, 0), b'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0), b'glyph.w' : (1, 'number', 0, 0),
'glyph.use' : (1, 'number', 0, 0), b'glyph.use' : (1, 'number', 0, 0),
'glyph.vtx' : (1, 'number', 0, 1), b'glyph.vtx' : (1, 'number', 0, 1),
'glyph.len' : (1, 'number', 0, 1), b'glyph.len' : (1, 'number', 0, 1),
'glyph.dpi' : (1, 'number', 0, 0), b'glyph.dpi' : (1, 'number', 0, 0),
'vtx' : (0, 'number', 1, 1), b'vtx' : (0, 'number', 1, 1),
'vtx.x' : (1, 'number', 0, 0), b'vtx.x' : (1, 'number', 0, 0),
'vtx.y' : (1, 'number', 0, 0), b'vtx.y' : (1, 'number', 0, 0),
'len' : (0, 'number', 1, 1), b'len' : (0, 'number', 1, 1),
'len.n' : (1, 'number', 0, 0), b'len.n' : (1, 'number', 0, 0),
'book' : (1, 'snippets', 1, 0), b'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0), b'version' : (1, 'snippets', 1, 0),
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0), b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0), b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.Schema_id' : (1, 'scalar_text', 0, 0), b'version.Schema_id' : (1, 'scalar_text', 0, 0),
'version.Schema_version' : (1, 'scalar_text', 0, 0), b'version.Schema_version' : (1, 'scalar_text', 0, 0),
'version.Topaz_version' : (1, 'scalar_text', 0, 0), b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.chapterheaders' : (1, 'scalar_text', 0, 0), b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
'version.creation_date' : (1, 'scalar_text', 0, 0), b'version.creation_date' : (1, 'scalar_text', 0, 0),
'version.header_footer' : (1, 'scalar_text', 0, 0), b'version.header_footer' : (1, 'scalar_text', 0, 0),
'version.init_from_ocr' : (1, 'scalar_text', 0, 0), b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
'version.letter_insertion' : (1, 'scalar_text', 0, 0), b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0), b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0), b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0), b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
'version.findlists' : (1, 'scalar_text', 0, 0), b'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0), b'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0), b'version.page_type' : (1, 'scalar_text', 0, 0),
'version.bad_text' : (1, 'scalar_text', 0, 0), b'version.bad_text' : (1, 'scalar_text', 0, 0),
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
'version.margins' : (1, 'scalar_text', 0, 0), b'version.margins' : (1, 'scalar_text', 0, 0),
'version.staggered_lines' : (1, 'scalar_text', 0, 0), b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
'version.toc' : (1, 'scalar_text', 0, 0), b'version.toc' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0), b'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0), b'style' : (1, 'snippets', 1, 0),
'style._tag' : (1, 'scalar_text', 0, 0), b'style._tag' : (1, 'scalar_text', 0, 0),
'style.type' : (1, 'scalar_text', 0, 0), b'style.type' : (1, 'scalar_text', 0, 0),
'style._after_type' : (1, 'scalar_text', 0, 0), b'style._after_type' : (1, 'scalar_text', 0, 0),
'style._parent_type' : (1, 'scalar_text', 0, 0), b'style._parent_type' : (1, 'scalar_text', 0, 0),
'style._after_parent_type' : (1, 'scalar_text', 0, 0), b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
'style.class' : (1, 'scalar_text', 0, 0), b'style.class' : (1, 'scalar_text', 0, 0),
'style._after_class' : (1, 'scalar_text', 0, 0), b'style._after_class' : (1, 'scalar_text', 0, 0),
'rule' : (1, 'snippets', 1, 0), b'rule' : (1, 'snippets', 1, 0),
'rule.attr' : (1, 'scalar_text', 0, 0), b'rule.attr' : (1, 'scalar_text', 0, 0),
'rule.value' : (1, 'scalar_text', 0, 0), b'rule.value' : (1, 'scalar_text', 0, 0),
'original' : (0, 'number', 1, 1), b'original' : (0, 'number', 1, 1),
'original.pnum' : (1, 'number', 0, 0), b'original.pnum' : (1, 'number', 0, 0),
'original.pid' : (1, 'text', 0, 0), b'original.pid' : (1, 'text', 0, 0),
'pages' : (0, 'number', 1, 1), b'pages' : (0, 'number', 1, 1),
'pages.ref' : (1, 'number', 0, 0), b'pages.ref' : (1, 'number', 0, 0),
'pages.id' : (1, 'number', 0, 0), b'pages.id' : (1, 'number', 0, 0),
'startID' : (0, 'number', 1, 1), b'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0), b'startID.page' : (1, 'number', 0, 0),
'startID.id' : (1, 'number', 0, 0), b'startID.id' : (1, 'number', 0, 0),
'median_d' : (1, 'number', 0, 0), b'median_d' : (1, 'number', 0, 0),
'median_h' : (1, 'number', 0, 0), b'median_h' : (1, 'number', 0, 0),
'median_firsty' : (1, 'number', 0, 0), b'median_firsty' : (1, 'number', 0, 0),
'median_lasty' : (1, 'number', 0, 0), b'median_lasty' : (1, 'number', 0, 0),
'num_footers_maybe' : (1, 'number', 0, 0), b'num_footers_maybe' : (1, 'number', 0, 0),
'num_footers_yes' : (1, 'number', 0, 0), b'num_footers_yes' : (1, 'number', 0, 0),
'num_headers_maybe' : (1, 'number', 0, 0), b'num_headers_maybe' : (1, 'number', 0, 0),
'num_headers_yes' : (1, 'number', 0, 0), b'num_headers_yes' : (1, 'number', 0, 0),
'tracking' : (1, 'number', 0, 0), b'tracking' : (1, 'number', 0, 0),
'src' : (1, 'text', 0, 0), b'src' : (1, 'text', 0, 0),
} }
@ -430,7 +430,7 @@ class PageParser(object):
cnt = len(self.tagpath) cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i] if i < cnt : result = self.tagpath[i]
for j in range(i+1, cnt) : for j in range(i+1, cnt) :
result += '.' + self.tagpath[j] result += b'.' + self.tagpath[j]
return result return result
@ -505,7 +505,7 @@ class PageParser(object):
if (subtags == 1): if (subtags == 1):
ntags = readEncodedNumber(self.fo) ntags = readEncodedNumber(self.fo)
if self.debug : print('subtags: ' + token + ' has ' + str(ntags)) if self.debug : print('subtags: ', token , ' has ' , str(ntags))
for j in range(ntags): for j in range(ntags):
val = readEncodedNumber(self.fo) val = readEncodedNumber(self.fo)
subtagres.append(self.procToken(self.dict.lookup(val))) subtagres.append(self.procToken(self.dict.lookup(val)))
@ -613,7 +613,7 @@ class PageParser(object):
subtagList = tag[1] subtagList = tag[1]
argtype = tag[2] argtype = tag[2]
argList = tag[3] argList = tag[3]
nname = prefix + '.' + name nname = prefix + b'.' + name
nsubtaglist = [] nsubtaglist = []
for j in subtagList: for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix)) nsubtaglist.append(self.updateName(j,prefix))
@ -662,34 +662,34 @@ class PageParser(object):
subtagList = node[1] subtagList = node[1]
argtype = node[2] argtype = node[2]
argList = node[3] argList = node[3]
fullpathname = name.split('.') fullpathname = name.split(b'.')
nodename = fullpathname.pop() nodename = fullpathname.pop()
ilvl = len(fullpathname) ilvl = len(fullpathname)
indent = ' ' * (3 * ilvl) indent = b' ' * (3 * ilvl)
rlst = [] rlst = []
rlst.append(indent + '<' + nodename + '>') rlst.append(indent + b'<' + nodename + b'>')
if len(argList) > 0: if len(argList) > 0:
alst = [] alst = []
for j in argList: for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') : if (argtype == b'text') or (argtype == b'scalar_text') :
alst.append(j + '|') alst.append(j + b'|')
else : else :
alst.append(str(j) + ',') alst.append(str(j).encode('utf-8') + b',')
argres = "".join(alst) argres = b"".join(alst)
argres = argres[0:-1] argres = argres[0:-1]
if argtype == 'snippets' : if argtype == b'snippets' :
rlst.append('snippets:' + argres) rlst.append(b'snippets:' + argres)
else : else :
rlst.append(argres) rlst.append(argres)
if len(subtagList) > 0 : if len(subtagList) > 0 :
rlst.append('\n') rlst.append(b'\n')
for j in subtagList: for j in subtagList:
if len(j) > 0 : if len(j) > 0 :
rlst.append(self.formatTag(j)) rlst.append(self.formatTag(j))
rlst.append(indent + '</' + nodename + '>\n') rlst.append(indent + b'</' + nodename + b'>\n')
else: else:
rlst.append('</' + nodename + '>\n') rlst.append(b'</' + nodename + b'>\n')
return "".join(rlst) return b"".join(rlst)
# flatten tag # flatten tag
@ -704,20 +704,20 @@ class PageParser(object):
alst = [] alst = []
for j in argList: for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') : if (argtype == 'text') or (argtype == 'scalar_text') :
alst.append(j + '|') alst.append(j + b'|')
else : else :
alst.append(str(j) + '|') alst.append(str(j).encode('utf-8') + b'|')
argres = "".join(alst) argres = b"".join(alst)
argres = argres[0:-1] argres = argres[0:-1]
if argtype == 'snippets' : if argtype == b'snippets' :
rlst.append('.snippets=' + argres) rlst.append(b'.snippets=' + argres)
else : else :
rlst.append('=' + argres) rlst.append(b'=' + argres)
rlst.append('\n') rlst.append(b'\n')
for j in subtagList: for j in subtagList:
if len(j) > 0 : if len(j) > 0 :
rlst.append(self.flattenTag(j)) rlst.append(self.flattenTag(j))
return "".join(rlst) return b"".join(rlst)
# reduce create xml output # reduce create xml output
@ -729,7 +729,7 @@ class PageParser(object):
rlst.append(self.flattenTag(j)) rlst.append(self.flattenTag(j))
else: else:
rlst.append(self.formatTag(j)) rlst.append(self.formatTag(j))
result = "".join(rlst) result = b"".join(rlst)
if self.debug : print(result) if self.debug : print(result)
return result return result
@ -747,16 +747,16 @@ class PageParser(object):
# peek at the first bytes to see what type of file it is # peek at the first bytes to see what type of file it is
magic = self.fo.read(9) magic = self.fo.read(9)
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
first_token = 'info' first_token = b'info'
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
skip = self.fo.read(2) skip = self.fo.read(2)
first_token = 'info' first_token = b'info'
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'): elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
first_token = 'info' first_token = b'info'
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
skip = self.fo.read(3) skip = self.fo.read(3)
first_token = 'info' first_token = b'info'
else : else :
# other0.dat file # other0.dat file
first_token = None first_token = None
@ -778,7 +778,7 @@ class PageParser(object):
break break
if (v == 0x72): if (v == 0x72):
self.doLoop72('number') self.doLoop72(b'number')
elif (v > 0) and (v < self.dict.getSize()) : elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v)) tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 : if len(tag) > 0 :
@ -789,7 +789,7 @@ class PageParser(object):
if (v == 0): if (v == 0):
if (self.peek(1) == 0x5f): if (self.peek(1) == 0x5f):
skip = self.fo.read(1) skip = self.fo.read(1)
first_token = 'info' first_token = b'info'
# now do snippet injection # now do snippet injection
if len(self.snippetList) > 0 : if len(self.snippetList) > 0 :
@ -809,14 +809,14 @@ class PageParser(object):
def fromData(dict, fname): def fromData(dict, fname):
flat_xml = True flat_xml = True
debug = False debug = True
pp = PageParser(fname, dict, debug, flat_xml) pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process() xmlpage = pp.process()
return xmlpage return xmlpage
def getXML(dict, fname): def getXML(dict, fname):
flat_xml = False flat_xml = False
debug = False debug = True
pp = PageParser(fname, dict, debug, flat_xml) pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process() xmlpage = pp.process()
return xmlpage return xmlpage
@ -845,7 +845,7 @@ def main(argv):
sys.stderr=SafeUnbuffered(sys.stderr) sys.stderr=SafeUnbuffered(sys.stderr)
dictFile = "" dictFile = ""
pageFile = "" pageFile = ""
debug = False debug = True
flat_xml = False flat_xml = False
printOutput = False printOutput = False
if len(argv) == 0: if len(argv) == 0:

View file

@ -7,6 +7,7 @@ import csv
import os import os
import math import math
import getopt import getopt
import functools
from struct import pack from struct import pack
from struct import unpack from struct import unpack
@ -15,14 +16,14 @@ class DocParser(object):
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage): def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
self.id = os.path.basename(fileid).replace('.dat','') self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0 self.svgcount = 0
self.docList = flatxml.split('\n') self.docList = flatxml.split(b'\n')
self.docSize = len(self.docList) self.docSize = len(self.docList)
self.classList = {} self.classList = {}
self.bookDir = bookDir self.bookDir = bookDir
self.gdict = gdict self.gdict = gdict
tmpList = classlst.split('\n') tmpList = classlst.split('\n')
for pclass in tmpList: for pclass in tmpList:
if pclass != '': if pclass != b'':
# remove the leading period from the css name # remove the leading period from the css name
cname = pclass[1:] cname = pclass[1:]
self.classList[cname] = True self.classList[cname] = True
@ -57,9 +58,9 @@ class DocParser(object):
imgfile = os.path.join(imgDir,imgname) imgfile = os.path.join(imgDir,imgname)
# get glyph information # get glyph information
gxList = self.getData('info.glyph.x',0,-1) gxList = self.getData(b'info.glyph.x',0,-1)
gyList = self.getData('info.glyph.y',0,-1) gyList = self.getData(b'info.glyph.y',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1) gidList = self.getData(b'info.glyph.glyphID',0,-1)
gids = [] gids = []
maxws = [] maxws = []
@ -122,11 +123,11 @@ class DocParser(object):
def lineinDoc(self, pos) : def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) : if (pos >= 0) and (pos < self.docSize) :
item = self.docList[pos] item = self.docList[pos]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argres) = item.split('=',1) (name, argres) = item.split(b'=',1)
else : else :
name = item name = item
argres = '' argres = b''
return name, argres return name, argres
@ -140,11 +141,13 @@ class DocParser(object):
foundat = -1 foundat = -1
for j in range(pos, end): for j in range(pos, end):
item = self.docList[j] item = self.docList[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argres) = item.split('=',1) (name, argres) = item.split(b'=',1)
else : else :
name = item name = item
argres = '' argres = ''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) : if name.endswith(tagpath) :
result = argres result = argres
foundat = j foundat = j
@ -170,7 +173,7 @@ class DocParser(object):
argres=[] argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end) (foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) : if (argt != None) and (len(argt) > 0) :
argList = argt.split('|') argList = argt.split(b'|')
argres = [ int(strval) for strval in argList] argres = [ int(strval) for strval in argList]
return argres return argres
@ -191,21 +194,21 @@ class DocParser(object):
# also some class names have spaces in them so need to convert to dashes # also some class names have spaces in them so need to convert to dashes
if nclass != None : if nclass != None :
nclass = nclass.replace(' ','-') nclass = nclass.replace(b' ',b'-')
classres = '' classres = b''
nclass = nclass.lower() nclass = nclass.lower()
nclass = 'cl-' + nclass nclass = b'cl-' + nclass
baseclass = '' baseclass = b''
# graphic is the base class for captions # graphic is the base class for captions
if nclass.find('cl-cap-') >=0 : if nclass.find(b'cl-cap-') >=0 :
classres = 'graphic' + ' ' classres = b'graphic' + b' '
else : else :
# strip to find baseclass # strip to find baseclass
p = nclass.find('_') p = nclass.find(b'_')
if p > 0 : if p > 0 :
baseclass = nclass[0:p] baseclass = nclass[0:p]
if baseclass in self.classList: if baseclass in self.classList:
classres += baseclass + ' ' classres += baseclass + b' '
classres += nclass classres += nclass
nclass = classres nclass = classres
return nclass return nclass
@ -225,11 +228,11 @@ class DocParser(object):
return -1 return -1
result = [] result = []
(pos, pagetype) = self.findinDoc('page.type',0,-1) (pos, pagetype) = self.findinDoc(b'page.type',0,-1)
groupList = self.posinDoc('page.group') groupList = self.posinDoc(b'page.group')
groupregionList = self.posinDoc('page.group.region') groupregionList = self.posinDoc(b'page.group.region')
pageregionList = self.posinDoc('page.region') pageregionList = self.posinDoc(b'page.region')
# integrate into one list # integrate into one list
for j in groupList: for j in groupList:
result.append(('grpbeg',j)) result.append(('grpbeg',j))
@ -237,7 +240,7 @@ class DocParser(object):
result.append(('gregion',j)) result.append(('gregion',j))
for j in pageregionList: for j in pageregionList:
result.append(('pregion',j)) result.append(('pregion',j))
result.sort(compare) result.sort(key=functools.cmp_to_key(compare))
# insert group end and page end indicators # insert group end and page end indicators
inGroup = False inGroup = False
@ -267,33 +270,33 @@ class DocParser(object):
result = [] result = []
# paragraph # paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end) (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)
pclass = self.getClass(pclass) pclass = self.getClass(pclass)
# if paragraph uses extratokens (extra glyphs) then make it fixed # if paragraph uses extratokens (extra glyphs) then make it fixed
(pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end) (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)
# build up a description of the paragraph in result and return it # build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph # first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end) (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) : if (sfirst != None) and (slast != None) :
first = int(sfirst) first = int(sfirst)
last = int(slast) last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table') makeImage = (regtype == b'vertical') or (regtype == b'table')
makeImage = makeImage or (extraglyphs != None) makeImage = makeImage or (extraglyphs != None)
if self.fixedimage: if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed') makeImage = makeImage or (regtype == b'fixed')
if (pclass != None): if (pclass != None):
makeImage = makeImage or (pclass.find('.inverted') >= 0) makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
if self.fixedimage : if self.fixedimage :
makeImage = makeImage or (pclass.find('cl-f-') >= 0) makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)
# before creating an image make sure glyph info exists # before creating an image make sure glyph info exists
gidList = self.getData('info.glyph.glyphID',0,-1) gidList = self.getData(b'info.glyph.glyphID',0,-1)
makeImage = makeImage & (len(gidList) > 0) makeImage = makeImage & (len(gidList) > 0)
@ -307,8 +310,8 @@ class DocParser(object):
# translate first and last word into first and last glyphs # translate first and last word into first and last glyphs
# and generate inline image and include it # and generate inline image and include it
glyphList = [] glyphList = []
firstglyphList = self.getData('word.firstGlyph',0,-1) firstglyphList = self.getData(b'word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1) gidList = self.getData(b'info.glyph.glyphID',0,-1)
firstGlyph = firstglyphList[first] firstGlyph = firstglyphList[first]
if last < len(firstglyphList): if last < len(firstglyphList):
lastGlyph = firstglyphList[last] lastGlyph = firstglyphList[last]
@ -326,8 +329,8 @@ class DocParser(object):
for glyphnum in range(firstGlyph, lastGlyph): for glyphnum in range(firstGlyph, lastGlyph):
glyphList.append(glyphnum) glyphList.append(glyphnum)
# include any extratokens if they exist # include any extratokens if they exist
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end) (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end) (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
if (sfg != None) and (slg != None): if (sfg != None) and (slg != None):
for glyphnum in range(int(sfg), int(slg)): for glyphnum in range(int(sfg), int(slg)):
glyphList.append(glyphnum) glyphList.append(glyphnum)
@ -368,39 +371,39 @@ class DocParser(object):
(name, argres) = self.lineinDoc(line) (name, argres) = self.lineinDoc(line)
if name.endswith('span.firstWord') : if name.endswith(b'span.firstWord') :
sp_first = int(argres) sp_first = int(argres)
elif name.endswith('span.lastWord') : elif name.endswith(b'span.lastWord') :
sp_last = int(argres) sp_last = int(argres)
elif name.endswith('word.firstGlyph') : elif name.endswith(b'word.firstGlyph') :
gl_first = int(argres) gl_first = int(argres)
elif name.endswith('word.lastGlyph') : elif name.endswith(b'word.lastGlyph') :
gl_last = int(argres) gl_last = int(argres)
elif name.endswith('word_semantic.firstWord'): elif name.endswith(b'word_semantic.firstWord'):
ws_first = int(argres) ws_first = int(argres)
elif name.endswith('word_semantic.lastWord'): elif name.endswith(b'word_semantic.lastWord'):
ws_last = int(argres) ws_last = int(argres)
elif name.endswith('word.class'): elif name.endswith(b'word.class'):
# we only handle spaceafter word class # we only handle spaceafter word class
try: try:
(cname, space) = argres.split('-',1) (cname, space) = argres.split(b'-',1)
if space == '' : space = '0' if space == b'' : space = b'0'
if (cname == 'spaceafter') and (int(space) > 0) : if (cname == b'spaceafter') and (int(space) > 0) :
word_class = 'sa' word_class = 'sa'
except: except:
pass pass
elif name.endswith('word.img.src'): elif name.endswith(b'word.img.src'):
result.append(('img' + word_class, int(argres))) result.append(('img' + word_class, int(argres)))
word_class = '' word_class = ''
elif name.endswith('region.img.src'): elif name.endswith(b'region.img.src'):
result.append(('img' + word_class, int(argres))) result.append(('img' + word_class, int(argres)))
if (sp_first != -1) and (sp_last != -1): if (sp_first != -1) and (sp_last != -1):
@ -437,7 +440,7 @@ class DocParser(object):
classres = '' classres = ''
if pclass : if pclass :
classres = ' class="' + pclass + '"' classres = ' class="' + pclass.decode('utf-8') + '"'
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical') br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
@ -470,8 +473,8 @@ class DocParser(object):
if (link > 0): if (link > 0):
linktype = self.link_type[link-1] linktype = self.link_type[link-1]
title = self.link_title[link-1] title = self.link_title[link-1]
if (title == "") or (parares.rfind(title) < 0): if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
title=parares[lstart:] title=parares[lstart:].encode('utf-8')
if linktype == 'external' : if linktype == 'external' :
linkhref = self.link_href[link-1] linkhref = self.link_href[link-1]
linkhtml = '<a href="%s">' % linkhref linkhtml = '<a href="%s">' % linkhref
@ -482,33 +485,34 @@ class DocParser(object):
else : else :
# just link to the current page # just link to the current page
linkhtml = '<a href="#' + self.id + '">' linkhtml = '<a href="#' + self.id + '">'
linkhtml += title + '</a>' linkhtml += title.decode('utf-8')
pos = parares.rfind(title) linkhtml += '</a>'
pos = parares.rfind(title.decode('utf-8'))
if pos >= 0: if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):] parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else : else :
parares += linkhtml parares += linkhtml
lstart = len(parares) lstart = len(parares)
if word == '_link_' : word = '' if word == b'_link_' : word = b''
elif (link < 0) : elif (link < 0) :
if word == '_link_' : word = '' if word == b'_link_' : word = b''
if word == '_lb_': if word == b'_lb_':
if ((num-1) in self.dehyphen_rootid ) or handle_links: if ((num-1) in self.dehyphen_rootid ) or handle_links:
word = '' word = b''
sep = '' sep = ''
elif br_lb : elif br_lb :
word = '<br />\n' word = b'<br />\n'
sep = '' sep = ''
else : else :
word = '\n' word = b'\n'
sep = '' sep = ''
if num in self.dehyphen_rootid : if num in self.dehyphen_rootid :
word = word[0:-1] word = word[0:-1]
sep = '' sep = ''
parares += word + sep parares += word.decode('utf-8') + sep
elif wtype == 'img' : elif wtype == 'img' :
sep = '' sep = ''
@ -522,7 +526,9 @@ class DocParser(object):
elif wtype == 'svg' : elif wtype == 'svg' :
sep = '' sep = ''
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num parares += '<img src="img/'
parares += self.id
parares += '_%04d.svg" alt="" />' % num
parares += sep parares += sep
if len(sep) > 0 : parares = parares[0:-1] if len(sep) > 0 : parares = parares[0:-1]
@ -545,7 +551,7 @@ class DocParser(object):
(wtype, num) = pdesc[j] (wtype, num) = pdesc[j]
if wtype == 'ocr' : if wtype == 'ocr' :
word = self.ocrtext[num] word = self.ocrtext[num].decode('utf-8')
sep = ' ' sep = ' '
if handle_links: if handle_links:
@ -553,7 +559,7 @@ class DocParser(object):
if (link > 0): if (link > 0):
linktype = self.link_type[link-1] linktype = self.link_type[link-1]
title = self.link_title[link-1] title = self.link_title[link-1]
title = title.rstrip('. ') title = title.rstrip(b'. ')
alt_title = parares[lstart:] alt_title = parares[lstart:]
alt_title = alt_title.strip() alt_title = alt_title.strip()
# now strip off the actual printed page number # now strip off the actual printed page number
@ -607,38 +613,38 @@ class DocParser(object):
hlst = [] hlst = []
# get the ocr text # get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1) (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
if argres : self.ocrtext = argres.split('|') if argres : self.ocrtext = argres.split(b'|')
# get information to dehyphenate the text # get information to dehyphenate the text
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1) self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page # determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
first_para_continued = (self.parastems_stemid != None) first_para_continued = (self.parastems_stemid != None)
# determine if last paragraph is continued onto the next page # determine if last paragraph is continued onto the next page
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
last_para_continued = (self.paracont_stemid != None) last_para_continued = (self.paracont_stemid != None)
# collect link ids # collect link ids
self.link_id = self.getData('info.word.link_id',0,-1) self.link_id = self.getData(b'info.word.link_id',0,-1)
# collect link destination page numbers # collect link destination page numbers
self.link_page = self.getData('info.links.page',0,-1) self.link_page = self.getData(b'info.links.page',0,-1)
# collect link types (container versus external) # collect link types (container versus external)
(pos, argres) = self.findinDoc('info.links.type',0,-1) (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
if argres : self.link_type = argres.split('|') if argres : self.link_type = argres.split(b'|')
# collect link destinations # collect link destinations
(pos, argres) = self.findinDoc('info.links.href',0,-1) (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
if argres : self.link_href = argres.split('|') if argres : self.link_href = argres.split(b'|')
# collect link titles # collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1) (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
if argres : if argres :
self.link_title = argres.split('|') self.link_title = argres.split(b'|')
else: else:
self.link_title.append('') self.link_title.append('')
@ -662,51 +668,51 @@ class DocParser(object):
# set anchor for link target on this page # set anchor for link target on this page
if not anchorSet and not first_para_continued: if not anchorSet and not first_para_continued:
hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="') hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n') hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
anchorSet = True anchorSet = True
# handle groups of graphics with text captions # handle groups of graphics with text captions
if (etype == 'grpbeg'): if (etype == b'grpbeg'):
(pos, grptype) = self.findinDoc('group.type', start, end) (pos, grptype) = self.findinDoc(b'group.type', start, end)
if grptype != None: if grptype != None:
if grptype == 'graphic': if grptype == b'graphic':
gcstr = ' class="' + grptype + '"' gcstr = ' class="' + grptype.decode('utf-8') + '"'
hlst.append('<div' + gcstr + '>') hlst.append('<div' + gcstr + '>')
inGroup = True inGroup = True
elif (etype == 'grpend'): elif (etype == b'grpend'):
if inGroup: if inGroup:
hlst.append('</div>\n') hlst.append('</div>\n')
inGroup = False inGroup = False
else: else:
(pos, regtype) = self.findinDoc('region.type',start,end) (pos, regtype) = self.findinDoc(b'region.type',start,end)
if regtype == 'graphic' : if regtype == b'graphic' :
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc: if simgsrc:
if inGroup: if inGroup:
hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)) hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
else: else:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
elif regtype == 'chapterheading' : elif regtype == b'chapterheading' :
(pclass, pdesc) = self.getParaDescription(start,end, regtype) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not breakSet: if not breakSet:
hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n') hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
breakSet = True breakSet = True
tag = 'h1' tag = 'h1'
if pclass and (len(pclass) >= 7): if pclass and (len(pclass) >= 7):
if pclass[3:7] == 'ch1-' : tag = 'h1' if pclass[3:7] == b'ch1-' : tag = 'h1'
if pclass[3:7] == 'ch2-' : tag = 'h2' if pclass[3:7] == b'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3' if pclass[3:7] == b'ch3-' : tag = 'h3'
hlst.append('<' + tag + ' class="' + pclass + '">') hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
else: else:
hlst.append('<' + tag + '>') hlst.append('<' + tag + '>')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>') hlst.append('</' + tag + '>')
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'): elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
if first_para_continued : if first_para_continued :
@ -715,16 +721,16 @@ class DocParser(object):
(pclass, pdesc) = self.getParaDescription(start,end, regtype) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if pclass and (len(pclass) >= 6) and (ptype == 'full'): if pclass and (len(pclass) >= 6) and (ptype == 'full'):
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == b'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5' if pclass[3:6] == b'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6' if pclass[3:6] == b'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass + '">') hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>') hlst.append('</' + tag + '>')
else : else :
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'tocentry') : elif (regtype == b'tocentry') :
ptype = 'full' ptype = 'full'
if first_para_continued : if first_para_continued :
ptype = 'end' ptype = 'end'
@ -733,7 +739,7 @@ class DocParser(object):
tocinfo += self.buildTOCEntry(pdesc) tocinfo += self.buildTOCEntry(pdesc)
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'vertical') or (regtype == 'table') : elif (regtype == b'vertical') or (regtype == b'table') :
ptype = 'full' ptype = 'full'
if inGroup: if inGroup:
ptype = 'middle' ptype = 'middle'
@ -744,19 +750,19 @@ class DocParser(object):
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
elif (regtype == 'synth_fcvr.center'): elif (regtype == b'synth_fcvr.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc: if simgsrc:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
else : else :
print(' Making region type', regtype, end=' ') print(' Making region type', regtype, end=' ')
(pos, temp) = self.findinDoc('paragraph',start,end) (pos, temp) = self.findinDoc(b'paragraph',start,end)
(pos2, temp) = self.findinDoc('span',start,end) (pos2, temp) = self.findinDoc(b'span',start,end)
if pos != -1 or pos2 != -1: if pos != -1 or pos2 != -1:
print(' a "text" region') print(' a "text" region')
orig_regtype = regtype orig_regtype = regtype
regtype = 'fixed' regtype = b'fixed'
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
if first_para_continued : if first_para_continued :
@ -764,23 +770,23 @@ class DocParser(object):
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not pclass: if not pclass:
if orig_regtype.endswith('.right') : pclass = 'cl-right' if orig_regtype.endswith(b'.right') : pclass = 'cl-right'
elif orig_regtype.endswith('.center') : pclass = 'cl-center' elif orig_regtype.endswith(b'.center') : pclass = 'cl-center'
elif orig_regtype.endswith('.left') : pclass = 'cl-left' elif orig_regtype.endswith(b'.left') : pclass = 'cl-left'
elif orig_regtype.endswith('.justify') : pclass = 'cl-justify' elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
if pclass and (ptype == 'full') and (len(pclass) >= 6): if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == b'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5' if pclass[3:6] == b'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6' if pclass[3:6] == b'h3-' : tag = 'h6'
hlst.append('<' + tag + ' class="' + pclass + '">') hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
hlst.append('</' + tag + '>') hlst.append('</' + tag + '>')
else : else :
hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
else : else :
print(' a "graphic" region') print(' a "graphic" region')
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
if simgsrc: if simgsrc:
hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

View file

@ -12,7 +12,7 @@ from struct import unpack
class PParser(object): class PParser(object):
def __init__(self, gd, flatxml, meta_array): def __init__(self, gd, flatxml, meta_array):
self.gd = gd self.gd = gd
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split(b'\n')
self.docSize = len(self.flatdoc) self.docSize = len(self.flatdoc)
self.temp = [] self.temp = []
@ -58,11 +58,11 @@ class PParser(object):
def lineinDoc(self, pos):
    """Split line ``pos`` of the flattened document into ``(name, argres)``.

    ``self.flatdoc`` holds bytes lines of the form ``name=value`` or just
    ``name``.  The line is split at the first ``=`` only, so a value may
    itself contain ``=``.  A line without ``=`` yields an empty value.

    Args:
        pos: index of the line in ``self.flatdoc``.

    Returns:
        A ``(name, argres)`` tuple of bytes objects.

    Raises:
        IndexError: if ``pos`` is outside ``0 <= pos < self.docSize``.
    """
    if not (0 <= pos < self.docSize):
        # There is no line to split; fail explicitly rather than falling
        # through with nothing to return.
        raise IndexError('line position %d outside document' % pos)
    # partition() gives (item, b'', b'') when there is no '=', which is
    # exactly the "name only, empty value" case.
    name, _sep, argres = self.flatdoc[pos].partition(b'=')
    return name, argres
# find tag in doc if within pos to end inclusive # find tag in doc if within pos to end inclusive
@ -75,11 +75,13 @@ class PParser(object):
foundat = -1 foundat = -1
for j in range(pos, end): for j in range(pos, end):
item = self.flatdoc[j] item = self.flatdoc[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argres) = item.split('=',1) (name, argres) = item.split(b'=',1)
else : else :
name = item name = item
argres = '' argres = b''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) : if name.endswith(tagpath) :
result = argres result = argres
foundat = j foundat = j
@ -103,9 +105,9 @@ class PParser(object):
cnt = len(self.flatdoc) cnt = len(self.flatdoc)
for j in range(cnt): for j in range(cnt):
item = self.flatdoc[j] item = self.flatdoc[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argt) = item.split('=') (name, argt) = item.split(b'=')
argres = argt.split('|') argres = argt.split(b'|')
else: else:
name = item name = item
argres = [] argres = []
@ -120,15 +122,17 @@ class PParser(object):
def getDataatPos(self, path, pos):
    """Return the integer values stored on line ``pos`` if its tag ends with ``path``.

    Lines in ``self.flatdoc`` are bytes of the form ``name=v1|v2|...`` or a
    bare ``name``.

    Args:
        path: tag suffix to match; a ``str`` is encoded to UTF-8 bytes first.
        pos: index of the line in ``self.flatdoc``.

    Returns:
        A list of ints (empty for a bare ``name`` line) when the line's
        name ends with ``path``; otherwise ``None``.

    Raises:
        ValueError: if the line matches but one of its values is not an
            integer.
    """
    if isinstance(path, str):
        path = path.encode('utf-8')
    # Split on the first '=' only, so a value containing '=' cannot break
    # the two-way unpack.
    name, sep, argt = self.flatdoc[pos].partition(b'=')
    if not name.endswith(path):
        # Check the tag before converting: a non-numeric value on a line for
        # some other tag must not raise here.
        return None
    if not sep:
        return []
    return [int(value) for value in argt.split(b'|')]
@ -138,12 +142,14 @@ class PParser(object):
cnt = len(self.temp) cnt = len(self.temp)
for j in range(cnt): for j in range(cnt):
item = self.temp[j] item = self.temp[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argt) = item.split('=') (name, argt) = item.split(b'=')
argres = argt.split('|') argres = argt.split(b'|')
else: else:
name = item name = item
argres = [] argres = []
if (isinstance(path,str)):
path = path.encode('utf-8')
if (name.endswith(path)): if (name.endswith(path)):
result = argres result = argres
self.temp.pop(j) self.temp.pop(j)

View file

@ -44,10 +44,10 @@ if inCalibre :
from calibre_plugins.dedrm import flatxml2svg from calibre_plugins.dedrm import flatxml2svg
from calibre_plugins.dedrm import stylexml2css from calibre_plugins.dedrm import stylexml2css
else : else :
from . import convert2xml import convert2xml
from . import flatxml2html import flatxml2html
from . import flatxml2svg import flatxml2svg
from . import stylexml2css import stylexml2css
# global switch # global switch
buildXML = False buildXML = False
@ -117,10 +117,10 @@ class Dictionary(object):
self.stable.append(self.escapestr(readString(self.fo))) self.stable.append(self.escapestr(readString(self.fo)))
self.pos = 0 self.pos = 0
def escapestr(self, str):
    """Return the bytes string ``str`` with ``&``, ``<``, ``>`` and ``=`` escaped.

    The replacements are ``&amp;``, ``&lt;``, ``&gt;`` and ``&#61;``.

    Args:
        str: a bytes object (the parameter name is kept for interface
            compatibility even though it shadows the builtin).

    Returns:
        A new bytes object with the four characters replaced.
    """
    # '&' must be handled first: every later replacement introduces an '&'
    # that must not be escaped a second time.
    replacements = (
        (b'&', b'&amp;'),
        (b'<', b'&lt;'),
        (b'>', b'&gt;'),
        (b'=', b'&#61;'),
    )
    for raw, entity in replacements:
        str = str.replace(raw, entity)
    return str
def lookup(self,val): def lookup(self,val):
if ((val >= 0) and (val < self.size)) : if ((val >= 0) and (val < self.size)) :
@ -138,7 +138,7 @@ class Dictionary(object):
class PageDimParser(object): class PageDimParser(object):
def __init__(self, flatxml): def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split(b'\n')
# find tag if within pos to end inclusive # find tag if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) : def findinDoc(self, tagpath, pos, end) :
result = None result = None
@ -151,8 +151,8 @@ class PageDimParser(object):
foundat = -1 foundat = -1
for j in range(pos, end): for j in range(pos, end):
item = docList[j] item = docList[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argres) = item.split('=') (name, argres) = item.split(b'=')
else : else :
name = item name = item
argres = '' argres = ''
@ -162,8 +162,8 @@ class PageDimParser(object):
break break
return foundat, result return foundat, result
def process(self):
    """Look up the ``page.h`` and ``page.w`` values of the document.

    Returns:
        A ``(height, width)`` tuple holding whatever ``self.findinDoc``
        returned as the value for each tag, or the string ``'-1'`` for a
        tag that was not found.
    """
    # NOTE(review): found values come from a bytes document while the
    # fallback is the str '-1'; kept as-is because callers outside this
    # view may compare against the str form - confirm before unifying.
    (_pos, sph) = self.findinDoc(b'page.h', 0, -1)
    (_pos, spw) = self.findinDoc(b'page.w', 0, -1)
    if sph is None:
        sph = '-1'
    if spw is None:
        spw = '-1'
    return sph, spw
@ -176,21 +176,21 @@ def getPageDim(flatxml):
class GParser(object): class GParser(object):
def __init__(self, flatxml): def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split(b'\n')
self.dpi = 1440 self.dpi = 1440
self.gh = self.getData('info.glyph.h') self.gh = self.getData(b'info.glyph.h')
self.gw = self.getData('info.glyph.w') self.gw = self.getData(b'info.glyph.w')
self.guse = self.getData('info.glyph.use') self.guse = self.getData(b'info.glyph.use')
if self.guse : if self.guse :
self.count = len(self.guse) self.count = len(self.guse)
else : else :
self.count = 0 self.count = 0
self.gvtx = self.getData('info.glyph.vtx') self.gvtx = self.getData(b'info.glyph.vtx')
self.glen = self.getData('info.glyph.len') self.glen = self.getData(b'info.glyph.len')
self.gdpi = self.getData('info.glyph.dpi') self.gdpi = self.getData(b'info.glyph.dpi')
self.vx = self.getData('info.vtx.x') self.vx = self.getData(b'info.vtx.x')
self.vy = self.getData('info.vtx.y') self.vy = self.getData(b'info.vtx.y')
self.vlen = self.getData('info.len.n') self.vlen = self.getData(b'info.len.n')
if self.vlen : if self.vlen :
self.glen.append(len(self.vlen)) self.glen.append(len(self.vlen))
elif self.glen: elif self.glen:
@ -204,9 +204,9 @@ class GParser(object):
cnt = len(self.flatdoc) cnt = len(self.flatdoc)
for j in range(cnt): for j in range(cnt):
item = self.flatdoc[j] item = self.flatdoc[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argt) = item.split('=') (name, argt) = item.split(b'=')
argres = argt.split('|') argres = argt.split(b'|')
else: else:
name = item name = item
argres = [] argres = []
@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage):
# now get the css info # now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw) cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
open(xname, 'wb').write(cssstr) open(xname, 'w').write(cssstr)
if buildXML: if buildXML:
xname = os.path.join(xmlDir, 'other0000.xml') xname = os.path.join(xmlDir, 'other0000.xml')
open(xname, 'wb').write(convert2xml.getXML(dict, otherFile)) open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage):
hlst.append('</body>\n</html>\n') hlst.append('</body>\n</html>\n')
htmlstr = "".join(hlst) htmlstr = "".join(hlst)
hlst = None hlst = None
open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr) open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr)
print(" ") print(" ")
print('Extracting Table of Contents from Amazon OCR') print('Extracting Table of Contents from Amazon OCR')
@ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage):
tlst.append('</body>\n') tlst.append('</body>\n')
tlst.append('</html>\n') tlst.append('</html>\n')
tochtml = "".join(tlst) tochtml = "".join(tlst)
open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml) open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml)
# now create index_svg.xhtml that points to all required files # now create index_svg.xhtml that points to all required files
@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage):
flst = [] flst = []
for page in pagelst: for page in pagelst:
flst.append(xmllst[page]) flst.append(xmllst[page])
flat_svg = "".join(flst) flat_svg = b"".join(flst)
flst=None flst=None
svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi) svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
if (raw) : if (raw) :
@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage):
slst.append('</body>\n</html>\n') slst.append('</body>\n</html>\n')
svgindex = "".join(slst) svgindex = "".join(slst)
slst = None slst = None
open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex) open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex)
print(" ") print(" ")
@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n') olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
# adding metadata # adding metadata
olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n') olst.append(' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
if 'GUID' in meta_array: if b'GUID' in meta_array:
olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n') olst.append(' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n')
if 'ASIN' in meta_array: if b'ASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n') olst.append(' <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n')
if 'oASIN' in meta_array: if b'oASIN' in meta_array:
olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n') olst.append(' <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n')
olst.append(' <dc:title>' + meta_array['Title'] + '</dc:title>\n') olst.append(' <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
olst.append(' <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n') olst.append(' <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n')
olst.append(' <dc:language>en</dc:language>\n') olst.append(' <dc:language>en</dc:language>\n')
olst.append(' <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n') olst.append(' <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n')
if isCover: if isCover:
olst.append(' <meta name="cover" content="bookcover"/>\n') olst.append(' <meta name="cover" content="bookcover"/>\n')
olst.append(' </metadata>\n') olst.append(' </metadata>\n')
@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage):
olst.append('</package>\n') olst.append('</package>\n')
opfstr = "".join(olst) opfstr = "".join(olst)
olst = None olst = None
open(opfname, 'wb').write(opfstr) open(opfname, 'w').write(opfstr)
print('Processing Complete') print('Processing Complete')

View file

@ -49,14 +49,15 @@ def SHA1(message):
# Encode the bytes in data with the characters in map # Encode the bytes in data with the characters in map
# data and map should be byte arrays
def encode(data, map):
    """Encode every byte of ``data`` as two symbols taken from ``map``.

    For each input byte ``value`` the output receives
    ``map[(value ^ 0x80) // len(map)]`` followed by
    ``map[value % len(map)]``.

    Args:
        data: bytes-like input; iterating it must yield ints.
        map: bytes-like alphabet; indexing it must yield ints.  (The name
            shadows the builtin but is kept for interface compatibility.)

    Returns:
        A ``bytes`` object twice as long as ``data``.

    Raises:
        IndexError: if ``map`` is too short for a computed quotient.
    """
    base = len(map)
    # Collect into a bytearray: repeated ``bytes +=`` copies the whole
    # result on every step.
    encoded = bytearray()
    for value in data:
        encoded.append(map[(value ^ 0x80) // base])
        encoded.append(map[value % base])
    return bytes(encoded)
# Hash the bytes in data and then encode the digest with the characters in map # Hash the bytes in data and then encode the digest with the characters in map
@ -117,7 +118,7 @@ def generatePidEncryptionTable() :
def generatePidSeed(table, dsn):
    """Fold the first four bytes of ``dsn`` through ``table`` into a seed.

    Starting from 0, each step XORs the next ``dsn`` byte with the running
    value, keeps the low 8 bits as an index into ``table``, and replaces
    the value with ``(value >> 8) ^ table[index]``.

    Args:
        table: sequence of ints with an entry for every 8-bit index used.
        dsn: bytes-like object; indexing it must yield ints.  Only the
            first four bytes are read.

    Returns:
        The resulting integer seed.

    Raises:
        IndexError: if ``dsn`` has fewer than four bytes.
    """
    value = 0
    for counter in range(4):
        # dsn[counter] is already an int for bytes/bytearray; no ord() needed.
        index = (dsn[counter] ^ value) & 0xFF
        value = (value >> 8) ^ table[index]
    return value
@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll):
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
index = 0 index = 0
for counter in range (0,nbRoll): for counter in range (0,nbRoll):
pid[index] = pid[index] ^ ord(dsn[counter]) pid[index] = pid[index] ^ dsn[counter]
index = (index+1) %8 index = (index+1) %8
for counter in range (0,8): for counter in range (0,8):
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7) index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
@ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase):
try: try:
# Get the kindle account token, if present # Get the kindle account token, if present
kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode() kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens'])
except KeyError: except KeyError:
kindleAccountToken="" kindleAccountToken=""
@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase):
try: try:
# Get the DSN token, if present # Get the DSN token, if present
DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode() DSN = bytearray.fromhex((kindleDatabase[1])['DSN'])
print("Got DSN key from database {0}".format(kindleDatabase[0])) print("Got DSN key from database {0}".format(kindleDatabase[0]))
except KeyError: except KeyError:
# See if we have the info to generate the DSN # See if we have the info to generate the DSN
try: try:
# Get the Mazama Random number # Get the Mazama Random number
MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode() MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber'])
#print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0]) #print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])
try: try:
# Get the SerialNumber token, if present # Get the SerialNumber token, if present
IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode() IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber'])
print("Got SerialNumber from database {0}".format(kindleDatabase[0])) print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
except KeyError: except KeyError:
# Get the IDString we added # Get the IDString we added
IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode() IDString = bytearray.fromhex((kindleDatabase[1])['IDString'])
try: try:
# Get the UsernameHash token, if present # Get the UsernameHash token, if present
encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode() encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash'])
print("Got UsernameHash from database {0}".format(kindleDatabase[0])) print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
except KeyError: except KeyError:
# Get the UserName we added # Get the UserName we added
UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode() UserName = bytearray.fromhex((kindleDatabase[1])['UserName'])
# encode it # encode it
encodedUsername = encodeHash(UserName,charMap1) encodedUsername = encodeHash(UserName,charMap1)
#print "encodedUsername",encodedUsername.encode('hex') #print "encodedUsername",encodedUsername.encode('hex')
@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase):
# Compute book PIDs # Compute book PIDs
# book pid # book pid
pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token) pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash) bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID) bookPID = checksumPid(bookPID)
pids.append(bookPID) pids.append(bookPID)
# variant 1 # variant 1
pidHash = SHA1(kindleAccountToken.encode()+rec209+token) pidHash = SHA1(kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash) bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID) bookPID = checksumPid(bookPID)
pids.append(bookPID) pids.append(bookPID)
# variant 2 # variant 2
pidHash = SHA1(DSN.encode()+rec209+token) pidHash = SHA1(DSN+rec209+token)
bookPID = encodePID(pidHash) bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID) bookPID = checksumPid(bookPID)
pids.append(bookPID) pids.append(bookPID)

View file

@ -7,7 +7,7 @@
from __future__ import print_function from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__version__ = "1.00" __version__ = "1.0"
# This is a python script. You need a Python interpreter to run it. # This is a python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for windows. # For example, ActiveState Python, which exists for windows.
@ -73,7 +73,7 @@ __version__ = "1.00"
# 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility # 0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
# 0.41 - Fixed potential unicode problem in command line calls # 0.41 - Fixed potential unicode problem in command line calls
# 0.42 - Added GPL v3 licence. updated/removed some print statements # 0.42 - Added GPL v3 licence. updated/removed some print statements
# 1.00 - Python 3 compatibility for calibre 5.0 # 1.0 - Python 3 compatibility for calibre 5.0
import sys import sys
import os import os
@ -330,7 +330,7 @@ class MobiBook:
} }
title = '' title = ''
codec = 'windows-1252' codec = 'windows-1252'
if self.magic == 'BOOKMOBI': if self.magic == b'BOOKMOBI':
if 503 in self.meta_array: if 503 in self.meta_array:
title = self.meta_array[503] title = self.meta_array[503]
else: else:

View file

@ -15,36 +15,36 @@ debug = False
class DocParser(object): class DocParser(object):
def __init__(self, flatxml, fontsize, ph, pw): def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split(b'\n')
self.fontsize = int(fontsize) self.fontsize = int(fontsize)
self.ph = int(ph) * 1.0 self.ph = int(ph) * 1.0
self.pw = int(pw) * 1.0 self.pw = int(pw) * 1.0
stags = { stags = {
'paragraph' : 'p', b'paragraph' : 'p',
'graphic' : '.graphic' b'graphic' : '.graphic'
} }
attr_val_map = { attr_val_map = {
'hang' : 'text-indent: ', b'hang' : 'text-indent: ',
'indent' : 'text-indent: ', b'indent' : 'text-indent: ',
'line-space' : 'line-height: ', b'line-space' : 'line-height: ',
'margin-bottom' : 'margin-bottom: ', b'margin-bottom' : 'margin-bottom: ',
'margin-left' : 'margin-left: ', b'margin-left' : 'margin-left: ',
'margin-right' : 'margin-right: ', b'margin-right' : 'margin-right: ',
'margin-top' : 'margin-top: ', b'margin-top' : 'margin-top: ',
'space-after' : 'padding-bottom: ', b'space-after' : 'padding-bottom: ',
} }
attr_str_map = { attr_str_map = {
'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
'align-left' : 'text-align: left;', b'align-left' : 'text-align: left;',
'align-right' : 'text-align: right;', b'align-right' : 'text-align: right;',
'align-justify' : 'text-align: justify;', b'align-justify' : 'text-align: justify;',
'display-inline' : 'display: inline;', b'display-inline' : 'display: inline;',
'pos-left' : 'text-align: left;', b'pos-left' : 'text-align: left;',
'pos-right' : 'text-align: right;', b'pos-right' : 'text-align: right;',
'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
} }
@ -60,11 +60,13 @@ class DocParser(object):
foundat = -1 foundat = -1
for j in range(pos, end): for j in range(pos, end):
item = docList[j] item = docList[j]
if item.find('=') >= 0: if item.find(b'=') >= 0:
(name, argres) = item.split('=',1) (name, argres) = item.split(b'=',1)
else : else :
name = item name = item
argres = '' argres = b''
if (isinstance(tagpath,str)):
tagpath = tagpath.encode('utf-8')
if name.endswith(tagpath) : if name.endswith(tagpath) :
result = argres result = argres
foundat = j foundat = j
@ -76,7 +78,7 @@ class DocParser(object):
def posinDoc(self, tagpath): def posinDoc(self, tagpath):
startpos = [] startpos = []
pos = 0 pos = 0
res = "" res = b""
while res != None : while res != None :
(foundpos, res) = self.findinDoc(tagpath, pos, -1) (foundpos, res) = self.findinDoc(tagpath, pos, -1)
if res != None : if res != None :
@ -87,11 +89,11 @@ class DocParser(object):
# returns a vector of integers for the tagpath # returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end, clean=False): def getData(self, tagpath, pos, end, clean=False):
if clean: if clean:
digits_only = re.compile(r'''([0-9]+)''') digits_only = re.compile(rb'''([0-9]+)''')
argres=[] argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end) (foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) : if (argt != None) and (len(argt) > 0) :
argList = argt.split('|') argList = argt.split(b'|')
for strval in argList: for strval in argList:
if clean: if clean:
m = re.search(digits_only, strval) m = re.search(digits_only, strval)
@ -109,7 +111,7 @@ class DocParser(object):
csspage += '.cl-justify { text-align: justify; }\n' csspage += '.cl-justify { text-align: justify; }\n'
# generate a list of each <style> starting point in the stylesheet # generate a list of each <style> starting point in the stylesheet
styleList= self.posinDoc('book.stylesheet.style') styleList= self.posinDoc(b'book.stylesheet.style')
stylecnt = len(styleList) stylecnt = len(styleList)
styleList.append(-1) styleList.append(-1)
@ -121,30 +123,30 @@ class DocParser(object):
start = styleList[j] start = styleList[j]
end = styleList[j+1] end = styleList[j+1]
(pos, tag) = self.findinDoc('style._tag',start,end) (pos, tag) = self.findinDoc(b'style._tag',start,end)
if tag == None : if tag == None :
(pos, tag) = self.findinDoc('style.type',start,end) (pos, tag) = self.findinDoc(b'style.type',start,end)
# Is this something we know how to convert to css # Is this something we know how to convert to css
if tag in self.stags : if tag in self.stags :
# get the style class # get the style class
(pos, sclass) = self.findinDoc('style.class',start,end) (pos, sclass) = self.findinDoc(b'style.class',start,end)
if sclass != None: if sclass != None:
sclass = sclass.replace(' ','-') sclass = sclass.replace(b' ',b'-')
sclass = '.cl-' + sclass.lower() sclass = b'.cl-' + sclass.lower()
else : else :
sclass = '' sclass = b''
if debug: print('sclass', sclass) if debug: print('sclass', sclass)
# check for any "after class" specifiers # check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end) (pos, aftclass) = self.findinDoc(b'style._after_class',start,end)
if aftclass != None: if aftclass != None:
aftclass = aftclass.replace(' ','-') aftclass = aftclass.replace(b' ',b'-')
aftclass = '.cl-' + aftclass.lower() aftclass = b'.cl-' + aftclass.lower()
else : else :
aftclass = '' aftclass = b''
if debug: print('aftclass', aftclass) if debug: print('aftclass', aftclass)
@ -152,34 +154,37 @@ class DocParser(object):
while True : while True :
(pos1, attr) = self.findinDoc('style.rule.attr', start, end) (pos1, attr) = self.findinDoc(b'style.rule.attr', start, end)
(pos2, val) = self.findinDoc('style.rule.value', start, end) (pos2, val) = self.findinDoc(b'style.rule.value', start, end)
if debug: print('attr', attr) if debug: print('attr', attr)
if debug: print('val', val) if debug: print('val', val)
if attr == None : break if attr == None : break
if (attr == 'display') or (attr == 'pos') or (attr == 'align'): if (attr == b'display') or (attr == b'pos') or (attr == b'align'):
# handle text based attributess # handle text based attributess
attr = attr + '-' + val attr = attr + b'-' + val
if attr in self.attr_str_map : if attr in self.attr_str_map :
cssargs[attr] = (self.attr_str_map[attr], '') cssargs[attr] = (self.attr_str_map[attr], b'')
else : else :
# handle value based attributes # handle value based attributes
if attr in self.attr_val_map : if attr in self.attr_val_map :
name = self.attr_val_map[attr] name = self.attr_val_map[attr]
if attr in ('margin-bottom', 'margin-top', 'space-after') : if attr in (b'margin-bottom', b'margin-top', b'space-after') :
scale = self.ph scale = self.ph
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') : elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') :
scale = self.pw scale = self.pw
elif attr == 'line-space': elif attr == b'line-space':
scale = self.fontsize * 2.0 scale = self.fontsize * 2.0
else:
print("Scale not defined!")
scale = 1.0
if val == "": if val == "":
val = 0 val = 0
if not ((attr == 'hang') and (int(val) == 0)): if not ((attr == b'hang') and (int(val) == 0)):
try: try:
f = float(val) f = float(val)
except: except:
@ -198,32 +203,32 @@ class DocParser(object):
if debug: print('keeping style') if debug: print('keeping style')
# make sure line-space does not go below 100% or above 300% since # make sure line-space does not go below 100% or above 300% since
# it can be wacky in some styles # it can be wacky in some styles
if 'line-space' in cssargs: if b'line-space' in cssargs:
seg = cssargs['line-space'][0] seg = cssargs[b'line-space'][0]
val = cssargs['line-space'][1] val = cssargs[b'line-space'][1]
if val < 1.0: val = 1.0 if val < 1.0: val = 1.0
if val > 3.0: val = 3.0 if val > 3.0: val = 3.0
del cssargs['line-space'] del cssargs[b'line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val) cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val)
# handle modifications for css style hanging indents # handle modifications for css style hanging indents
if 'hang' in cssargs: if b'hang' in cssargs:
hseg = cssargs['hang'][0] hseg = cssargs[b'hang'][0]
hval = cssargs['hang'][1] hval = cssargs[b'hang'][1]
del cssargs['hang'] del cssargs[b'hang']
cssargs['hang'] = (self.attr_val_map['hang'], -hval) cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval)
mval = 0 mval = 0
mseg = 'margin-left: ' mseg = 'margin-left: '
mval = hval mval = hval
if 'margin-left' in cssargs: if b'margin-left' in cssargs:
mseg = cssargs['margin-left'][0] mseg = cssargs[b'margin-left'][0]
mval = cssargs['margin-left'][1] mval = cssargs[b'margin-left'][1]
if mval < 0: mval = 0 if mval < 0: mval = 0
mval = hval + mval mval = hval + mval
cssargs['margin-left'] = (mseg, mval) cssargs[b'margin-left'] = (mseg, mval)
if 'indent' in cssargs: if b'indent' in cssargs:
del cssargs['indent'] del cssargs[b'indent']
cssline = sclass + ' { ' cssline = sclass + ' { '
for key in iter(cssargs): for key in iter(cssargs):

View file

@ -173,7 +173,7 @@ def decryptRecord(data,PID):
def decryptDkeyRecord(data,PID): def decryptDkeyRecord(data,PID):
record = decryptRecord(data,PID) record = decryptRecord(data,PID)
fields = unpack('3sB8sB8s3s',record) fields = unpack('3sB8sB8s3s',record)
if fields[0] != 'PID' or fields[5] != 'pid' : if fields[0] != b'PID' or fields[5] != b'pid' :
raise DrmException("Didn't find PID magic numbers in record") raise DrmException("Didn't find PID magic numbers in record")
elif fields[1] != 8 or fields[3] != 8 : elif fields[1] != 8 or fields[3] != 8 :
raise DrmException("Record didn't contain correct length fields") raise DrmException("Record didn't contain correct length fields")
@ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID):
# Decrypt all dkey records (contain the book PID) # Decrypt all dkey records (contain the book PID)
def decryptDkeyRecords(data,PID): def decryptDkeyRecords(data,PID):
nbKeyRecords = ord(data[0]) nbKeyRecords = data[0]
records = [] records = []
data = data[1:] data = data[1:]
for i in range (0,nbKeyRecords): for i in range (0,nbKeyRecords):
length = ord(data[0]) length = data[0]
try: try:
key = decryptDkeyRecord(data[1:length+1],PID) key = decryptDkeyRecord(data[1:length+1],PID)
records.append(key) records.append(key)
@ -209,7 +209,7 @@ class TopazBook:
self.bookMetadata = {} self.bookMetadata = {}
self.bookKey = None self.bookKey = None
magic = unpack('4s',self.fo.read(4))[0] magic = unpack('4s',self.fo.read(4))[0]
if magic != 'TPZ0': if magic != b'TPZ0':
raise DrmException("Parse Error : Invalid Header, not a Topaz file") raise DrmException("Parse Error : Invalid Header, not a Topaz file")
self.parseTopazHeaders() self.parseTopazHeaders()
self.parseMetadata() self.parseMetadata()
@ -244,9 +244,9 @@ class TopazBook:
def parseMetadata(self): def parseMetadata(self):
# Parse the metadata record from the book payload and return a list of [key,values] # Parse the metadata record from the book payload and return a list of [key,values]
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0]) self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0])
tag = bookReadString(self.fo) tag = bookReadString(self.fo)
if tag != 'metadata' : if tag != b'metadata' :
raise DrmException("Parse Error : Record Names Don't Match") raise DrmException("Parse Error : Record Names Don't Match")
flags = ord(self.fo.read(1)) flags = ord(self.fo.read(1))
nbRecords = ord(self.fo.read(1)) nbRecords = ord(self.fo.read(1))
@ -260,18 +260,18 @@ class TopazBook:
return self.bookMetadata return self.bookMetadata
def getPIDMetaInfo(self): def getPIDMetaInfo(self):
keysRecord = self.bookMetadata.get('keys','') keysRecord = self.bookMetadata.get(b'keys',b'')
keysRecordRecord = '' keysRecordRecord = b''
if keysRecord != '': if keysRecord != b'':
keylst = keysRecord.split(',') keylst = keysRecord.split(b',')
for keyval in keylst: for keyval in keylst:
keysRecordRecord += self.bookMetadata.get(keyval,'') keysRecordRecord += self.bookMetadata.get(keyval,b'')
return keysRecord, keysRecordRecord return keysRecord, keysRecordRecord
def getBookTitle(self): def getBookTitle(self):
title = '' title = b''
if 'Title' in self.bookMetadata: if b'Title' in self.bookMetadata:
title = self.bookMetadata['Title'] title = self.bookMetadata[b'Title']
return title.decode('utf-8') return title.decode('utf-8')
def setBookKey(self, key): def setBookKey(self, key):
@ -323,7 +323,7 @@ class TopazBook:
raw = 0 raw = 0
fixedimage=True fixedimage=True
try: try:
keydata = self.getBookPayloadRecord('dkey', 0) keydata = self.getBookPayloadRecord(b'dkey', 0)
except DrmException as e: except DrmException as e:
print("no dkey record found, book may not be encrypted") print("no dkey record found, book may not be encrypted")
print("attempting to extrct files without a book key") print("attempting to extrct files without a book key")
@ -354,7 +354,7 @@ class TopazBook:
pass pass
else: else:
bookKey = bookKeys[0] bookKey = bookKeys[0]
print("Book Key Found! ({0})".format(bookKey.encode('hex'))) print("Book Key Found! ({0})".format(bookKey.hex()))
break break
if not bookKey: if not bookKey:
@ -396,26 +396,26 @@ class TopazBook:
outdir = self.outdir outdir = self.outdir
for headerRecord in self.bookHeaderRecords: for headerRecord in self.bookHeaderRecords:
name = headerRecord name = headerRecord
if name != 'dkey': if name != b'dkey':
ext = ".dat" ext = ".dat"
if name == 'img': ext = ".jpg" if name == b'img': ext = ".jpg"
if name == 'color' : ext = ".jpg" if name == b'color' : ext = ".jpg"
print("Processing Section: {0}\n. . .".format(name), end=' ') print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ')
for index in range (0,len(self.bookHeaderRecords[name])) : for index in range (0,len(self.bookHeaderRecords[name])) :
fname = "{0}{1:04d}{2}".format(name,index,ext) fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext)
destdir = outdir destdir = outdir
if name == 'img': if name == b'img':
destdir = os.path.join(outdir,"img") destdir = os.path.join(outdir,"img")
if name == 'color': if name == b'color':
destdir = os.path.join(outdir,"color_img") destdir = os.path.join(outdir,"color_img")
if name == 'page': if name == b'page':
destdir = os.path.join(outdir,"page") destdir = os.path.join(outdir,"page")
if name == 'glyphs': if name == b'glyphs':
destdir = os.path.join(outdir,"glyphs") destdir = os.path.join(outdir,"glyphs")
outputFile = os.path.join(destdir,fname) outputFile = os.path.join(destdir,fname)
print(".", end=' ') print(".", end=' ')
record = self.getBookPayloadRecord(name,index) record = self.getBookPayloadRecord(name,index)
if record != '': if record != b'':
open(outputFile, 'wb').write(record) open(outputFile, 'wb').write(record)
print(" ") print(" ")