mirror of
https://github.com/apprenticeharper/DeDRM_tools
synced 2025-01-15 03:41:06 +01:00
topazscripts 2.0
This commit is contained in:
parent
24f001c61e
commit
3b4f1fe587
12 changed files with 282 additions and 172 deletions
|
@ -1,17 +1,29 @@
|
||||||
Changes in version 1.8
|
Changes in version 2.0
|
||||||
|
|
||||||
|
- gensvg.py now accepts two options
|
||||||
|
-x : output browseable XHTML+SVG pages (default)
|
||||||
|
-r : output raw SVG images (useful for later conversion to pdf)
|
||||||
|
|
||||||
|
- flatxml2html.py now understands page.groups of type graphic
|
||||||
|
and handles vertical regions as svg images
|
||||||
|
|
||||||
|
- genhtml.py now accepts an option
|
||||||
|
--fixed-image : which will force the conversion
|
||||||
|
of all fixed regions to svg images
|
||||||
|
|
||||||
|
- minor bug fixes and html conversion improvements
|
||||||
|
|
||||||
|
|
||||||
|
Changes in version 1.8
|
||||||
- gensvg.py now builds wonderful xhtml pages with embedded svg
|
- gensvg.py now builds wonderful xhtml pages with embedded svg
|
||||||
that can be easily paged through as if reading a book!
|
that can be easily paged through as if reading a book!
|
||||||
(tested in Safari for Mac and Win and Firefox)
|
(tested in Safari for Mac and Win and Firefox)
|
||||||
(requires javascript to be enabled)
|
(requires javascript to be enabled)
|
||||||
|
|
||||||
- genhtml.py now REQUIRES that gensvg.py be run FIRST
|
- genhtml.py now REQUIRES that gensvg.py be run FIRST
|
||||||
this allows creation of images on the fly from glyphs
|
this allows creation of images on the fly from glyphs
|
||||||
|
|
||||||
- genhtml.py now automatically makes tables of words into svg
|
- genhtml.py now automatically makes tables of words into svg
|
||||||
based images and will handle glyph based ornate first
|
based images and will handle glyph based ornate first
|
||||||
letters of words
|
letters of words
|
||||||
|
|
||||||
- cmbtc_dump_mac_linux.py has been renamed to be
|
- cmbtc_dump_mac_linux.py has been renamed to be
|
||||||
cmbtc_dump_nonK4PC.py to make it clearer
|
cmbtc_dump_nonK4PC.py to make it clearer
|
||||||
when it needs to be used
|
when it needs to be used
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# For use in Topaz Scripts version 1.8
|
# For use in Topaz Scripts version 2.0
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import csv
|
import csv
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import csv
|
import csv
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import csv
|
import csv
|
||||||
|
@ -13,7 +13,7 @@ from struct import unpack
|
||||||
|
|
||||||
|
|
||||||
class DocParser(object):
|
class DocParser(object):
|
||||||
def __init__(self, flatxml, classlst, fileid, bookDir):
|
def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage):
|
||||||
self.id = os.path.basename(fileid).replace('.dat','')
|
self.id = os.path.basename(fileid).replace('.dat','')
|
||||||
self.svgcount = 0
|
self.svgcount = 0
|
||||||
self.docList = flatxml.split('\n')
|
self.docList = flatxml.split('\n')
|
||||||
|
@ -28,6 +28,7 @@ class DocParser(object):
|
||||||
# remove the leading period from the css name
|
# remove the leading period from the css name
|
||||||
cname = pclass[1:]
|
cname = pclass[1:]
|
||||||
self.classList[cname] = True
|
self.classList[cname] = True
|
||||||
|
self.fixedimage = fixedimage
|
||||||
self.ocrtext = []
|
self.ocrtext = []
|
||||||
self.link_id = []
|
self.link_id = []
|
||||||
self.link_title = []
|
self.link_title = []
|
||||||
|
@ -63,7 +64,7 @@ class DocParser(object):
|
||||||
imgname = self.id + '_%04d.svg' % self.svgcount
|
imgname = self.id + '_%04d.svg' % self.svgcount
|
||||||
imgfile = os.path.join(imgDir,imgname)
|
imgfile = os.path.join(imgDir,imgname)
|
||||||
|
|
||||||
# build hash table of glyph paths keyed by glyph id
|
# build hashtable of glyph paths keyed by glyph id
|
||||||
if self.numPaths == 0:
|
if self.numPaths == 0:
|
||||||
gfile = open(glyfile, 'r')
|
gfile = open(glyfile, 'r')
|
||||||
while True:
|
while True:
|
||||||
|
@ -194,15 +195,9 @@ class DocParser(object):
|
||||||
return argres
|
return argres
|
||||||
|
|
||||||
|
|
||||||
|
# get the class
|
||||||
# build a description of the paragraph
|
def getClass(self, pclass):
|
||||||
def getParaDescription(self, start, end):
|
nclass = pclass
|
||||||
|
|
||||||
result = []
|
|
||||||
|
|
||||||
# paragraph
|
|
||||||
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
|
|
||||||
|
|
||||||
# class names are an issue given topaz may start them with numerals (not allowed),
|
# class names are an issue given topaz may start them with numerals (not allowed),
|
||||||
# use a mix of cases (which cause some browsers problems), and actually
|
# use a mix of cases (which cause some browsers problems), and actually
|
||||||
# attach numbers after "_reclustered*" to the end to deal with classes that inherit
|
# attach numbers after "_reclustered*" to the end to deal with classes that inherit
|
||||||
|
@ -212,17 +207,85 @@ class DocParser(object):
|
||||||
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
|
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
|
||||||
# that exists in the stylesheet first, and then adding this specific class
|
# that exists in the stylesheet first, and then adding this specific class
|
||||||
# after
|
# after
|
||||||
if pclass != None :
|
if nclass != None :
|
||||||
classres = ''
|
classres = ''
|
||||||
pclass = pclass.lower()
|
nclass = nclass.lower()
|
||||||
pclass = 'cl-' + pclass
|
nclass = 'cl-' + nclass
|
||||||
p = pclass.find('_')
|
baseclass = ''
|
||||||
if p > 0 :
|
# graphic is the base class for captions
|
||||||
baseclass = pclass[0:p]
|
if nclass.find('cl-cap-') >=0 :
|
||||||
if baseclass in self.classList:
|
classres = 'graphic' + ' '
|
||||||
classres += baseclass + ' '
|
else :
|
||||||
classres += pclass
|
# strip to find baseclass
|
||||||
pclass = classres
|
p = nclass.find('_')
|
||||||
|
if p > 0 :
|
||||||
|
baseclass = nclass[0:p]
|
||||||
|
if baseclass in self.classList:
|
||||||
|
classres += baseclass + ' '
|
||||||
|
classres += nclass
|
||||||
|
nclass = classres
|
||||||
|
return nclass
|
||||||
|
|
||||||
|
|
||||||
|
# develop a sorted description of the starting positions of
|
||||||
|
# groups and regions on the page, as well as the page type
|
||||||
|
def PageDescription(self):
|
||||||
|
|
||||||
|
def compare(x, y):
|
||||||
|
(xtype, xval) = x
|
||||||
|
(ytype, yval) = y
|
||||||
|
if xval > yval:
|
||||||
|
return 1
|
||||||
|
if xval == yval:
|
||||||
|
return 0
|
||||||
|
return -1
|
||||||
|
|
||||||
|
result = []
|
||||||
|
(pos, pagetype) = self.findinDoc('page.type',0,-1)
|
||||||
|
|
||||||
|
groupList = self.posinDoc('page.group')
|
||||||
|
groupregionList = self.posinDoc('page.group.region')
|
||||||
|
pageregionList = self.posinDoc('page.region')
|
||||||
|
# integrate into one list
|
||||||
|
for j in groupList:
|
||||||
|
result.append(('grpbeg',j))
|
||||||
|
for j in groupregionList:
|
||||||
|
result.append(('gregion',j))
|
||||||
|
for j in pageregionList:
|
||||||
|
result.append(('pregion',j))
|
||||||
|
result.sort(compare)
|
||||||
|
|
||||||
|
# insert group end and page end indicators
|
||||||
|
inGroup = False
|
||||||
|
j = 0
|
||||||
|
while True:
|
||||||
|
if j == len(result): break
|
||||||
|
rtype = result[j][0]
|
||||||
|
rval = result[j][1]
|
||||||
|
if not inGroup and (rtype == 'grpbeg') :
|
||||||
|
inGroup = True
|
||||||
|
j = j + 1
|
||||||
|
elif inGroup and (rtype in ('grpbeg', 'pregion')):
|
||||||
|
result.insert(j,('grpend',rval))
|
||||||
|
inGroup = False
|
||||||
|
else:
|
||||||
|
j = j + 1
|
||||||
|
if inGroup:
|
||||||
|
result.append(('grpend',-1))
|
||||||
|
result.append(('pageend', -1))
|
||||||
|
return pagetype, result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# build a description of the paragraph
|
||||||
|
def getParaDescription(self, start, end, regtype):
|
||||||
|
|
||||||
|
result = []
|
||||||
|
|
||||||
|
# paragraph
|
||||||
|
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
|
||||||
|
|
||||||
|
pclass = self.getClass(pclass)
|
||||||
|
|
||||||
# build up a description of the paragraph in result and return it
|
# build up a description of the paragraph in result and return it
|
||||||
# first check for the basic - all words paragraph
|
# first check for the basic - all words paragraph
|
||||||
|
@ -231,13 +294,49 @@ class DocParser(object):
|
||||||
if (sfirst != None) and (slast != None) :
|
if (sfirst != None) and (slast != None) :
|
||||||
first = int(sfirst)
|
first = int(sfirst)
|
||||||
last = int(slast)
|
last = int(slast)
|
||||||
for wordnum in xrange(first, last):
|
|
||||||
result.append(('ocr', wordnum))
|
makeImage = (regtype == 'vertical') or (regtype == 'table')
|
||||||
|
if self.fixedimage:
|
||||||
|
makeImage = makeImage or (regtype == 'fixed')
|
||||||
|
|
||||||
|
if (pclass != None):
|
||||||
|
makeImage = makeImage or (pclass.find('.inverted') >= 0)
|
||||||
|
if self.fixedimage :
|
||||||
|
makeImage = makeImage or (pclass.find('cl-f-') >= 0)
|
||||||
|
|
||||||
|
if not makeImage :
|
||||||
|
# standard all word paragraph
|
||||||
|
for wordnum in xrange(first, last):
|
||||||
|
result.append(('ocr', wordnum))
|
||||||
|
return pclass, result
|
||||||
|
|
||||||
|
# convert paragraph to svg image
|
||||||
|
# translate first and last word into first and last glyphs
|
||||||
|
# and generate inline image and include it
|
||||||
|
glyphList = []
|
||||||
|
firstglyphList = self.getData('word.firstGlyph',0,-1)
|
||||||
|
gidList = self.getData('info.glyph.glyphID',0,-1)
|
||||||
|
firstGlyph = firstglyphList[first]
|
||||||
|
if last < len(firstglyphList):
|
||||||
|
lastGlyph = firstglyphList[last]
|
||||||
|
else :
|
||||||
|
lastGlyph = len(gidList)
|
||||||
|
for glyphnum in xrange(firstGlyph, lastGlyph):
|
||||||
|
glyphList.append(glyphnum)
|
||||||
|
# include any extratokens if they exist
|
||||||
|
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
|
||||||
|
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
|
||||||
|
if (sfg != None) and (slg != None):
|
||||||
|
for glyphnum in xrange(int(sfg), int(slg)):
|
||||||
|
glyphList.append(glyphnum)
|
||||||
|
num = self.svgcount
|
||||||
|
self.glyphs_to_image(glyphList)
|
||||||
|
self.svgcount += 1
|
||||||
|
result.append(('svg', num))
|
||||||
return pclass, result
|
return pclass, result
|
||||||
|
|
||||||
# this type of paragrph may be made up of multiple _spans, inline
|
# this type of paragraph may be made up of multiple spans, inline
|
||||||
# word monograms (images) and words with semantic meaning
|
# word monograms (images), and words with semantic meaning,
|
||||||
# and now a new type "span" versus the old "_span"
|
|
||||||
# plus glyphs used to form starting letter of first word
|
# plus glyphs used to form starting letter of first word
|
||||||
|
|
||||||
# need to parse this type line by line
|
# need to parse this type line by line
|
||||||
|
@ -252,6 +351,7 @@ class DocParser(object):
|
||||||
|
|
||||||
(name, argres) = self.lineinDoc(line)
|
(name, argres) = self.lineinDoc(line)
|
||||||
|
|
||||||
|
# handle both span and _span
|
||||||
if name.endswith('span.firstWord') :
|
if name.endswith('span.firstWord') :
|
||||||
first = int(argres)
|
first = int(argres)
|
||||||
(name, argres) = self.lineinDoc(line+1)
|
(name, argres) = self.lineinDoc(line+1)
|
||||||
|
@ -422,148 +522,78 @@ class DocParser(object):
|
||||||
else:
|
else:
|
||||||
self.link_title.append('')
|
self.link_title.append('')
|
||||||
|
|
||||||
|
# get a description of the starting points of the regions
|
||||||
# get page type
|
# and groups on the page
|
||||||
(pos, pagetype) = self.findinDoc('page.type',0,-1)
|
(pagetype, pageDesc) = self.PageDescription()
|
||||||
|
regcnt = len(pageDesc) - 1
|
||||||
|
|
||||||
# generate a list of each region starting point
|
|
||||||
# each region has one paragraph,, or one image, or one chapterheading
|
|
||||||
|
|
||||||
regionList= self.posinDoc('region')
|
|
||||||
regcnt = len(regionList)
|
|
||||||
regionList.append(-1)
|
|
||||||
|
|
||||||
anchorSet = False
|
anchorSet = False
|
||||||
breakSet = False
|
breakSet = False
|
||||||
|
inGroup = False
|
||||||
|
|
||||||
# process each region tag and convert what you can to html
|
# process each region on the page and convert what you can to html
|
||||||
|
|
||||||
for j in xrange(regcnt):
|
for j in xrange(regcnt):
|
||||||
|
|
||||||
start = regionList[j]
|
(etype, start) = pageDesc[j]
|
||||||
end = regionList[j+1]
|
(ntype, end) = pageDesc[j+1]
|
||||||
|
|
||||||
(pos, regtype) = self.findinDoc('region.type',start,end)
|
|
||||||
|
|
||||||
# set anchor for link target on this page
|
# set anchor for link target on this page
|
||||||
if not anchorSet and not first_para_continued:
|
if not anchorSet and not first_para_continued:
|
||||||
htmlpage += '<div style="visibility: hidden; height: 0; width: 0;" id="' + self.id + '" title="pagetype_' + pagetype + '"></div>\n'
|
htmlpage += '<div style="visibility: hidden; height: 0; width: 0;" id="'
|
||||||
|
htmlpage += self.id + '" title="pagetype_' + pagetype + '"></div>\n'
|
||||||
anchorSet = True
|
anchorSet = True
|
||||||
|
|
||||||
if regtype == 'graphic' :
|
# handle groups of graphics with text captions
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
if (etype == 'grpbeg'):
|
||||||
if simgsrc:
|
(pos, grptype) = self.findinDoc('group.type', start, end)
|
||||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
if grptype != None:
|
||||||
|
if grptype == 'graphic':
|
||||||
|
gcstr = ' class="' + grptype + '"'
|
||||||
|
htmlpage += '<div' + gcstr + '>'
|
||||||
|
inGroup = True
|
||||||
|
|
||||||
|
elif (etype == 'grpend'):
|
||||||
|
if inGroup:
|
||||||
|
htmlpage += '</div>\n'
|
||||||
|
inGroup = False
|
||||||
|
|
||||||
elif regtype == 'chapterheading' :
|
else:
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
(pos, regtype) = self.findinDoc('region.type',start,end)
|
||||||
if not breakSet:
|
|
||||||
htmlpage += '<div style="page-break-after: always;"> </div>\n'
|
|
||||||
breakSet = True
|
|
||||||
tag = 'h1'
|
|
||||||
if pclass and (len(pclass) >= 7):
|
|
||||||
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
|
||||||
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
|
||||||
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
|
||||||
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
|
||||||
else:
|
|
||||||
htmlpage += '<' + tag + '>'
|
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
|
|
||||||
htmlpage += '</' + tag + '>'
|
|
||||||
|
|
||||||
|
if regtype == 'graphic' :
|
||||||
|
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||||
|
if simgsrc:
|
||||||
|
if inGroup:
|
||||||
|
htmlpage += '<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)
|
||||||
|
else:
|
||||||
|
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||||
|
|
||||||
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
|
elif regtype == 'chapterheading' :
|
||||||
ptype = 'full'
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
# check to see if this is a continution from the previous page
|
if not breakSet:
|
||||||
if first_para_continued :
|
htmlpage += '<div style="page-break-after: always;"> </div>\n'
|
||||||
ptype = 'end'
|
breakSet = True
|
||||||
first_para_continued = False
|
tag = 'h1'
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
if pclass and (len(pclass) >= 7):
|
||||||
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
|
if pclass[3:7] == 'ch1-' : tag = 'h1'
|
||||||
tag = 'p'
|
if pclass[3:7] == 'ch2-' : tag = 'h2'
|
||||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
if pclass[3:7] == 'ch3-' : tag = 'h3'
|
||||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
||||||
if pclass[3:6] == 'h3-' : tag = 'h6'
|
else:
|
||||||
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
htmlpage += '<' + tag + '>'
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
|
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
|
||||||
htmlpage += '</' + tag + '>'
|
htmlpage += '</' + tag + '>'
|
||||||
else :
|
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
|
||||||
|
|
||||||
|
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
|
||||||
elif (regtype == 'tocentry') :
|
|
||||||
ptype = 'full'
|
|
||||||
if first_para_continued :
|
|
||||||
ptype = 'end'
|
|
||||||
first_para_continued = False
|
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
|
||||||
|
|
||||||
|
|
||||||
elif (regtype == 'vertical') :
|
|
||||||
ptype = 'full'
|
|
||||||
if first_para_continued :
|
|
||||||
ptype = 'end'
|
|
||||||
first_para_continued = False
|
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
|
||||||
|
|
||||||
|
|
||||||
elif (regtype == 'table') :
|
|
||||||
# translate first and last word into first and last glyphs
|
|
||||||
# and generate table as an image and include a link to it
|
|
||||||
glyphList = []
|
|
||||||
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
|
|
||||||
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
|
|
||||||
firstglyphList = self.getData('word.firstGlyph',0,-1)
|
|
||||||
gidList = self.getData('info.glyph.glyphID',0,-1)
|
|
||||||
if (sfirst != None) and (slast != None) :
|
|
||||||
first = int(sfirst)
|
|
||||||
last = int(slast)
|
|
||||||
firstGlyph = firstglyphList[first]
|
|
||||||
if last < len(firstglyphList):
|
|
||||||
lastGlyph = firstglyphList[last]
|
|
||||||
else :
|
|
||||||
lastGlyph = len(gidList)
|
|
||||||
for glyphnum in xrange(firstGlyph, lastGlyph):
|
|
||||||
glyphList.append(glyphnum)
|
|
||||||
num = self.svgcount
|
|
||||||
self.glyphs_to_image(glyphList)
|
|
||||||
self.svgcount += 1
|
|
||||||
htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
|
|
||||||
else :
|
|
||||||
ptype = 'full'
|
|
||||||
if first_para_continued :
|
|
||||||
ptype = 'end'
|
|
||||||
first_para_continued = False
|
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
|
||||||
print " "
|
|
||||||
print "Warning: - Table Conversions are notoriously poor"
|
|
||||||
print " Strongly recommend taking a screen capture image of the "
|
|
||||||
print " table in %s.svg and using it to replace this attempt at a table" % self.id
|
|
||||||
print " "
|
|
||||||
|
|
||||||
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
|
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
|
||||||
if simgsrc:
|
|
||||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
|
||||||
|
|
||||||
else :
|
|
||||||
print 'Warning: region type', regtype
|
|
||||||
(pos, temp) = self.findinDoc('paragraph',start,end)
|
|
||||||
if pos != -1:
|
|
||||||
print ' is a "text" region'
|
|
||||||
regtype = 'fixed'
|
|
||||||
ptype = 'full'
|
ptype = 'full'
|
||||||
# check to see if this is a continution from the previous page
|
# check to see if this is a continution from the previous page
|
||||||
if first_para_continued :
|
if first_para_continued :
|
||||||
ptype = 'end'
|
ptype = 'end'
|
||||||
first_para_continued = False
|
first_para_continued = False
|
||||||
(pclass, pdesc) = self.getParaDescription(start,end)
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
if pclass and (ptype == 'full') and (len(pclass) >= 6):
|
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
|
||||||
tag = 'p'
|
tag = 'p'
|
||||||
if pclass[3:6] == 'h1-' : tag = 'h4'
|
if pclass[3:6] == 'h1-' : tag = 'h4'
|
||||||
if pclass[3:6] == 'h2-' : tag = 'h5'
|
if pclass[3:6] == 'h2-' : tag = 'h5'
|
||||||
|
@ -573,12 +603,60 @@ class DocParser(object):
|
||||||
htmlpage += '</' + tag + '>'
|
htmlpage += '</' + tag + '>'
|
||||||
else :
|
else :
|
||||||
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||||
else :
|
|
||||||
print ' is a "graphic" region'
|
elif (regtype == 'tocentry') :
|
||||||
|
ptype = 'full'
|
||||||
|
if first_para_continued :
|
||||||
|
ptype = 'end'
|
||||||
|
first_para_continued = False
|
||||||
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
|
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||||
|
|
||||||
|
|
||||||
|
elif (regtype == 'vertical') or (regtype == 'table') :
|
||||||
|
ptype = 'full'
|
||||||
|
if inGroup:
|
||||||
|
ptype = 'middle'
|
||||||
|
if first_para_continued :
|
||||||
|
ptype = 'end'
|
||||||
|
first_para_continued = False
|
||||||
|
(pclass, pdesc) = self.getParaDescription(start, end, regtype)
|
||||||
|
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||||
|
|
||||||
|
|
||||||
|
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
|
||||||
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||||
if simgsrc:
|
if simgsrc:
|
||||||
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||||
|
|
||||||
|
else :
|
||||||
|
print 'Warning: region type', regtype
|
||||||
|
(pos, temp) = self.findinDoc('paragraph',start,end)
|
||||||
|
if pos != -1:
|
||||||
|
print ' is a "text" region'
|
||||||
|
regtype = 'fixed'
|
||||||
|
ptype = 'full'
|
||||||
|
# check to see if this is a continution from the previous page
|
||||||
|
if first_para_continued :
|
||||||
|
ptype = 'end'
|
||||||
|
first_para_continued = False
|
||||||
|
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
|
||||||
|
if pclass and (ptype == 'full') and (len(pclass) >= 6):
|
||||||
|
tag = 'p'
|
||||||
|
if pclass[3:6] == 'h1-' : tag = 'h4'
|
||||||
|
if pclass[3:6] == 'h2-' : tag = 'h5'
|
||||||
|
if pclass[3:6] == 'h3-' : tag = 'h6'
|
||||||
|
htmlpage += '<' + tag + ' class="' + pclass + '">'
|
||||||
|
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
|
||||||
|
htmlpage += '</' + tag + '>'
|
||||||
|
else :
|
||||||
|
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
|
||||||
|
else :
|
||||||
|
print ' is a "graphic" region'
|
||||||
|
(pos, simgsrc) = self.findinDoc('img.src',start,end)
|
||||||
|
if simgsrc:
|
||||||
|
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
|
||||||
|
|
||||||
|
|
||||||
if last_para_continued :
|
if last_para_continued :
|
||||||
if htmlpage[-4:] == '</p>':
|
if htmlpage[-4:] == '</p>':
|
||||||
|
@ -589,10 +667,10 @@ class DocParser(object):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def convert2HTML(flatxml, classlst, fileid, bookDir):
|
def convert2HTML(flatxml, classlst, fileid, bookDir, fixedimage):
|
||||||
|
|
||||||
# create a document parser
|
# create a document parser
|
||||||
dp = DocParser(flatxml, classlst, fileid, bookDir)
|
dp = DocParser(flatxml, classlst, fileid, bookDir, fixedimage)
|
||||||
|
|
||||||
htmlpage = dp.process()
|
htmlpage = dp.process()
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
import os, sys, getopt
|
import os, sys, getopt
|
||||||
|
|
||||||
|
@ -14,13 +14,16 @@ import getpagedim
|
||||||
def usage():
|
def usage():
|
||||||
print 'Usage: '
|
print 'Usage: '
|
||||||
print ' '
|
print ' '
|
||||||
print ' genhtml.py unencryptedBookDir'
|
print ' genhtml.py [--fixed-image] unencryptedBookDir'
|
||||||
|
print ' '
|
||||||
|
print ' Options: '
|
||||||
|
print ' --fixed-image : force translation of fixed regions into svg images '
|
||||||
print ' '
|
print ' '
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
bookDir = ''
|
bookDir = ''
|
||||||
|
fixedimage = False
|
||||||
|
|
||||||
if len(argv) == 0:
|
if len(argv) == 0:
|
||||||
argv = sys.argv
|
argv = sys.argv
|
||||||
|
@ -28,7 +31,7 @@ def main(argv):
|
||||||
argv = argv.split()
|
argv = argv.split()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
opts, args = getopt.getopt(argv[1:], "h:")
|
opts, args = getopt.getopt(argv[1:], "h:",["fixed-image"])
|
||||||
|
|
||||||
except getopt.GetoptError, err:
|
except getopt.GetoptError, err:
|
||||||
print str(err)
|
print str(err)
|
||||||
|
@ -43,6 +46,8 @@ def main(argv):
|
||||||
if o =="-h":
|
if o =="-h":
|
||||||
usage()
|
usage()
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
if o =="--fixed-image":
|
||||||
|
fixedimage = True
|
||||||
|
|
||||||
bookDir = args[0]
|
bookDir = args[0]
|
||||||
|
|
||||||
|
@ -139,7 +144,7 @@ def main(argv):
|
||||||
print ' ', filename
|
print ' ', filename
|
||||||
fname = os.path.join(pageDir,filename)
|
fname = os.path.join(pageDir,filename)
|
||||||
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
|
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
|
||||||
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir)
|
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, fixedimage)
|
||||||
|
|
||||||
htmlstr += '</body>\n</html>\n'
|
htmlstr += '</body>\n</html>\n'
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
import os, sys, getopt
|
import os, sys, getopt
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
import os, sys, getopt
|
import os, sys, getopt
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import csv
|
import csv
|
||||||
|
|
|
@ -31,9 +31,8 @@ genhtml.py - main program to generate "book.html"
|
||||||
gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics
|
gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics
|
||||||
|
|
||||||
|
|
||||||
Please note, gensvg.py, genhtml.py, and genxml.py import and use
|
Please note, these scripts all import code from each other so please
|
||||||
decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py
|
keep all of these python scripts together in the same place.
|
||||||
so please keep all of these python scripts together in the same place.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -78,6 +77,12 @@ The step must NOW be done BEFORE attempting conversion to html
|
||||||
When complete, use a web-browser to open the page*.xhtml files
|
When complete, use a web-browser to open the page*.xhtml files
|
||||||
in TARGETDIR/svg/ to see what the book really looks like.
|
in TARGETDIR/svg/ to see what the book really looks like.
|
||||||
|
|
||||||
|
If you would prefer pure svg pages, then use the -r option
|
||||||
|
as follows:
|
||||||
|
|
||||||
|
gensvg.py -r TARGETDIR
|
||||||
|
|
||||||
|
|
||||||
All thanks go to CLARKNOVA for this program. This program is
|
All thanks go to CLARKNOVA for this program. This program is
|
||||||
needed to actually see the true image of each page and so that
|
needed to actually see the true image of each page and so that
|
||||||
the next step can properly create images from glyphs for
|
the next step can properly create images from glyphs for
|
||||||
|
@ -97,6 +102,16 @@ properly set bold and/or italics, handle font size changes,
|
||||||
and to fix the sometimes horrific mistakes in the ocrText
|
and to fix the sometimes horrific mistakes in the ocrText
|
||||||
used to create the html.
|
used to create the html.
|
||||||
|
|
||||||
|
If there are critical pages that need fixed layout in your book
|
||||||
|
you might want to consider forcing these fixed regions to
|
||||||
|
become svg images using the command instead
|
||||||
|
|
||||||
|
genhtml.py --fixed-image TARGETDIR
|
||||||
|
|
||||||
|
This will convert all fixed regions into svg images at the
|
||||||
|
expense of increased book size, slower loading speed, and
|
||||||
|
a loss of the ability to search for words in those regions
|
||||||
|
|
||||||
FYI: Sigil is a wonderful, free cross-
|
FYI: Sigil is a wonderful, free cross-
|
||||||
platform program that can be used to edit the html and
|
platform program that can be used to edit the html and
|
||||||
create an epub if you so desire.
|
create an epub if you so desire.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
# For use with Topaz Scripts Version 1.8
|
# For use with Topaz Scripts Version 2.0
|
||||||
|
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import csv
|
import csv
|
||||||
|
|
Loading…
Reference in a new issue