#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# For use with Topaz Scripts Version 2.6

import sys
import csv
import os
import math
import getopt
import functools
from struct import pack
from struct import unpack


class DocParser(object):

    def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
        self.id = os.path.basename(fileid).replace('.dat','')
        self.svgcount = 0
        self.docList = flatxml.split(b'\n')
        self.docSize = len(self.docList)
        self.classList = {}
        self.bookDir = bookDir
        self.gdict = gdict
        # classlst may arrive as str or bytes depending on the caller (the
        # original mixed a str split with a bytes comparison); normalize to
        # bytes so the stored keys match the bytes baseclass lookups done
        # later in getClass()
        if isinstance(classlst, str):
            classlst = classlst.encode('utf-8')
        tmpList = classlst.split(b'\n')
        for pclass in tmpList:
            if pclass != b'':
                # remove the leading period from the css name
                cname = pclass[1:]
                self.classList[cname] = True
        self.fixedimage = fixedimage
        self.ocrtext = []
        self.link_id = []
        self.link_title = []
        self.link_page = []
        self.link_href = []
        self.link_type = []
        self.dehyphen_rootid = []
        self.paracont_stemid = []
        self.parastems_stemid = []

    # look up the svg path definition for glyph id gid in the book's
    # glyph dictionary
    def getGlyph(self, gid):
        # (key renamed from 'id' to avoid shadowing the builtin)
        key = 'id="gl%d"' % gid
        return self.gdict.lookup(key)

    def glyphs_to_image(self, glyphList):

        def extract(path, key):
            # pull the integer value that follows key in the glyph definition
            b = path.find(key) + len(key)
            e = path.find(' ',b)
            return int(path[b:e])

        svgDir = os.path.join(self.bookDir,'svg')
        imgDir = os.path.join(self.bookDir,'img')
        imgname = self.id + '_%04d.svg' % self.svgcount
        imgfile = os.path.join(imgDir,imgname)

        # get glyph information
        gxList = self.getData(b'info.glyph.x',0,-1)
        gyList = self.getData(b'info.glyph.y',0,-1)
        gidList = self.getData(b'info.glyph.glyphID',0,-1)

        gids = []
        maxws = []
        maxhs = []
        xs = []
        ys = []
        gdefs = []

        # get path definitions, positions, dimensions for each glyph
        # that makes up the image, and find min x and min y to reposition origin
        minx = -1
        miny = -1
        for j in glyphList:
            gid = gidList[j]
            gids.append(gid)

            xs.append(gxList[j])
            if minx == -1: minx = gxList[j]
            else : minx = min(minx, gxList[j])

            ys.append(gyList[j])
            if miny == -1: miny = gyList[j]
            else : miny = min(miny, gyList[j])

            path = self.getGlyph(gid)
            gdefs.append(path)

            maxws.append(extract(path,'width='))
            maxhs.append(extract(path,'height='))

        # change the origin to minx, miny and calc max height and width
        maxw = maxws[0] + xs[0] - minx
        maxh = maxhs[0] + ys[0] - miny
        for j in range(0, len(xs)):
            xs[j] = xs[j] - minx
            ys[j] = ys[j] - miny
            maxw = max( maxw, (maxws[j] + xs[j]) )
            maxh = max( maxh, (maxhs[j] + ys[j]) )

        # open the image file for output
        ifile = open(imgfile,'w')
        ifile.write('<?xml version="1.0" standalone="no"?>\n')
        ifile.write('<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
        ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
        ifile.write('<defs>\n')
        for j in range(0,len(gdefs)):
            ifile.write(gdefs[j])
        ifile.write('</defs>\n')
        for j in range(0,len(gids)):
            ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
        ifile.write('</svg>')
        ifile.close()

        return 0

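    # For orientation, the generated file is a plain SVG 1.1 document of
    # roughly the shape below (an illustrative sketch only; the glyph id,
    # coordinates, and path data are made-up values, but each def carries the
    # 'width=' and 'height=' attributes that extract() above relies on):
    #
    #   <svg width="12px" height="17px" viewBox="0 0 128 170" ...>
    #   <defs>
    #   <path id="gl72" d="..." width="64" height="170" />
    #   </defs>
    #   <use xlink:href="#gl72" x="0" y="0" />
    #   </svg>
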
    # return tag at line pos in document
    def lineinDoc(self, pos) :
        if (pos >= 0) and (pos < self.docSize) :
            item = self.docList[pos]
            if item.find(b'=') >= 0:
                (name, argres) = item.split(b'=',1)
            else :
                name = item
                argres = b''
        return name, argres

    # find tag in doc if within pos to end inclusive
    def findinDoc(self, tagpath, pos, end) :
        result = None
        if end == -1 :
            end = self.docSize
        else:
            end = min(self.docSize, end)
        foundat = -1
        for j in range(pos, end):
            item = self.docList[j]
            if item.find(b'=') >= 0:
                (name, argres) = item.split(b'=',1)
            else :
                name = item
                argres = b''
            if (isinstance(tagpath,str)):
                tagpath = tagpath.encode('utf-8')
            if name.endswith(tagpath) :
                result = argres
                foundat = j
                break
        return foundat, result

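    # The flattened xml being searched is a list of lines such as (made-up
    # sample values):
    #
    #   b'page.group.region.paragraph.firstWord=8'
    #   b'info.glyph.x=100|212|315'
    #
    # so findinDoc() returns the text after '=', and getData() below further
    # splits such a value on b'|' into a list of integers.
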
    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        startpos = []
        pos = 0
        res = ""
        while res != None :
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res != None :
                startpos.append(foundpos)
                pos = foundpos + 1
        return startpos

    # returns a vector of integers for the tagpath
    def getData(self, tagpath, pos, end):
        argres=[]
        (foundat, argt) = self.findinDoc(tagpath, pos, end)
        if (argt != None) and (len(argt) > 0) :
            argList = argt.split(b'|')
            argres = [ int(strval) for strval in argList]
        return argres

    # get the class
    def getClass(self, pclass):
        nclass = pclass

        # class names are an issue given topaz may start them with numerals (not allowed),
        # use a mix of cases (which cause some browsers problems), and actually
        # attach numbers after "_reclustered*" to the end to deal with classes that
        # inherit from a base class (but then not actually provide all of these
        # _reclustered classes in the stylesheet!)

        # so we clean this up by lowercasing, prepending 'cl-', and getting any
        # baseclass that exists in the stylesheet first, and then adding this
        # specific class after

        # also some class names have spaces in them so need to convert to dashes
        if nclass != None :
            nclass = nclass.replace(b' ',b'-')
            classres = b''
            nclass = nclass.lower()
            nclass = b'cl-' + nclass
            baseclass = b''
            # graphic is the base class for captions
            if nclass.find(b'cl-cap-') >= 0 :
                classres = b'graphic' + b' '
            else :
                # strip to find baseclass
                p = nclass.find(b'_')
                if p > 0 :
                    baseclass = nclass[0:p]
                    if baseclass in self.classList:
                        classres += baseclass + b' '
            classres += nclass
            nclass = classres
        return nclass

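    # Illustrative examples of the cleanup above (made-up class names):
    #   b'Story_reclustered2' -> b'cl-story cl-story_reclustered2'
    #     (when b'cl-story' is present in the stylesheet classList)
    #   b'cap-Figure'         -> b'graphic cl-cap-figure'
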
    # develop a sorted description of the starting positions of
    # groups and regions on the page, as well as the page type
    def PageDescription(self):

        def compare(x, y):
            (xtype, xval) = x
            (ytype, yval) = y
            if xval > yval:
                return 1
            if xval == yval:
                return 0
            return -1

        result = []
        (pos, pagetype) = self.findinDoc(b'page.type',0,-1)

        groupList = self.posinDoc(b'page.group')
        groupregionList = self.posinDoc(b'page.group.region')
        pageregionList = self.posinDoc(b'page.region')
        # integrate into one list
        for j in groupList:
            result.append(('grpbeg',j))
        for j in groupregionList:
            result.append(('gregion',j))
        for j in pageregionList:
            result.append(('pregion',j))
        result.sort(key=functools.cmp_to_key(compare))

        # insert group end and page end indicators
        inGroup = False
        j = 0
        while True:
            if j == len(result): break
            rtype = result[j][0]
            rval = result[j][1]
            if not inGroup and (rtype == 'grpbeg') :
                inGroup = True
                j = j + 1
            elif inGroup and (rtype in ('grpbeg', 'pregion')):
                result.insert(j,('grpend',rval))
                inGroup = False
            else:
                j = j + 1
        if inGroup:
            result.append(('grpend',-1))
        result.append(('pageend', -1))
        return pagetype, result

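    # For a page holding one group of two regions followed by a free-standing
    # region, PageDescription() yields a list like (line numbers made up):
    #   [('grpbeg',10), ('gregion',12), ('gregion',30),
    #    ('grpend',48), ('pregion',48), ('pageend',-1)]
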
    # build a description of the paragraph
    def getParaDescription(self, start, end, regtype):

        result = []

        # paragraph
        (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)

        pclass = self.getClass(pclass)

        # if paragraph uses extratokens (extra glyphs) then make it fixed
        (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)

        # build up a description of the paragraph in result and return it
        # first check for the basic - all words paragraph
        (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
        (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            first = int(sfirst)
            last = int(slast)

            makeImage = (regtype == b'vertical') or (regtype == b'table')
            makeImage = makeImage or (extraglyphs != None)
            if self.fixedimage:
                makeImage = makeImage or (regtype == b'fixed')

            if (pclass != None):
                makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
                if self.fixedimage :
                    makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)

            # before creating an image make sure glyph info exists
            gidList = self.getData(b'info.glyph.glyphID',0,-1)

            makeImage = makeImage and (len(gidList) > 0)

            if not makeImage :
                # standard all word paragraph
                for wordnum in range(first, last):
                    result.append(('ocr', wordnum))
                return pclass, result

            # convert paragraph to svg image
            # translate first and last word into first and last glyphs
            # and generate inline image and include it
            glyphList = []
            firstglyphList = self.getData(b'word.firstGlyph',0,-1)
            gidList = self.getData(b'info.glyph.glyphID',0,-1)
            firstGlyph = firstglyphList[first]
            if last < len(firstglyphList):
                lastGlyph = firstglyphList[last]
            else :
                lastGlyph = len(gidList)

            # handle case of white space paragraphs with no actual glyphs in them
            # by reverting to text based paragraph
            if firstGlyph >= lastGlyph:
                # revert to standard text based paragraph
                for wordnum in range(first, last):
                    result.append(('ocr', wordnum))
                return pclass, result

            for glyphnum in range(firstGlyph, lastGlyph):
                glyphList.append(glyphnum)
            # include any extratokens if they exist
            (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
            (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
            if (sfg != None) and (slg != None):
                for glyphnum in range(int(sfg), int(slg)):
                    glyphList.append(glyphnum)
            num = self.svgcount
            self.glyphs_to_image(glyphList)
            self.svgcount += 1
            result.append(('svg', num))
            return pclass, result

        # this type of paragraph may be made up of multiple spans, inline
        # word monograms (images), and words with semantic meaning,
        # plus glyphs used to form starting letter of first word

        # need to parse this type line by line
        line = start + 1
        word_class = ''

        # if end is -1 then we must search to end of document
        if end == -1 :
            end = self.docSize

        # seems some xml has last* coming before first* so we have to
        # handle any order
        sp_first = -1
        sp_last = -1

        gl_first = -1
        gl_last = -1

        ws_first = -1
        ws_last = -1

        word_class = ''

        word_semantic_type = ''

        while (line < end) :

            (name, argres) = self.lineinDoc(line)

            if name.endswith(b'span.firstWord') :
                sp_first = int(argres)

            elif name.endswith(b'span.lastWord') :
                sp_last = int(argres)

            elif name.endswith(b'word.firstGlyph') :
                gl_first = int(argres)

            elif name.endswith(b'word.lastGlyph') :
                gl_last = int(argres)

            elif name.endswith(b'word_semantic.firstWord'):
                ws_first = int(argres)

            elif name.endswith(b'word_semantic.lastWord'):
                ws_last = int(argres)

            elif name.endswith(b'word.class'):
                # we only handle spaceafter word class
                try:
                    (cname, space) = argres.split(b'-',1)
                    if space == b'' : space = b'0'
                    if (cname == b'spaceafter') and (int(space) > 0) :
                        word_class = 'sa'
                except:
                    pass

            elif name.endswith(b'word.img.src'):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

            elif name.endswith(b'region.img.src'):
                result.append(('img' + word_class, int(argres)))

            if (sp_first != -1) and (sp_last != -1):
                for wordnum in range(sp_first, sp_last):
                    result.append(('ocr', wordnum))
                sp_first = -1
                sp_last = -1

            if (gl_first != -1) and (gl_last != -1):
                glyphList = []
                for glyphnum in range(gl_first, gl_last):
                    glyphList.append(glyphnum)
                num = self.svgcount
                self.glyphs_to_image(glyphList)
                self.svgcount += 1
                result.append(('svg', num))
                gl_first = -1
                gl_last = -1

            if (ws_first != -1) and (ws_last != -1):
                for wordnum in range(ws_first, ws_last):
                    result.append(('ocr', wordnum))
                ws_first = -1
                ws_last = -1

            line += 1

        return pclass, result

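    # The description returned above is a list of (type, number) pairs, e.g.
    # (made-up word/image numbers):
    #   [('ocr', 14), ('ocr', 15), ('imgsa', 3), ('svg', 0)]
    # which buildParagraph() below renders as words from the ocr text, inline
    # jpeg images, and inline svg glyph images respectively.
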
    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''

        classres = ''
        if pclass :
            classres = ' class="' + pclass.decode('utf-8') + '"'

        # regtype is bytes throughout, so compare against bytes literals
        br_lb = (regtype == b'fixed') or (regtype == b'chapterheading') or (regtype == b'vertical')

        handle_links = len(self.link_id) > 0

        if (type == 'full') or (type == 'begin') :
            parares += '<p' + classres + '>'

        if (type == 'end'):
            parares += ' '

        lstart = len(parares)

        cnt = len(pdesc)

        for j in range( 0, cnt) :

            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
                try:
                    word = self.ocrtext[num]
                except:
                    word = b""

                sep = ' '

                if handle_links:
                    link = self.link_id[num]
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
                        if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
                            title = parares[lstart:].encode('utf-8')
                        if linktype == b'external' :
                            linkhref = self.link_href[link-1]
                            linkhtml = '<a href="%s">' % linkhref.decode('utf-8')
                        else :
                            if len(self.link_page) >= link :
                                ptarget = self.link_page[link-1] - 1
                                linkhtml = '<a href="#page%04d">' % ptarget
                            else :
                                # just link to the current page
                                linkhtml = '<a href="#' + self.id + '">'
                        linkhtml += title.decode('utf-8')
                        linkhtml += '</a>'
                        pos = parares.rfind(title.decode('utf-8'))
                        if pos >= 0:
                            parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                        else :
                            parares += linkhtml
                        lstart = len(parares)
                        if word == b'_link_' : word = b''
                    elif (link < 0) :
                        if word == b'_link_' : word = b''

                if word == b'_lb_':
                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
                        word = b''
                        sep = ''
                    elif br_lb :
                        word = b'<br />\n'
                        sep = ''
                    else :
                        word = b'\n'
                        sep = ''

                if num in self.dehyphen_rootid :
                    word = word[0:-1]
                    sep = ''

                parares += word.decode('utf-8') + sep

            elif wtype == 'img' :
                sep = ''
                parares += '<img src="img/img%04d.jpg" alt="" />' % num
                parares += sep

            elif wtype == 'imgsa' :
                sep = ' '
                parares += '<img src="img/img%04d.jpg" alt="" />' % num
                parares += sep

            elif wtype == 'svg' :
                sep = ''
                parares += '<img src="img/'
                parares += self.id
                parares += '_%04d.svg" alt="" />' % num
                parares += sep

        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '</p>'
        return parares

    def buildTOCEntry(self, pdesc) :
        parares = ''
        sep =''
        tocentry = ''
        handle_links = len(self.link_id) > 0

        lstart = 0

        cnt = len(pdesc)
        for j in range( 0, cnt) :

            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
                word = self.ocrtext[num].decode('utf-8')
                sep = ' '

                if handle_links:
                    link = self.link_id[num]
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
                        title = title.rstrip(b'. ').decode('utf-8')
                        alt_title = parares[lstart:]
                        alt_title = alt_title.strip()
                        # now strip off the actual printed page number
                        alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
                        alt_title = alt_title.rstrip('. ')
                        # skip over any external links - can't have them in a book's toc
                        if linktype == b'external' :
                            title = ''
                            alt_title = ''
                            linkpage = ''
                        else :
                            if len(self.link_page) >= link :
                                ptarget = self.link_page[link-1] - 1
                                linkpage = '%04d' % ptarget
                            else :
                                # just link to the current page
                                linkpage = self.id[4:]
                        if len(alt_title) >= len(title):
                            title = alt_title
                        if title != '' and linkpage != '':
                            tocentry += title + '|' + linkpage + '\n'
                        lstart = len(parares)
                        if word == '_link_' : word = ''
                    elif (link < 0) :
                        if word == '_link_' : word = ''

                if word == '_lb_':
                    word = ''
                    sep = ''

                if num in self.dehyphen_rootid :
                    word = word[0:-1]
                    sep = ''

                parares += word + sep

            else :
                continue

        return tocentry

    # walk the document tree collecting the information needed
    # to build an html page using the ocrText

    def process(self):

        tocinfo = ''
        hlst = []

        # get the ocr text
        (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
        if argres : self.ocrtext = argres.split(b'|')

        # get information to dehyphenate the text
        self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)

        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
        first_para_continued = (self.parastems_stemid != None)

        # determine if last paragraph is continued onto the next page
        (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
        last_para_continued = (self.paracont_stemid != None)

        # collect link ids
        self.link_id = self.getData(b'info.word.link_id',0,-1)

        # collect link destination page numbers
        self.link_page = self.getData(b'info.links.page',0,-1)

        # collect link types (container versus external)
        (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
        if argres : self.link_type = argres.split(b'|')

        # collect link destinations
        (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
        if argres : self.link_href = argres.split(b'|')

        # collect link titles
        (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
        if argres :
            self.link_title = argres.split(b'|')
        else:
            self.link_title.append(b'')

        # get a description of the starting points of the regions
        # and groups on the page
        (pagetype, pageDesc) = self.PageDescription()
        regcnt = len(pageDesc) - 1

        anchorSet = False
        breakSet = False
        inGroup = False

        # process each region on the page and convert what you can to html

        for j in range(regcnt):

            (etype, start) = pageDesc[j]
            (ntype, end) = pageDesc[j+1]

            # set anchor for link target on this page
            if not anchorSet and not first_para_continued:
                hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
                hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
                anchorSet = True

            # handle groups of graphics with text captions
            # (PageDescription() tags its entries with str markers, so the
            # comparisons here must use str, not bytes)
            if (etype == 'grpbeg'):
                (pos, grptype) = self.findinDoc(b'group.type', start, end)
                if grptype != None:
                    if grptype == b'graphic':
                        gcstr = ' class="' + grptype.decode('utf-8') + '"'
                        hlst.append('<div' + gcstr + '>')
                        inGroup = True

            elif (etype == 'grpend'):
                if inGroup:
                    hlst.append('</div>\n')
                    inGroup = False

            else:
                (pos, regtype) = self.findinDoc(b'region.type',start,end)

                if regtype == b'graphic' :
                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                    if simgsrc:
                        if inGroup:
                            hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
                        else:
                            hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

                elif regtype == b'chapterheading' :
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if not breakSet:
                        hlst.append('<div style="page-break-after: always;"> </div>\n')
                        breakSet = True
                    tag = 'h1'
                    if pclass and (len(pclass) >= 7):
                        if pclass[3:7] == b'ch1-' : tag = 'h1'
                        if pclass[3:7] == b'ch2-' : tag = 'h2'
                        if pclass[3:7] == b'ch3-' : tag = 'h3'
                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                    else:
                        hlst.append('<' + tag + '>')
                    hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                    hlst.append('</' + tag + '>')

                elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
                    ptype = 'full'
                    # check to see if this is a continuation from the previous page
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                        tag = 'p'
                        if pclass[3:6] == b'h1-' : tag = 'h4'
                        if pclass[3:6] == b'h2-' : tag = 'h5'
                        if pclass[3:6] == b'h3-' : tag = 'h6'
                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                        hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                        hlst.append('</' + tag + '>')
                    else :
                        hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

                elif (regtype == b'tocentry') :
                    ptype = 'full'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    tocinfo += self.buildTOCEntry(pdesc)
                    hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

                elif (regtype == b'vertical') or (regtype == b'table') :
                    ptype = 'full'
                    if inGroup:
                        ptype = 'middle'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start, end, regtype)
                    hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

                elif (regtype == b'synth_fcvr.center'):
                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                    if simgsrc:
                        hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

                else :
                    print(' Making region type', regtype, end=' ')
                    (pos, temp) = self.findinDoc(b'paragraph',start,end)
                    (pos2, temp) = self.findinDoc(b'span',start,end)
                    if pos != -1 or pos2 != -1:
                        print(' a "text" region')
                        orig_regtype = regtype
                        regtype = b'fixed'
                        ptype = 'full'
                        # check to see if this is a continuation from the previous page
                        if first_para_continued :
                            ptype = 'end'
                            first_para_continued = False
                        (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                        if not pclass:
                            if orig_regtype.endswith(b'.right')     : pclass = b'cl-right'
                            elif orig_regtype.endswith(b'.center')  : pclass = b'cl-center'
                            elif orig_regtype.endswith(b'.left')    : pclass = b'cl-left'
                            elif orig_regtype.endswith(b'.justify') : pclass = b'cl-justify'
                        if pclass and (ptype == 'full') and (len(pclass) >= 6):
                            tag = 'p'
                            if pclass[3:6] == b'h1-' : tag = 'h4'
                            if pclass[3:6] == b'h2-' : tag = 'h5'
                            if pclass[3:6] == b'h3-' : tag = 'h6'
                            hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                            hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                            hlst.append('</' + tag + '>')
                        else :
                            hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
                    else :
                        print(' a "graphic" region')
                        (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                        if simgsrc:
                            hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

        htmlpage = "".join(hlst)
        if last_para_continued :
            if htmlpage[-4:] == '</p>':
                htmlpage = htmlpage[0:-4]
            last_para_continued = False

        return htmlpage, tocinfo


def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
    # create a document parser
    dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
    htmlpage, tocinfo = dp.process()
    return htmlpage, tocinfo
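
# Example usage (an illustrative sketch, not part of the original tool): in
# the real pipeline the flattened page xml, the css class list, and the
# GlyphDict come from the other Topaz scripts; the file names below are
# made-up placeholders.
#
#   with open('page0001.dat', 'rb') as f:
#       flatxml = f.read()
#   htmlpage, tocinfo = convert2HTML(flatxml, classlst, 'page0001.dat',
#                                    bookDir, gdict, fixedimage=True)
#   # htmlpage is the html fragment for the page; tocinfo holds any
#   # 'title|page' toc lines found on it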