ineptpdf 7

This commit is contained in:
Anonymous 2010-02-23 08:15:41 +00:00 committed by Apprentice Alf
parent f027848bff
commit 4f19f5ac11

View file

@ -1,6 +1,7 @@
#! /usr/bin/python
# ineptpdf.pyw, version 6.1
# ineptpdf7.pyw
# ineptpdf, version 7
# To run this program install Python 2.6 from http://www.python.org/download/
# and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto
@ -15,6 +16,10 @@
# 5 - removing small bug with V3 ebooks (anon)
# 6 - changed to adeptkey4.der format for 1.7.2 support (anon)
# 6.1 - backward compatibility for 1.7.1 and old adeptkey.der
# 7 - Get cross reference streams and object streams working for input.
# Not yet supported on output but this only affects file size,
# not functionality. (by anon2)
"""
Decrypt Adobe ADEPT-encrypted PDF files.
"""
@ -42,6 +47,10 @@ try:
except ImportError:
ARC4 = None
RSA = None
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
class ADEPTError(Exception):
@ -569,16 +578,17 @@ class PSBaseParser(object):
pos = self.fp.tell()
buf = ''
while 0 < pos:
prevpos = pos
pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos)
s = self.fp.read(self.BUFSIZ)
s = self.fp.read(prevpos-pos)
if not s: break
while 1:
n = max(s.rfind('\r'), s.rfind('\n'))
if n == -1:
buf = s + buf
break
yield buf+s[n:]
yield s[n:]+buf
s = s[:n]
buf = ''
return
@ -867,7 +877,7 @@ class PDFStream(PDFObject):
(self.objid, len(self.rawdata), self.dic)
def decode(self):
assert self.data == None and self.rawdata != None
assert self.data is None and self.rawdata is not None
data = self.rawdata
if self.decipher:
# Handle encryption
@ -884,10 +894,6 @@ class PDFStream(PDFObject):
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
data = ascii85decode(data)
@ -926,7 +932,7 @@ class PDFStream(PDFObject):
return
def get_data(self):
if self.data == None:
if self.data is None:
self.decode()
return self.data
@ -934,6 +940,13 @@ class PDFStream(PDFObject):
return self.rawdata
def get_decdata(self):
if self.data is not None:
# Data has already been decrypted and decoded. This is the case
# for object streams. Note: this data is wrong to put in the
# output because it should be stored decrypted but
# uncompressed. This can be done by storing the intermediate
# data. For now object streams are useless in the output.
return self.data
data = self.rawdata
if self.decipher and data:
# Handle encryption
@ -989,7 +1002,7 @@ class PDFXRef(object):
if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try:
(start, nobjs) = map(long, f)
(start, nobjs) = map(int, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
for objid in xrange(start, start+nobjs):
@ -1002,7 +1015,7 @@ class PDFXRef(object):
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
self.offsets[objid] = (int(genno), long(pos))
self.offsets[objid] = (int(genno), int(pos))
self.load_trailer(parser)
return
@ -1040,7 +1053,7 @@ class PDFXRefStream(object):
return
def __repr__(self):
return '<PDFXRef: objid=%d-%d>' % (self.objid_first, self.objid_last)
return '<PDFXRef: objids=%s>' % self.index
def objids(self):
for first, size in self.index:
@ -1298,12 +1311,45 @@ class PDFDocument(object):
except KeyError:
pass
else:
return
#if STRICT:
# raise PDFSyntaxError('Cannot locate objid=%r' % objid)
return None
if stmid:
return PDFObjStmRef(objid, stmid, index)
# Later try to introduce PDFObjStmRef's
# return PDFObjStmRef(objid, stmid, index)
# Stuff from pdfminer
stream = stream_value(self.getobj(stmid))
if stream.dic.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream.dic['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if stmid in self.parsed_objs:
objs = self.parsed_objs[stmid]
else:
parser = PDFObjStrmParser(stream.get_data(), self)
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[stmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
###
else:
self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid
@ -1316,9 +1362,9 @@ class PDFDocument(object):
(_,obj) = self.parser.nextobject()
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
self.objs[objid] = obj
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
self.objs[objid] = obj
return obj
class PDFObjStmRef(object):
@ -1419,7 +1465,7 @@ class PDFParser(PSStackParser):
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
return long(prev)
return int(prev)
# read xref table
def read_xref_from(self, start, xrefs):
@ -1482,6 +1528,34 @@ class PDFParser(PSStackParser):
xrefs.append(xref)
return xrefs
## PDFObjStrmParser
##
class PDFObjStrmParser(PDFParser):
def __init__(self, data, doc):
PSStackParser.__init__(self, StringIO(data))
self.doc = doc
return
def flush(self):
self.add_results(*self.popall())
return
KEYWORD_R = KWD('R')
def do_keyword(self, pos, token):
if token is self.KEYWORD_R:
# reference to indirect object
try:
((_,objid), (_,genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
pass
return
# others
self.push((pos, token))
return
###
### My own code, for which there is none else to blame
@ -1521,6 +1595,7 @@ class PDFSerializer(object):
if isinstance(obj, PDFObjStmRef):
xrefstm[objid] = obj
continue
if obj is not None:
xrefs[objid] = self.tell()
self.serialize_indirect(objid, obj)
startxref = self.tell()
@ -1611,6 +1686,13 @@ class PDFSerializer(object):
self.write(' ')
self.write('%d %d R' % (obj.objid, 0))
elif isinstance(obj, PDFStream):
### For now, we have extracted all objects from an Object Stream,
### so we don't need these any more. Therefore leave them out
### of the output. Later we could try to use object streams in
### the output again to get smaller output.
if obj.dic.get('Type') == LITERAL_OBJSTM:
self.write('(deleted)')
else:
data = obj.get_decdata()
self.serialize_object(obj.dic)
self.write('stream\n')
@ -1697,7 +1779,7 @@ class DecryptionDialog(Tkinter.Frame):
def get_inpath(self):
inpath = tkFileDialog.askopenfilename(
parent=None, title='Select ADEPT-encrypted PDF file to decrypt',
defaultextension='.epub', filetypes=[('PDF files', '.pdf'),
defaultextension='.pdf', filetypes=[('PDF files', '.pdf'),
('All files', '.*')])
if inpath:
inpath = os.path.normpath(inpath)
@ -1708,7 +1790,7 @@ class DecryptionDialog(Tkinter.Frame):
def get_outpath(self):
outpath = tkFileDialog.asksaveasfilename(
parent=None, title='Select unencrypted PDF file to produce',
defaultextension='.epub', filetypes=[('PDF files', '.pdf'),
defaultextension='.pdf', filetypes=[('PDF files', '.pdf'),
('All files', '.*')])
if outpath:
outpath = os.path.normpath(outpath)