Coverage for pdfrw/pdfrw/pdfreader.py: 43%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# A part of pdfrw (https://github.com/pmaupin/pdfrw)
2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
3# Copyright (C) 2012-2015 Nerijus Mika
4# MIT license -- See LICENSE.txt for details
6'''
7The PdfReader class reads an entire PDF file into memory and
8parses the top-level container objects. (It does not parse
9into streams.) The object subclasses PdfDict, and the
10document pages are stored in a list in the pages attribute
11of the object.
12'''
13import gc
14import binascii
15import collections
16import itertools
18from .errors import PdfParseError, log
19from .tokens import PdfTokens
20from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
21from .uncompress import uncompress
22from . import crypt
23from .py23_diffs import convert_load, convert_store, iteritems
26class PdfReader(PdfDict):
def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
    ''' Look up the indirect object identified by (objnum, gennum).

        objnum and gennum are converted with int() to build the
        lookup key.  If the key is already registered in
        self.indirect_objects, that entry is returned.  Otherwise a
        PdfIndirect placeholder is registered, the key is added to
        self.deferred_objects, and the placeholder's _loader is set
        to self.loadindirect before it is returned.
    '''
    ref = (int(objnum), int(gennum))
    known = self.indirect_objects.get(ref)
    if known is not None:
        return known

    # First time this reference is seen: register a placeholder.
    placeholder = PdfIndirect(ref)
    self.indirect_objects[ref] = placeholder
    self.deferred_objects.add(ref)
    placeholder._loader = self.loadindirect
    return placeholder
def readarray(self, source, PdfArray=PdfArray):
    ''' Called after a [ token has been consumed.  Collects the
        following tokens from source until the matching ] and
        returns them wrapped in a PdfArray.

        An R token replaces the two previously collected items
        (object number, then generation number) with the result of
        self.findindirect().  Tokens registered in self.special are
        replaced by the value their handler returns.
    '''
    lookup = self.special.get
    items = []

    for token in source:
        if token in ']R':
            if token == ']':
                break
            # Indirect reference: the last two items are "objnum gennum".
            gennum = items.pop()
            objnum = items.pop()
            token = self.findindirect(objnum, gennum)
        else:
            handler = lookup(token)
            if handler is not None:
                token = handler(source)
        items.append(token)
    return PdfArray(items)
def readdict(self, source, PdfDict=PdfDict):
    ''' Found a << token.  Parse the tokens after that.

        Reads key/value pairs from source until a >> token is seen
        and returns them in a new PdfDict.

        A key that does not start with '/' is reported through
        source.error() and skipped.  A value that is registered in
        self.special (e.g. a nested dictionary or array) is replaced
        by what its handler returns.  Two consecutive all-digit
        tokens followed by R are turned into an indirect reference
        via self.findindirect(); if the R is missing, an error is
        reported and that key is not stored.
    '''
    specialget = self.special.get
    result = PdfDict()
    next = source.next

    tok = next()
    while tok != '>>':
        if not tok.startswith('/'):
            source.error('Expected PDF /name object')
            tok = next()
            continue
        key = tok
        value = next()
        func = specialget(value)
        if func is not None:
            # Compound or special value: let the handler consume it.
            value = func(source)
            tok = next()
        else:
            # Look ahead one token: "int int R" is an indirect reference,
            # anything else means tok is already the next key (or >>).
            tok = next()
            if value.isdigit() and tok.isdigit():
                tok2 = next()
                if tok2 != 'R':
                    source.error('Expected "R" following two integers')
                    # Resume parsing at the unexpected token; the
                    # current key is dropped.
                    tok = tok2
                    continue
                value = self.findindirect(value, tok)
                tok = next()
        result[key] = value
    return result
def empty_obj(self, source, PdfObject=PdfObject):
    ''' Handler registered for an 'endobj' token that shows up where
        an object value was expected, i.e. the file contains an
        empty object.

        Resets source.floc to source.tokstart so the caller sees the
        endobj token again.  Returns None; the PdfObject parameter
        is not used here.
    '''
    source.floc = source.tokstart
def badtoken(self, source):
    ''' Handler for a delimiter token that is not valid at the
        current position.  Reports it through source.exception()
        with the message 'Unexpected delimiter'.
    '''
    source.exception('Unexpected delimiter')
def findstream(self, obj, tok, source, len=len):
    ''' Work out where the content stream that follows an object
        begins, and return that offset into source.fdata.

        The stream data is not read here: its length may not be
        known yet, since Length can be an indirect object.

        The offset starts right after the tok token and skips an
        optional \\r and an optional \\n.  A missing \\n is reported
        as a warning when a \\r was present, otherwise as an error.
        The obj parameter is not used.
    '''
    data = source.fdata
    pos = source.tokstart + len(tok)

    saw_cr = data[pos] == '\r'
    if saw_cr:
        pos += 1

    if data[pos] == '\n':
        # Properly terminated keyword (\n or \r\n).
        return pos + 1

    if saw_cr:
        source.warning(r"stream keyword terminated "
                       r"by \r without \n")
    else:
        source.error(r'stream keyword not followed by \n')
    return pos
def readstream(self, obj, startstream, source, exact_required=False,
               streamending='endstream endobj'.split(), int=int):
    ''' Attach the stream data starting at offset startstream of
        source.fdata to obj.

        The data is first sliced using obj.Length.  If the two
        tokens following that slice are 'endstream' 'endobj', the
        work is done.  Otherwise, when exact_required is true,
        source.exception() is called; when it is false, the method
        searches for the endstream keyword and tries to work out
        what is wrong with the length, reporting each case through
        source.warning() / source.error().

        streamending is only compared against, never modified.
    '''
    fdata = source.fdata
    length = int(obj.Length)
    source.floc = target_endstream = startstream + length
    endit = source.multiple(2)
    # Tentatively store the data as described by /Length.
    obj._stream = fdata[startstream:target_endstream]
    if endit == streamending:
        return

    if exact_required:
        source.exception('Expected endstream endobj')

    # The length attribute does not match the distance between the
    # stream and endstream keywords.

    # TODO: Extract maxstream from dictionary of object offsets
    # and use rfind instead of find.
    maxstream = len(fdata) - 20
    endstream = fdata.find('endstream', startstream, maxstream)
    source.floc = startstream
    # room is only meaningful when endstream was found (checked next).
    room = endstream - startstream
    if endstream < 0:
        source.error('Could not find endstream')
        return
    # Length is one more than the available room and the stream keyword
    # was followed by \r\n: shift the data window back by one byte.
    if (length == room + 1 and
            fdata[startstream - 2:startstream] == '\r\n'):
        source.warning(r"stream keyword terminated by \r without \n")
        obj._stream = fdata[startstream - 1:target_endstream - 1]
        return
    source.floc = endstream
    if length > room:
        source.error('stream /Length attribute (%d) appears to '
                     'be too big (size %d) -- adjusting',
                     length, room)
        # Note: assigns obj.stream here, not obj._stream as above.
        obj.stream = fdata[startstream:endstream]
        return
    # Non-whitespace data between the declared end and endstream
    # means the declared length was too short.
    if fdata[target_endstream:endstream].rstrip():
        source.error('stream /Length attribute (%d) appears to '
                     'be too small (size %d) -- adjusting',
                     length, room)
        obj.stream = fdata[startstream:endstream]
        return
    endobj = fdata.find('endobj', endstream, maxstream)
    if endobj < 0:
        source.error('Could not find endobj after endstream')
        return
    if fdata[endstream:endobj].rstrip() != 'endstream':
        source.error('Unexpected data between endstream and endobj')
        return
    source.error('Illegal endstream/endobj combination')
def loadindirect(self, key, PdfDict=PdfDict,
                 isinstance=isinstance):
    ''' Load the indirect object registered under key, an
        (objnum, gennum) tuple, and return it.

        If self.indirect_objects no longer holds a PdfIndirect
        placeholder for key, the stored value is returned as is.
        Otherwise the object is parsed at the offset recorded in
        self.source.obj_offsets, stored in self.indirect_objects,
        removed from self.deferred_objects and returned.  None is
        returned (after a warning) when no offset is known or the
        object header cannot be located.
    '''
    result = self.indirect_objects.get(key)
    if not isinstance(result, PdfIndirect):
        return result
    source = self.source
    offset = int(self.source.obj_offsets.get(key, '0'))
    if not offset:
        source.warning("Did not find PDF object %s", key)
        return None

    # Read the object header and validate it
    objnum, gennum = key
    source.floc = offset
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
    ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
    ok = ok and objid[2] == 'obj'
    if not ok:
        source.floc = offset
        source.next()
        # The header is not at the recorded offset.  Search the raw
        # data for "<objnum> <gennum> obj" at the start of a line.
        objheader = '%d %d obj' % (objnum, gennum)
        fdata = source.fdata
        offset2 = (fdata.find('\n' + objheader) + 1 or
                   fdata.find('\r' + objheader) + 1)
        # Give up if the header is not found, or is found a second
        # time later in the data.
        if (not offset2 or
                fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
            source.warning("Expected indirect object '%s'", objheader)
            return None
        source.warning("Indirect object %s found at incorrect "
                       "offset %d (expected offset %d)",
                       objheader, offset2, offset)
        source.floc = offset2 + len(objheader)

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    func = self.special.get(obj)
    if func is not None:
        obj = func(source)

    self.indirect_objects[key] = obj
    self.deferred_objects.remove(key)

    # Mark the object as indirect, and
    # just return it if it is a simple object.
    # NOTE(review): the handler self.empty_obj has no return statement,
    # so obj may be None at this point -- confirm this is handled.
    obj.indirect = key
    tok = source.next()
    if tok == 'endobj':
        return obj

    # Should be a stream.  Either that or it's broken.
    isdict = isinstance(obj, PdfDict)
    if isdict and tok == 'stream':
        self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj

    # Houston, we have a problem, but let's see if it
    # is easily fixable.  Leaving out a space before endobj
    # is apparently an easy mistake to make on generation
    # (Because it won't be noticed unless you are specifically
    # generating an indirect object that doesn't end with any
    # sort of delimiter.)  It is so common that things like
    # okular just handle it.

    if isinstance(obj, PdfObject) and obj.endswith('endobj'):
        source.error('No space or delimiter before endobj')
        # Strip the 6 characters of the glued-on 'endobj'.
        obj = PdfObject(obj[:-6])
    else:
        source.error("Expected 'endobj'%s token",
                     isdict and " or 'stream'" or '')
        obj = PdfObject('')

    # obj was replaced above, so mark and register it again.
    obj.indirect = key
    self.indirect_objects[key] = obj
    return obj
def read_all(self):
    ''' Call self.loadindirect() for every key in
        self.deferred_objects, repeating until a pass finds no key
        that has not been attempted before (loading an object may
        add new deferred keys).
    '''
    pending = self.deferred_objects
    attempted = set()
    fresh = pending - attempted
    while fresh:
        # Remember everything pending now so that keys which fail
        # to load are not retried forever.
        attempted |= pending
        for key in fresh:
            self.loadindirect(key)
        fresh = pending - attempted
def decrypt_all(self):
    ''' Load every deferred object, then, if crypt filters have
        been set up (self.crypt_filters is not None), pass all
        indirect objects to crypt.decrypt_objects().
    '''
    self.read_all()

    filters = self.crypt_filters
    if filters is None:
        return
    crypt.decrypt_objects(
        self.indirect_objects.values(), self.stream_crypt_filter, filters)
def uncompress(self):
    ''' Load every deferred object, then pass all indirect objects
        to the uncompress() function imported from .uncompress
        (this method shares that function's name).
    '''
    self.read_all()

    uncompress(self.indirect_objects.values())
def load_stream_objects(self, object_streams):
    ''' Load the objects stored inside object streams.

        object_streams is iterated for object numbers; each one is
        resolved with generation 0 and must have Type '/ObjStm'.
        The stream objects are decrypted (when crypt filters are
        set up) and uncompressed.  Each stream starts with pairs of
        integers (object number, offset relative to First); every
        object found that way is stored in self.indirect_objects
        under (number, 0), dropped from self.deferred_objects, and
        marked with its indirect key.
    '''
    # Resolve the stream containers themselves.
    containers = []
    for num in object_streams:
        container = self.findindirect(num, 0).real_value()
        assert container.Type == '/ObjStm'
        containers.append(container)

    if not containers:
        return

    # Decrypt
    if self.crypt_filters is not None:
        crypt.decrypt_objects(
            containers, self.stream_crypt_filter, self.crypt_filters)

    # Decompress
    uncompress(containers)

    for container in containers:
        tokens = PdfTokens(container.stream, 0, False)
        read_token = tokens.next
        first = int(container.First)

        # Header: "objnum offset" pairs up to the First offset.
        table = []
        while tokens.floc < first:
            objnum = int(read_token())
            table.append((objnum, first + int(read_token())))

        for objnum, offset in table:
            # Read the object, and call special code if it starts
            # an array or dictionary
            tokens.floc = offset
            value = read_token()
            handler = self.special.get(value)
            if handler is not None:
                value = handler(tokens)

            key = (objnum, 0)
            self.indirect_objects[key] = value
            self.deferred_objects.discard(key)

            # Mark the object as indirect
            value.indirect = key
def findxref(self, fdata):
    ''' Locate the last startxref keyword in fdata and read the
        cross-reference location that follows it.

        Returns a tuple (offset of startxref, PdfTokens positioned
        at the cross-reference location).  Raises PdfParseError if
        there is no startxref; a missing table location or EOF
        marker is reported through the tokenizer's exception().
    '''
    keyword_loc = fdata.rfind('startxref')
    if keyword_loc < 0:
        raise PdfParseError('Did not find "startxref" at end of file')

    tokens = PdfTokens(fdata, keyword_loc, False, self.verbose)
    keyword = tokens.next()
    assert keyword == 'startxref'  # (We just checked this...)

    tableloc = tokens.next_default()
    if not tableloc.isdigit():
        tokens.exception('Expected table location')

    # The EOF marker may come back with leading % characters and
    # trailing whitespace.
    marker = tokens.next_default()
    marker = marker.rstrip().lstrip('%')
    if marker != 'EOF':
        tokens.exception('Expected %%EOF')

    xref_source = PdfTokens(fdata, int(tableloc), True, self.verbose)
    return keyword_loc, xref_source
def parse_xref_stream(self, source, int=int, range=range,
                      enumerate=enumerate, islice=itertools.islice,
                      defaultdict=collections.defaultdict,
                      hexlify=binascii.hexlify):
    ''' Parse (one of) the cross-reference file section(s)

        This variant handles a cross-reference stream.  The first
        token (the object number) has already been consumed by the
        caller; the next three must be a number, 'obj' and '<<'.

        Offsets found are recorded in source.obj_offsets (existing
        entries are kept).  Entries of type 2 are collected in a
        dict of lists keyed by their second field, which is stored
        as obj.private.object_streams.  Returns the stream's
        dictionary object.
    '''

    def readint(s, lengths):
        # Endless generator: yields successive big-endian integers
        # from s, with field widths cycling through lengths.  A
        # zero-width field yields None.
        offset = 0
        for length in itertools.cycle(lengths):
            next = offset + length
            yield int(hexlify(s[offset:next]), 16) if length else None
            offset = next

    setdefault = source.obj_offsets.setdefault
    next = source.next
    # check for xref stream object
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit()
    ok = ok and objid[1] == 'obj'
    ok = ok and objid[2] == '<<'
    if not ok:
        source.exception('Expected xref stream start')
    obj = self.readdict(source)
    if obj.Type != PdfName.XRef:
        source.exception('Expected dict type of /XRef')
    tok = next()
    # exact_required=True: no length recovery for the xref stream.
    self.readstream(obj, self.findstream(obj, tok, source), source, True)
    old_strm = obj.stream
    if not uncompress([obj], True):
        source.exception('Could not decompress Xref stream')
    stream = obj.stream
    # Fix for issue #76 -- goofy compressed xref stream
    # that is NOT ACTUALLY COMPRESSED
    stream = stream if stream is not old_strm else convert_store(old_strm)
    # Index holds (first object number, count) pairs; without it a
    # single range starting at 0 with Size entries is used.
    num_pairs = obj.Index or PdfArray(['0', obj.Size])
    num_pairs = [int(x) for x in num_pairs]
    num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
    entry_sizes = [int(x) for x in obj.W]
    if len(entry_sizes) != 3:
        source.exception('Invalid entry size')
    object_streams = defaultdict(list)
    # One shared generator: each islice() below consumes the next
    # three fields of the stream.
    get = readint(stream, entry_sizes)
    for objnum, size in num_pairs:
        for cnt in range(size):
            xtype, p1, p2 = islice(get, 3)
            if xtype in (1, None):
                # p1 is recorded as the offset, p2 (default 0) as
                # the generation number.
                if p1:
                    setdefault((objnum, p2 or 0), p1)
            elif xtype == 2:
                # Grouped by p1; load_stream_objects() later treats
                # these keys as object stream numbers.
                object_streams[p1].append((objnum, p2))
            objnum += 1

    obj.private.object_streams = object_streams
    return obj
def parse_xref_table(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        This variant handles a plain xref table.  The xref keyword
        has already been consumed by the caller.  Offsets of in-use
        ('n') entries with a non-zero offset are recorded in
        source.obj_offsets; entries already present are kept.

        The table is first read token by token up to and including
        the trailer keyword.  If that fails for any reason, a second,
        line-based pass over the raw data is attempted; if that
        fails as well, source.exception('Invalid table format') is
        called with source.floc reset to the start of the table.

        Only Exception subclasses trigger the fallbacks, so
        KeyboardInterrupt and SystemExit propagate to the caller.
    '''
    setdefault = source.obj_offsets.setdefault
    next = source.next
    # plain xref table
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                return
            # Subsection header: first object number, entry count.
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                elif inuse != 'f':
                    raise ValueError
    except Exception:
        # Deliberate best effort: fall through to the line-based pass.
        pass
    try:
        # Table formatted incorrectly.
        # See if we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                # Subsection header; only the start number is used.
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' %
                          repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        # Position after the trailer keyword, as the first pass does.
        source.floc = end
        next()
    except Exception:
        source.floc = start
        source.exception('Invalid table format')
def parsexref(self, source):
    ''' Parse (one of) the cross-reference file section(s)

        Looks at the first token to decide between the two formats
        and returns a tuple (trailer dictionary, is_stream):
        an all-digit token means a cross-reference stream, the
        keyword xref means a plain table followed by a trailer
        dictionary.  Anything else (including running out of
        tokens) is reported through source.exception().
    '''
    try:
        first = source.next()
    except StopIteration:
        first = ''

    if first.isdigit():
        return self.parse_xref_stream(source), True

    if first != 'xref':
        source.exception('Expected "xref" keyword or xref stream object')
        return None

    self.parse_xref_table(source)
    if source.next() != '<<':
        source.exception('Expected "<<" starting catalog')
    return self.readdict(source), False
def readpages(self, node):
    ''' Walk the page tree starting at node and return the list of
        /Page dictionaries in document order.

        A /Catalog node is followed through its /Pages entry and a
        /Pages node through its /Kids.  Nodes of any other type are
        logged and skipped.  If the tree cannot be traversed
        (AttributeError or TypeError), the error is logged and an
        empty list is returned.
    '''
    page_type = PdfName.Page
    pages_type = PdfName.Pages
    catalog_type = PdfName.Catalog
    type_key = PdfName.Type
    kids_key = PdfName.Kids

    try:
        found = []
        todo = [node]
        while todo:
            current = todo.pop()
            kind = current[type_key]
            if kind == page_type:
                found.append(current)
            elif kind == pages_type:
                # Reversed so that popping from the end of the
                # stack visits the kids in their original order.
                todo.extend(reversed(current[kids_key]))
            elif kind == catalog_type:
                todo.append(current[pages_type])
            else:
                log.error('Expected /Page or /Pages dictionary, got %s' %
                          repr(current))
        return found
    except (AttributeError, TypeError) as s:
        log.error('Invalid page tree: %s' % s)
        return []
def _parse_encrypt_info(self, source, password, trailer):
    """Check the password and set up the crypt filters.

    A key is derived from password and trailer; a warning is issued
    through source when it does not validate as the user password.
    Depending on the /V entry of the Encrypt dictionary:

    * 1 or 2: one RC4 filter is used for both streams and strings;
    * 4: the filters listed under /CF are added to
      self.crypt_filters, and /StmF and /StrF select the default
      stream and string filters;
    * anything else: a warning is issued.
    """
    # Create and check password key
    key = crypt.create_key(password, trailer)
    if not crypt.check_user_password(key, trailer):
        source.warning('User password does not validate')

    private = self.private
    filters = self.crypt_filters
    encrypt = trailer.Encrypt
    version = int(encrypt.V or 0)

    if version in (1, 2):
        rc4 = crypt.RC4CryptFilter(key)
        private.stream_crypt_filter = rc4
        private.string_crypt_filter = rc4
        return

    if version != 4:
        source.warning(
            'Unsupported Encrypt version: {}'.format(version))
        return

    # Version 4: named crypt filters
    if PdfName.CF in encrypt:
        for name, params in iteritems(encrypt.CF):
            if name == PdfName.Identity:
                continue

            method = params.CFM
            if method == PdfName.AESV2:
                filters[name] = crypt.AESCryptFilter(key)
            elif method == PdfName.V2:
                filters[name] = crypt.RC4CryptFilter(key)
            else:
                source.warning(
                    'Unsupported crypt filter: {}, {}'.format(
                        name, method))

    # Read default stream filter
    if PdfName.StmF in encrypt:
        stream_name = encrypt.StmF
        if stream_name in filters:
            private.stream_crypt_filter = filters[stream_name]
        else:
            source.warning(
                'Invalid crypt filter name in /StmF:'
                ' {}'.format(stream_name))

    # Read default string filter
    if PdfName.StrF in encrypt:
        string_name = encrypt.StrF
        if string_name in filters:
            private.string_crypt_filter = filters[string_name]
        else:
            source.warning(
                'Invalid crypt filter name in /StrF:'
                ' {}'.format(string_name))
def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True, verbose=True):
    ''' Read a PDF document into memory and parse its top-level
        structure.

        fname      -- file name, or an object with a read() method;
                      must not be combined with fdata.
        fdata      -- the raw file contents, when fname is not given.
        decompress -- if true, call self.uncompress() after parsing.
        decrypt    -- if true and the trailer has an Encrypt entry,
                      set up crypt filters and decrypt all objects.
        password   -- password passed to the encryption setup.
        disable_gc -- if true (and the garbage collector is
                      enabled), turn it off while parsing and back
                      on afterwards.
        verbose    -- stored as self.private.verbose.

        Raises PdfParseError when the file cannot be read, has no
        PDF header or EOF marker, or when decryption is requested
        but crypt.HAS_CRYPTO is false.
    '''
    self.private.verbose = verbose

    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    if disable_gc:
        gc.disable()

    try:
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The with block closes the file even when
                    # read() fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)

        assert fdata is not None
        fdata = convert_load(fdata)

        header_loc = 0
        if not fdata.startswith('%PDF-'):
            header_loc = fdata.find('%PDF-')
            if header_loc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # The three version characters follow the 5-character
        # '%PDF-' marker, wherever the marker was found.
        self.private.version = fdata[header_loc + 5:header_loc + 8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        # Keep the 4 marker characters plus up to 2 trailing
        # characters; everything after that is treated as junk.
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Tokens that need a handler instead of being taken as values.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source

        # Find all the xref tables/streams, and
        # then deal with them backwards.
        xref_list = []
        while 1:
            source.obj_offsets = {}
            trailer, is_stream = self.parsexref(source)
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref' and not xref_list:
                    source.warning('Expected "startxref" '
                                   'at end of xref table')
                break
            xref_list.append((source.obj_offsets, trailer, is_stream))
            source.floc = int(prev)

        # Handle document encryption
        private.crypt_filters = None
        if decrypt and PdfName.Encrypt in trailer:
            identity_filter = crypt.IdentityCryptFilter()
            crypt_filters = {
                PdfName.Identity: identity_filter
            }
            private.crypt_filters = crypt_filters
            private.stream_crypt_filter = identity_filter
            private.string_crypt_filter = identity_filter

            if not crypt.HAS_CRYPTO:
                raise PdfParseError(
                    'Install PyCrypto to enable encryption support')

            self._parse_encrypt_info(source, password, trailer)

        if is_stream:
            self.load_stream_objects(trailer.object_streams)

        # Apply the remaining sections in reverse order of discovery,
        # so entries from sections found earlier override the others.
        while xref_list:
            later_offsets, later_trailer, is_stream = xref_list.pop()
            source.obj_offsets.update(later_offsets)
            if is_stream:
                trailer.update(later_trailer)
                self.load_stream_objects(later_trailer.object_streams)
            else:
                trailer = later_trailer

        trailer.Prev = None

        if (trailer.Version and
                float(trailer.Version) > float(self.version)):
            self.private.version = trailer.Version

        if decrypt:
            self.decrypt_all()
            trailer.Encrypt = None

        if is_stream:
            # Copy only these entries from a cross-reference stream
            # dictionary.
            self.Root = trailer.Root
            self.Info = trailer.Info
            self.ID = trailer.ID
            self.Size = trailer.Size
            self.Encrypt = trailer.Encrypt
        else:
            self.update(trailer)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()
689 # For compatibility with pyPdf
def getPage(self, pagenum):
    ''' Return self.pages[pagenum].  Kept for compatibility with
        pyPdf.
    '''
    return self.pages[pagenum]