Coverage for pdfrw/pdfrw/pdfreader.py: 63%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

476 statements  

1# A part of pdfrw (https://github.com/pmaupin/pdfrw) 

2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 

3# Copyright (C) 2012-2015 Nerijus Mika 

4# MIT license -- See LICENSE.txt for details 

5 

6''' 

7The PdfReader class reads an entire PDF file into memory and 

8parses the top-level container objects. (It does not parse 

9into streams.) The object subclasses PdfDict, and the 

10document pages are stored in a list in the pages attribute 

11of the object. 

12''' 

13import gc 

14import binascii 

15import collections 

16import itertools 

17 

18from .errors import PdfParseError, log 

19from .tokens import PdfTokens 

20from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect 

21from .uncompress import uncompress 

22from . import crypt 

23from .py23_diffs import convert_load, convert_store, iteritems 

24 

25 

26class PdfReader(PdfDict): 

27 

def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
    ''' Fetch the object registered under (objnum, gennum).

        Both numbers are converted with int() to build the lookup
        key.  When nothing is registered yet, a PdfIndirect
        placeholder is created, stored in self.indirect_objects,
        recorded in self.deferred_objects and given
        self.loadindirect as its _loader.
    '''
    ref = int(objnum), int(gennum)
    known = self.indirect_objects
    found = known.get(ref)
    if found is not None:
        return found

    placeholder = PdfIndirect(ref)
    known[ref] = placeholder
    self.deferred_objects.add(ref)
    placeholder._loader = self.loadindirect
    return placeholder

39 

def readarray(self, source, PdfArray=PdfArray):
    ''' Called after a [ token: gather items up to the closing ]
        and return them wrapped in a PdfArray.
    '''
    lookup = self.special.get
    items = []

    for token in source:
        if token == ']':
            break
        if token in ']R':
            # Indirect reference: the two items collected just
            # before it are the object number and the generation.
            gen = items.pop()
            token = self.findindirect(items.pop(), gen)
        else:
            # Tokens listed in self.special are replaced by
            # whatever their handler produces from the source.
            handler = lookup(token)
            if handler is not None:
                token = handler(source)
        items.append(token)
    return PdfArray(items)

60 

def readdict(self, source, PdfDict=PdfDict):
    ''' Called after a << token: read name/value pairs up to
        the closing >> and return them in a new PdfDict.

        A key that does not start with '/' is reported through
        source.error and skipped.  Two consecutive digit tokens
        must be followed by R and become an indirect reference;
        otherwise the error is reported and the pair is dropped.
    '''
    lookup = self.special.get
    mapping = PdfDict()
    advance = source.next

    tok = advance()
    while tok != '>>':
        if not tok.startswith('/'):
            source.error('Expected PDF /name object')
            tok = advance()
            continue

        name = tok
        value = advance()
        handler = lookup(value)
        if handler is None:
            # Plain token: look ahead to see whether this is
            # really "objnum gennum R".
            tok = advance()
            if value.isdigit() and tok.isdigit():
                follower = advance()
                if follower != 'R':
                    source.error('Expected "R" following two integers')
                    tok = follower
                    continue
                value = self.findindirect(value, tok)
                tok = advance()
        else:
            value = handler(source)
            tok = advance()
        mapping[name] = value
    return mapping

92 

def empty_obj(self, source, PdfObject=PdfObject):
    ''' Handler for an object that has no content.

        Rewinds the token source to the start of the current
        token so that the caller reads the endobj again.
        Nothing is returned.
    '''
    rewind_to = source.tokstart
    source.floc = rewind_to

98 

def badtoken(self, source):
    ''' Handler for a delimiter token that is not valid at the
        current position: reports it through source.exception.
    '''
    message = 'Unexpected delimiter'
    source.exception(message)

103 

def findstream(self, obj, tok, source, len=len):
    ''' Return the offset at which the content stream data
        begins, given that tok (the stream keyword) is the
        token most recently read from source.

        The stream itself is not read here; its length may not
        be known yet because Length can be an indirect object.

        An optional \\r and an optional \\n after the keyword
        are skipped.  A lone \\r produces a warning, and a
        keyword followed by neither produces an error.
    '''
    data = source.fdata
    pos = source.tokstart + len(tok)

    saw_cr = data[pos] == '\r'
    if saw_cr:
        pos += 1

    saw_lf = data[pos] == '\n'
    if saw_lf:
        pos += 1
    elif saw_cr:
        source.warning(r"stream keyword terminated by \r without \n")
    else:
        source.error(r'stream keyword not followed by \n')
    return pos

127 

def readstream(self, obj, startstream, source, exact_required=False,
               streamending='endstream endobj'.split(), int=int):
    ''' Attach the stream data starting at startstream to obj.

        The data is first sliced using obj.Length.  If the two
        tokens after that slice are endstream and endobj, the
        job is done.  Otherwise, when exact_required is true,
        source.exception is called; after that the code searches
        for the endstream keyword and either adjusts the stream
        to what it finds or reports the problem through
        source.error / source.warning.
    '''
    data = source.fdata
    declared = int(obj.Length)
    expected_end = startstream + declared
    source.floc = expected_end
    trailing = source.multiple(2)
    obj._stream = data[startstream:expected_end]
    if trailing == streamending:
        return

    if exact_required:
        source.exception('Expected endstream endobj')

    # The length attribute does not match the distance between
    # the stream and endstream keywords.

    # TODO: Extract the search limit from the dictionary of
    #       object offsets and use rfind instead of find.
    limit = len(data) - 20
    endstream = data.find('endstream', startstream, limit)
    source.floc = startstream
    if endstream < 0:
        source.error('Could not find endstream')
        return

    room = endstream - startstream
    if (declared == room + 1 and
            data[startstream - 2:startstream] == '\r\n'):
        source.warning(r"stream keyword terminated by \r without \n")
        obj._stream = data[startstream - 1:expected_end - 1]
        return

    source.floc = endstream
    too_big = declared > room
    if too_big or data[expected_end:endstream].rstrip():
        if too_big:
            source.error('stream /Length attribute (%d) appears to '
                         'be too big (size %d) -- adjusting',
                         declared, room)
        else:
            source.error('stream /Length attribute (%d) appears to '
                         'be too small (size %d) -- adjusting',
                         declared, room)
        obj.stream = data[startstream:endstream]
        return

    endobj = data.find('endobj', endstream, limit)
    if endobj < 0:
        problem = 'Could not find endobj after endstream'
    elif data[endstream:endobj].rstrip() != 'endstream':
        problem = 'Unexpected data between endstream and endobj'
    else:
        problem = 'Illegal endstream/endobj combination'
    source.error(problem)

179 

def loadindirect(self, key, PdfDict=PdfDict,
                 isinstance=isinstance):
    ''' Load the indirect object identified by key, a
        (objnum, gennum) tuple, and return it.

        If the entry in self.indirect_objects is no longer a
        PdfIndirect placeholder it is returned unchanged.
        Returns None (after a warning) when no offset is known
        for the key or when the object header cannot be located.

        An empty object ("N G obj endobj") is stored and returned
        as None.  Its value comes from the handler registered
        for the endobj token, which returns nothing, so it cannot
        be tagged with an indirect attribute.
    '''
    current = self.indirect_objects.get(key)
    if not isinstance(current, PdfIndirect):
        return current
    source = self.source
    offset = int(source.obj_offsets.get(key, '0'))
    if not offset:
        source.warning("Did not find PDF object %s", key)
        return None

    # Read the object header and validate it
    objnum, gennum = key
    source.floc = offset
    objid = source.multiple(3)
    header_ok = (len(objid) == 3 and
                 objid[0].isdigit() and int(objid[0]) == objnum and
                 objid[1].isdigit() and int(objid[1]) == gennum and
                 objid[2] == 'obj')
    if not header_ok:
        source.floc = offset
        source.next()
        objheader = '%d %d obj' % (objnum, gennum)
        fdata = source.fdata
        # Search the whole file for the header at the start of
        # a line (after \n first, then after \r).
        offset2 = fdata.find('\n' + objheader) + 1
        if not offset2:
            offset2 = fdata.find('\r' + objheader) + 1
        # Give up if the header is missing, or if a second
        # occurrence with the same preceding character exists.
        if (not offset2 or
                fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
            source.warning("Expected indirect object '%s'", objheader)
            return None
        source.warning("Indirect object %s found at incorrect "
                       "offset %d (expected offset %d)",
                       objheader, offset2, offset)
        source.floc = offset2 + len(objheader)

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    func = self.special.get(obj)
    if func is not None:
        obj = func(source)

    self.indirect_objects[key] = obj
    self.deferred_objects.remove(key)

    # Mark the object as indirect, and just return it if it is
    # a simple object.  A None value (empty object) has no
    # attributes to set, so it is left untouched.
    if obj is not None:
        obj.indirect = key
    tok = source.next()
    if tok == 'endobj':
        return obj

    # Should be a stream.  Either that or it's broken.
    isdict = isinstance(obj, PdfDict)
    if isdict and tok == 'stream':
        self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj

    # Leaving out the space before endobj is a common generator
    # mistake (it only shows when the object does not end with
    # a delimiter), so recover from it when possible.
    if isinstance(obj, PdfObject) and obj.endswith('endobj'):
        source.error('No space or delimiter before endobj')
        obj = PdfObject(obj[:-6])
    else:
        source.error("Expected 'endobj'%s token",
                     " or 'stream'" if isdict else '')
        obj = PdfObject('')

    obj.indirect = key
    self.indirect_objects[key] = obj
    return obj

257 

def read_all(self):
    ''' Load every deferred indirect object.

        Loading an object can register further deferred keys,
        so passes are repeated until a pass finds no key that
        has not already been attempted.
    '''
    pending = self.deferred_objects
    attempted = set()
    fresh = pending - attempted
    while fresh:
        attempted.update(pending)
        for ref in fresh:
            self.loadindirect(ref)
        fresh = pending - attempted

268 

def decrypt_all(self):
    ''' Load all deferred objects, then pass every loaded object
        to crypt.decrypt_objects when crypt filters are set up.
    '''
    self.read_all()

    filters = self.crypt_filters
    if filters is None:
        return
    crypt.decrypt_objects(
        self.indirect_objects.values(), self.stream_crypt_filter,
        filters)

276 

def uncompress(self):
    ''' Load all deferred objects, then hand every loaded object
        to the uncompress function imported from the sibling
        uncompress module.
    '''
    self.read_all()

    loaded = self.indirect_objects.values()
    uncompress(loaded)

281 

def load_stream_objects(self, object_streams):
    ''' Read the objects stored inside object streams.

        object_streams is iterated for the object numbers of
        the streams (generation 0).  Each stream is decrypted
        when crypt filters are set, uncompressed, and every
        object found in it is registered in
        self.indirect_objects under (num, 0).
    '''
    # read object streams
    containers = []
    for num in object_streams:
        container = self.findindirect(num, 0).real_value()
        assert container.Type == '/ObjStm'
        containers.append(container)

    if not containers:
        return

    # Decrypt
    if self.crypt_filters is not None:
        crypt.decrypt_objects(
            containers, self.stream_crypt_filter, self.crypt_filters)

    # Decompress
    uncompress(containers)

    # read objects from stream
    for container in containers:
        objsource = PdfTokens(container.stream, 0, False)
        advance = objsource.next
        first = int(container.First)

        # The stream starts with (object number, relative
        # offset) pairs that run up to position First.
        directory = []
        while objsource.floc < first:
            directory.append((int(advance()), first + int(advance())))

        for num, offset in directory:
            # Read the object, and call special code if it
            # starts an array or dictionary
            objsource.floc = offset
            sobj = advance()
            handler = self.special.get(sobj)
            if handler is not None:
                sobj = handler(objsource)

            key = (num, 0)
            self.indirect_objects[key] = sobj
            self.deferred_objects.discard(key)

            # Mark the object as indirect
            sobj.indirect = key

324 

def findxref(self, fdata):
    ''' Locate the cross reference section named at the end of
        the file.

        Returns a tuple of the position of the last startxref
        keyword and a PdfTokens source positioned at the table
        location that follows the keyword.  Raises PdfParseError
        when the keyword is missing.
    '''
    keyword_loc = fdata.rfind('startxref')
    if keyword_loc < 0:
        raise PdfParseError('Did not find "startxref" at end of file')

    source = PdfTokens(fdata, keyword_loc, False, self.verbose)
    keyword = source.next()
    assert keyword == 'startxref'  # (We just checked this...)

    tableloc = source.next_default()
    if not tableloc.isdigit():
        source.exception('Expected table location')

    marker = source.next_default().rstrip().lstrip('%')
    if marker != 'EOF':
        source.exception('Expected %%EOF')

    table_source = PdfTokens(fdata, int(tableloc), True, self.verbose)
    return keyword_loc, table_source

340 

def parse_xref_stream(self, source, int=int, range=range,
                      enumerate=enumerate, islice=itertools.islice,
                      defaultdict=collections.defaultdict,
                      hexlify=binascii.hexlify):
    ''' Parse (one of) the cross-reference stream section(s).

        The caller has already consumed the object number.  The
        stream dictionary is read, its data is extracted and
        uncompressed, and the entries are decoded using the
        field widths in W.  Entries of type 1 (or with no type
        field) and a non-zero second field are recorded in
        source.obj_offsets.  Entries of type 2 are grouped by
        their second field in a defaultdict, which is stored as
        object_streams on the dictionary's private attribute.
        Returns the stream dictionary.
    '''

    def fields(raw, widths):
        # Endless generator of field values: cycles through the
        # widths, decoding each slice as a big-endian integer
        # and yielding None for a zero-width field.
        start = 0
        for width in itertools.cycle(widths):
            stop = start + width
            if width:
                yield int(hexlify(raw[start:stop]), 16)
            else:
                yield None
            start = stop

    register = source.obj_offsets.setdefault
    advance = source.next

    # check for xref stream object
    header = source.multiple(3)
    header_ok = (len(header) == 3 and
                 header[0].isdigit() and
                 header[1] == 'obj' and
                 header[2] == '<<')
    if not header_ok:
        source.exception('Expected xref stream start')

    obj = self.readdict(source)
    if obj.Type != PdfName.XRef:
        source.exception('Expected dict type of /XRef')

    tok = advance()
    self.readstream(obj, self.findstream(obj, tok, source), source, True)
    raw_before = obj.stream
    if not uncompress([obj], True):
        source.exception('Could not decompress Xref stream')
    stream = obj.stream
    # Fix for issue #76 -- goofy compressed xref stream
    # that is NOT ACTUALLY COMPRESSED
    if stream is raw_before:
        stream = convert_store(raw_before)

    flat = [int(x) for x in (obj.Index or PdfArray(['0', obj.Size]))]
    sections = zip(flat[0::2], flat[1::2])
    widths = [int(x) for x in obj.W]
    if len(widths) != 3:
        source.exception('Invalid entry size')

    object_streams = defaultdict(list)
    reader = fields(stream, widths)
    for objnum, count in sections:
        for _ in range(count):
            xtype, p1, p2 = islice(reader, 3)
            if xtype == 2:
                object_streams[p1].append((objnum, p2))
            elif xtype in (1, None):
                if p1:
                    register((objnum, p2 or 0), p1)
            objnum += 1

    obj.private.object_streams = object_streams
    return obj

397 

def parse_xref_table(self, source, int=int, range=range):
    ''' Parse (one of) the plain cross-reference table
        section(s), recording the offset of every in-use entry
        in source.obj_offsets via setdefault.

        The table is first read token by token up to the
        trailer keyword.  If anything goes wrong, a more
        lenient line-by-line pass over the text between the
        start position and the last trailer keyword is tried.
        If that fails too, source.exception is called with
        'Invalid table format'.

        Only Exception subclasses trigger the fallback or the
        final error, so KeyboardInterrupt and SystemExit
        propagate to the caller instead of being swallowed.
    '''
    setdefault = source.obj_offsets.setdefault
    next = source.next
    # plain xref table
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                return
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                elif inuse != 'f':
                    raise ValueError
    except Exception:
        # Deliberate best effort: any parse problem falls
        # through to the lenient line-based pass below.
        pass
    try:
        # Table formatted incorrectly.
        # See if we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                # Subsection header: first object number, count
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' %
                          repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        source.floc = end
        next()
    except Exception:
        source.floc = start
        source.exception('Invalid table format')

447 

def parsexref(self, source):
    ''' Parse (one of) the cross-reference file section(s).

        Returns (dictionary, True) when the section is an xref
        stream and (trailer dictionary, False) when it is a
        plain xref table.  Anything else is reported through
        source.exception.
    '''
    advance = source.next
    try:
        first = advance()
    except StopIteration:
        first = ''

    if first.isdigit():
        return self.parse_xref_stream(source), True

    if first != 'xref':
        source.exception('Expected "xref" keyword or xref stream object')
        return None

    self.parse_xref_table(source)
    if advance() != '<<':
        source.exception('Expected "<<" starting catalog')
    return self.readdict(source), False

466 

def readpages(self, node):
    ''' Walk the page tree below node and return the /Page
        dictionaries in document order.

        /Pages nodes contribute their /Kids, and a /Catalog
        node contributes its /Pages entry.  A node of any other
        type is logged and skipped.  An AttributeError or
        TypeError during the walk is logged and an empty list
        is returned.
    '''
    page_type = PdfName.Page
    pages_type = PdfName.Pages
    catalog_type = PdfName.Catalog
    type_key = PdfName.Type
    kids_key = PdfName.Kids

    try:
        pages = []
        todo = [node]
        while todo:
            node = todo.pop()
            kind = node[type_key]
            if kind == page_type:
                pages.append(node)
            elif kind == pages_type:
                # Reversed so that popping from the end visits
                # the kids in their original order.
                todo.extend(reversed(node[kids_key]))
            elif kind == catalog_type:
                todo.append(node[pages_type])
            else:
                log.error('Expected /Page or /Pages dictionary, got %s' %
                          repr(node))
    except (AttributeError, TypeError) as s:
        log.error('Invalid page tree: %s' % s)
        return []
    return pages

495 

def _parse_encrypt_info(self, source, password, trailer):
    """Check the password and initialize the crypt filters.

    A key is created from the password and the trailer; a warning
    is issued if it does not validate as the user password.
    Encrypt versions 1 and 2 install one RC4 filter for streams
    and strings.  Version 4 builds the filters named in /CF and
    then selects the defaults named by /StmF and /StrF.  Any other
    version only produces a warning.
    """
    # Create and check password key
    key = crypt.create_key(password, trailer)
    if not crypt.check_user_password(key, trailer):
        source.warning('User password does not validate')

    private = self.private
    crypt_filters = self.crypt_filters
    encrypt = trailer.Encrypt
    version = int(encrypt.V or 0)

    if version in (1, 2):
        rc4 = crypt.RC4CryptFilter(key)
        private.stream_crypt_filter = rc4
        private.string_crypt_filter = rc4
        return

    if version != 4:
        source.warning(
            'Unsupported Encrypt version: {}'.format(version))
        return

    # Create the named crypt filters
    if PdfName.CF in encrypt:
        for name, params in iteritems(encrypt.CF):
            if name == PdfName.Identity:
                continue

            cfm = params.CFM
            if cfm == PdfName.AESV2:
                crypt_filters[name] = crypt.AESCryptFilter(key)
            elif cfm == PdfName.V2:
                crypt_filters[name] = crypt.RC4CryptFilter(key)
            else:
                source.warning(
                    'Unsupported crypt filter: {}, {}'.format(
                        name, cfm))

    # Read default stream filter
    if PdfName.StmF in encrypt:
        name = encrypt.StmF
        if name in crypt_filters:
            private.stream_crypt_filter = crypt_filters[name]
        else:
            source.warning(
                'Invalid crypt filter name in /StmF: {}'.format(name))

    # Read default string filter
    if PdfName.StrF in encrypt:
        name = encrypt.StrF
        if name in crypt_filters:
            private.string_crypt_filter = crypt_filters[name]
        else:
            source.warning(
                'Invalid crypt filter name in /StrF: {}'.format(name))

550 

def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True, verbose=True):
    ''' Read a PDF and parse its top-level structure.

        fname      -- a path, or an object with a read() method;
                      must not be combined with fdata
        fdata      -- the file contents, when already in memory
        decompress -- uncompress all loaded objects when true
        decrypt    -- set up crypt filters and decrypt all
                      objects when the trailer has /Encrypt
        password   -- password used to create the decryption key
        disable_gc -- turn the garbage collector off while
                      parsing (it is re-enabled afterwards only
                      if it was enabled on entry)
        verbose    -- stored as private.verbose and passed to
                      the token sources

        Raises PdfParseError when the file cannot be read, has
        no PDF header or EOF mark, or needs crypto support that
        is not installed.
    '''
    self.private.verbose = verbose

    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    if disable_gc:
        gc.disable()

    try:
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the file even
                    # when read() fails.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)

        assert fdata is not None
        fdata = convert_load(fdata)

        header_loc = 0
        if not fdata.startswith('%PDF-'):
            header_loc = fdata.find('%PDF-')
            if header_loc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # The three version characters follow the header marker,
        # wherever the marker was actually found.
        self.private.version = fdata[header_loc + 5:header_loc + 8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        # Keep the 4-character mark plus up to two characters
        # after it; everything beyond that is treated as junk.
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Tokens that need a handler instead of being taken as
        # plain values.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        _, source = self.findxref(fdata)
        private.source = source

        # Find all the xref tables/streams, and
        # then deal with them backwards.
        xref_list = []
        while 1:
            source.obj_offsets = {}
            trailer, is_stream = self.parsexref(source)
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref' and not xref_list:
                    source.warning('Expected "startxref" '
                                   'at end of xref table')
                break
            xref_list.append((source.obj_offsets, trailer, is_stream))
            source.floc = int(prev)

        # Handle document encryption
        private.crypt_filters = None
        if decrypt and PdfName.Encrypt in trailer:
            identity_filter = crypt.IdentityCryptFilter()
            crypt_filters = {
                PdfName.Identity: identity_filter
            }
            private.crypt_filters = crypt_filters
            private.stream_crypt_filter = identity_filter
            private.string_crypt_filter = identity_filter

            if not crypt.HAS_CRYPTO:
                raise PdfParseError(
                    'Install PyCrypto to enable encryption support')

            self._parse_encrypt_info(source, password, trailer)

        if is_stream:
            self.load_stream_objects(trailer.object_streams)

        # Apply the later sections on top of the earliest one.
        while xref_list:
            later_offsets, later_trailer, is_stream = xref_list.pop()
            source.obj_offsets.update(later_offsets)
            if is_stream:
                trailer.update(later_trailer)
                self.load_stream_objects(later_trailer.object_streams)
            else:
                trailer = later_trailer

        trailer.Prev = None

        if (trailer.Version and
                float(trailer.Version) > float(self.version)):
            self.private.version = trailer.Version

        if decrypt:
            self.decrypt_all()
            trailer.Encrypt = None

        if is_stream:
            self.Root = trailer.Root
            self.Info = trailer.Info
            self.ID = trailer.ID
            self.Size = trailer.Size
            self.Encrypt = trailer.Encrypt
        else:
            self.update(trailer)

        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()

688 

689 # For compatibility with pyPdf 

def getPage(self, pagenum):
    ''' Return the entry at index pagenum of self.pages. '''
    pages = self.pages
    return pages[pagenum]