Coverage for pdfrw/pdfrw/pdfwriter.py: 68%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

217 statements  

1# A part of pdfrw (https://github.com/pmaupin/pdfrw) 

2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 

3# MIT license -- See LICENSE.txt for details 

4 

5''' 

6The PdfWriter class writes an entire PDF file out to disk. 

7 

8The writing process is not at all optimized or organized. 

9 

10An instance of the PdfWriter class has two methods: 

11 addpage(page) 

12and 

13 write(fname) 

14 

15addpage() assumes that the pages are part of a valid 

16tree/forest of PDF objects. 

17''' 

18import gc 

19 

20from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict, 

21 PdfObject, PdfString) 

22from .compress import compress as do_compress 

23from .errors import PdfOutputError, log 

24from .py23_diffs import iteritems, convert_store 

25 

# Shared placeholder that is written as the PDF keyword 'null'.
# FormatObjects substitutes it for a suppressed object when no
# replacement object can be found for it.
NullObject = PdfObject('null')
NullObject.Type = 'Null object'
NullObject.indirect = True

29 

30 

def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=(type(u''), type(b'')), encode=PdfString.encode):
    ''' Default formatter for values that are not PDF objects.

        This function may be replaced by the user for specialized
        formatting requirements.

        Text and byte strings are handed to ``encode``.  Floats are
        rendered in fixed-point notation with nine decimals, after
        which trailing zeros and a dangling decimal point are removed.
        Every other value is converted with ``str``.

        The keyword parameters after ``obj`` only bind helpers to
        local names.
    '''
    if isinstance(obj, basestring):
        return encode(obj)

    if not isinstance(obj, float):
        return str(obj)

    # PDFs don't handle exponent notation, so format as fixed-point
    # first and trim the padding afterwards.
    fixed = '%.9f' % obj
    without_zeros = fixed.rstrip('0')
    return without_zeros.rstrip('.')

45 

46 

def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                  user_fmt=user_fmt, do_compress=do_compress,
                  convert_store=convert_store, iteritems=iteritems,
                  id=id, isinstance=isinstance, getattr=getattr, len=len,
                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
    ''' FormatObjects performs the actual formatting and disk write.
        Should be a class, was a class, turned into nested functions
        for performance (to reduce attribute lookups).

        Parameters:
            f -- object with a write method.  Every chunk of output
                 is passed through convert_store and then to f.write.
            trailer -- trailer dictionary.  trailer.Root and
                       trailer.Root.Pages are read, and trailer.Size
                       is overwritten once the object count is known.
            version -- text inserted into the '%PDF-' header line.
            compress -- when true, do_compress is called on every
                        PdfDict with a stream before it is formatted.
            killobj -- mapping of id(obj) -> (obj, new_obj) describing
                       objects that must not be written.  A reference
                       to such an object is redirected to new_obj or,
                       when new_obj is None, to the new catalog, pages
                       or trailer object chosen by obj.Type (NullObject
                       if none matches).  May be empty or omitted.
            user_fmt -- formatter for values that are neither
                        containers nor carry an 'indirect' attribute.

        The remaining keyword parameters only bind builtins and
        module-level names to local names.
    '''

    # The default for killobj is an empty tuple, but iteritems() is
    # used on it below and expects a mapping (NOTE(review): confirm
    # against py23_diffs).  Treat every empty value as "nothing to swap".
    if not killobj:
        killobj = {}

    def f_write(s):
        f.write(convert_store(s))

    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.

            Returns the text to embed in the parent: the formatted
            object itself for a direct object, or an 'N 0 R'
            reference for an indirect one.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            # 'visited' holds the direct objects currently being
            # formatted further up the call chain.  Meeting one of them
            # again means the object contains itself, so a copy is
            # formatted instead.
            if objid in visited:
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            swapped = swapobj(objid)
            if swapped is not None:
                # This object is suppressed; continue with its
                # replacement, which may already have a number.
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    # Remember the redirection so the next reference
                    # to the old object is resolved immediately.
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            # Reserve the slot now; the body is formatted later by
            # format_deferred() to keep the recursion shallow.
            objnum = len(objlist) + 1
            objlist_append(None)
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        # Format array data into semi-readable ASCII
        if sum(len(x) for x in myarray) <= 70:
            return formatter % space_join(myarray)
        return format_big(myarray, formatter)

    def format_big(myarray, formatter):
        # Break a long list of tokens into several lines.  The huge
        # initial count forces the first token to open the first line.
        bigarray = []
        count = 1000000
        for x in myarray:
            lenx = len(x) + 1
            count += lenx
            if count > 71:
                subarray = []
                bigarray.append(subarray)
                count = lenx
            subarray.append(x)
        return formatter % lf_join([space_join(x) for x in bigarray])

    def format_obj(obj):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.
        '''
        while True:
            if isinstance(obj, (list, dict, tuple)):
                if isinstance(obj, PdfArray):
                    myarray = [add(x) for x in obj]
                    return format_array(myarray, '[%s]')
                elif isinstance(obj, PdfDict):
                    if compress and obj.stream:
                        do_compress([obj])
                    # Prefer a key's 'encoded' form when it has one,
                    # and emit the entries in sorted order.
                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
                                   for (x, y) in obj.iteritems())
                    myarray = []
                    for key, value in pairs:
                        myarray.append(key)
                        myarray.append(add(value))
                    result = format_array(myarray, '<<%s>>')
                    stream = obj.stream
                    if stream is not None:
                        result = ('%s\nstream\n%s\nendstream' %
                                  (result, stream))
                    return result
                # Plain list/tuple/dict: wrap it in the matching Pdf
                # container and go around the loop again.
                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                continue

            # We assume that an object with an indirect
            # attribute knows how to represent itself to us.
            if hasattr(obj, 'indirect'):
                return str(getattr(obj, 'encoded', None) or obj)
            return user_fmt(obj)

    def format_deferred():
        # Formatting one object may queue further ones through add(),
        # so keep going until the queue is empty.
        while deferred:
            index, obj = deferred.pop()
            objlist[index] = format_obj(obj)

    indirect_dict = {}      # id(obj) -> object number
    indirect_dict_get = indirect_dict.get
    objlist = []            # formatted object bodies, index = number - 1
    objlist_append = objlist.append
    visited = set()         # ids of direct objects being formatted
    visiting = visited.add
    leaving = visited.remove
    space_join = ' '.join
    lf_join = '\n '.join

    deferred = []           # (index into objlist, obj) still to format

    # Don't reference old catalog or pages objects --
    # swap references to new ones.
    type_remap = {PdfName.Catalog: trailer.Root,
                  PdfName.Pages: trailer.Root.Pages, None: trailer}.get
    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
               for objid, (obj, new_obj) in iteritems(killobj)]
    # Anything without a replacement is written as the null object.
    swapobj = dict((objid, NullObject if obj is None else obj)
                   for objid, obj in swapobj).get

    # Internal sanity check: every suppressed object has a replacement.
    for objid in killobj:
        assert swapobj(objid) is not None

    # The first format of trailer gets all the information,
    # but we throw away the actual trailer formatting.
    format_obj(trailer)
    # Keep formatting until we're done.
    # (Used to recurse inside format_obj for this, but
    # hit system limit.)
    format_deferred()
    # Now we know the size, so we update the trailer dict
    # and get the formatted data.
    trailer.Size = PdfObject(len(objlist) + 1)
    trailer = format_obj(trailer)

    # Now we have all the pieces to write out to the file.
    # Keep careful track of the counts while we do it so
    # we can correctly build the cross-reference.

    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
    f_write(header)
    offset = len(header)
    # Entry 0 of the cross-reference table is the free-list head.
    offsets = [(0, 65535, 'f')]
    offsets_append = offsets.append

    for i, x in enumerate(objlist):
        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
        offsets_append((offset, 0, 'n'))
        offset += len(objstr)
        f_write(objstr)

    # After the loop, offset is the position of the xref section,
    # which is what startxref has to report.
    f_write('xref\n0 %s\n' % len(offsets))
    for x in offsets:
        f_write('%010d %05d %s\r\n' % x)
    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))

222 

223 

class PdfWriter(object):
    # Collects pages with addpage()/addpages() and writes them out as
    # a PDF file with write().
    #
    # The documentation for this class lives in comments rather than in
    # a docstring, and the class attributes carry no annotations:
    # 'replaceable' at the end of the class body is computed from
    # vars(), and a docstring or an annotation would add a name
    # ('__doc__', '__annotations__') to that set.

    # Cached trailer; None means it is rebuilt on next access.
    _trailer = None
    # When true, make_canonical() runs before the trailer is built.
    canonicalize = False
    # Output file name or file-like object given to the constructor.
    fname = None

    def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
        """
            Parameters:
                fname -- Output file name, or file-like binary object
                         with a write method
                version -- PDF version to target.  Currently only 1.3
                           supported.
                compress -- True to do compression on output.  Currently
                            compresses stream objects.
                kwargs -- extra attributes to set on the instance.
                          Each name must be a member of
                          PdfWriter.replaceable.

            Raises:
                ValueError -- a keyword argument names something that
                              is not in PdfWriter.replaceable.
        """

        # Legacy support:  fname is new, was added in front.
        # A first argument that converts to a float is taken to be a
        # version passed in the old (version, compress) order, so the
        # arguments are shifted back by one position.
        if fname is not None:
            try:
                float(fname)
            except (ValueError, TypeError):
                pass
            else:
                if version != '1.3':
                    # '==' on purpose: it accepts 0 as well as False.
                    assert compress == False  # noqa: E712
                    compress = version
                version = fname
                fname = None

        self.fname = fname
        self.version = version
        self.compress = compress

        if kwargs:
            for name, value in iteritems(kwargs):
                if name not in self.replaceable:
                    raise ValueError("Cannot set attribute %s "
                                     "on PdfWriter instance" % name)
                setattr(self, name, value)

        self.pagearray = PdfArray()
        self.killobj = {}

    def addpage(self, page):
        ''' Append one page to the output and return self.

            A new IndirectPdfDict is built from the page, with
            Resources, MediaBox, CropBox and Rotate taken from
            page.inheritable, and added to self.pagearray.

            Raises PdfOutputError if page.Type is not PdfName.Page.
        '''
        # The cached trailer no longer matches the page list.
        self._trailer = None
        if page.Type != PdfName.Page:
            raise PdfOutputError('Bad /Type: Expected %s, found %s'
                                 % (PdfName.Page, page.Type))
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources=inheritable.Resources,
                MediaBox=inheritable.MediaBox,
                CropBox=inheritable.CropBox,
                Rotate=inheritable.Rotate,
            )
        )

        # Add parents in the hierarchy to objects we
        # don't want to output.  The page itself maps to its new
        # copy; its ancestors map to None.  Stop at the first
        # ancestor that has already been recorded.
        killobj = self.killobj
        obj, new_obj = page, self.pagearray[-1]
        while obj is not None:
            objid = id(obj)
            if objid in killobj:
                break
            killobj[objid] = obj, new_obj
            obj = obj.Parent
            new_obj = None
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Call addpage() for every page in pagelist; return self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Return the trailer, building and caching it if needed.

            The built trailer holds a catalog whose page tree has
            self.pagearray as its Kids.  Every page gets that page
            tree as Parent and is marked indirect.
        '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        if self.canonicalize:
            self.make_canonical()

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root=IndirectPdfDict(
                Type=PdfName.Catalog,
                Pages=IndirectPdfDict(
                    Type=PdfName.Pages,
                    Count=PdfObject(len(self.pagearray)),
                    Kids=self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
            page.indirect = True
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        ''' Store a caller-supplied trailer in place of the built one. '''
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
              disable_gc=True):
        ''' Format the document and write it out.

            Parameters:
                fname -- output file name or file-like object with a
                         write method.  Must be given either here or
                         to the constructor, but not both.
                trailer -- trailer to write; self.trailer is used when
                           this is omitted or false.
                user_fmt -- formatter passed on to FormatObjects.
                disable_gc -- when true, the garbage collector is
                              switched off while formatting and put
                              back into its previous state afterwards.

            Raises:
                PdfOutputError -- fname was given both here and to the
                                  constructor, or in neither place.
        '''

        trailer = trailer or self.trailer

        # Support fname for legacy applications
        if (fname is not None) == (self.fname is not None):
            raise PdfOutputError(
                "PdfWriter fname must be specified exactly once")

        # Compare against None instead of relying on truthiness, so a
        # file-like object that happens to be falsy is still used.
        if fname is None:
            fname = self.fname

        # Dump the data.  We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        f = fname if preexisting else open(fname, 'wb')

        # Only switch the collector off if it is running right now, and
        # only switch it back on in that case.  A caller that disabled
        # it beforehand then finds it still disabled afterwards.
        restore_gc = disable_gc and gc.isenabled()
        if restore_gc:
            gc.disable()

        try:
            FormatObjects(f, trailer, self.version, self.compress,
                          self.killobj, user_fmt=user_fmt)
        finally:
            try:
                # A file we opened ourselves is ours to close.
                if not preexisting:
                    f.close()
            finally:
                # Restore the collector even if close() failed.
                if restore_gc:
                    gc.enable()

    def make_canonical(self):
        ''' Canonicalizes a PDF.  Assumes everything
            is a Pdf object already.

            Walks everything reachable from self.pagearray, marking
            each PdfArray and PdfDict as indirect and every other
            object as direct.
        '''
        visited = set()
        workitems = list(self.pagearray)
        while workitems:
            obj = workitems.pop()
            objid = id(obj)
            # Objects are tracked by identity so that shared or
            # cyclic references are processed only once.
            if objid in visited:
                continue
            visited.add(objid)
            obj.indirect = False
            if isinstance(obj, (PdfArray, PdfDict)):
                obj.indirect = True
                if isinstance(obj, PdfArray):
                    workitems += obj
                else:
                    workitems += obj.values()

    # Names accepted as extra keyword arguments by __init__: everything
    # defined in the class body above this line.
    replaceable = set(vars())