Coverage for pdfrw/pdfrw/pdfwriter.py: 12%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# A part of pdfrw (https://github.com/pmaupin/pdfrw)
2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
5'''
6The PdfWriter class writes an entire PDF file out to disk.
8The writing process is not at all optimized or organized.
10An instance of the PdfWriter class has two methods:
11 addpage(page)
12and
13 write(fname)
15addpage() assumes that the pages are part of a valid
16tree/forest of PDF objects.
17'''
18import gc
20from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict,
21 PdfObject, PdfString)
22from .compress import compress as do_compress
23from .errors import PdfOutputError, log
24from .py23_diffs import iteritems, convert_store
# Shared placeholder wrapping the PDF token 'null'.  FormatObjects swaps
# it in for any suppressed ("killed") object that has no replacement.
NullObject = PdfObject('null')
# A truthy ``indirect`` attribute makes FormatObjects' add() give the
# placeholder its own object number rather than formatting it inline.
NullObject.indirect = True
NullObject.Type = 'Null object'
def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=(type(u''), type(b'')), encode=PdfString.encode):
    ''' Default formatter for leaf values.  Callers may substitute
        their own function when they need specialized formatting.

        Text and byte strings go through ``encode``; floats are
        rendered in fixed-point notation; anything else is passed
        to ``str``.
    '''
    # Strings (text or bytes) are delegated to the encoder.
    if isinstance(obj, basestring):
        return encode(obj)

    # Everything that is not a float is simply stringified.
    if not isinstance(obj, float):
        return str(obj)

    # PDFs don't handle exponent notation: render with nine decimals,
    # then trim trailing zeros and a dangling decimal point.
    fixed = '%.9f' % obj
    trimmed = fixed.rstrip('0')
    return trimmed.rstrip('.')
def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                  user_fmt=user_fmt, do_compress=do_compress,
                  convert_store=convert_store, iteritems=iteritems,
                  id=id, isinstance=isinstance, getattr=getattr, len=len,
                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
    ''' FormatObjects performs the actual formatting and disk write.
        Should be a class, was a class, turned into nested functions
        for performance (to reduce attribute lookups).

        Parameters:
            f -- object with a write() method; every piece of output
                 is passed through convert_store() before being
                 written to it
            trailer -- trailer dictionary; trailer.Root and
                       trailer.Root.Pages are read, and trailer.Size
                       is assigned here
            version -- version string placed in the file header
            compress -- when true, do_compress() is called on every
                        dictionary whose stream is truthy
            killobj -- mapping of id(obj) -> (obj, new_obj) naming
                       objects that must not be written; references
                       to them are redirected to new_obj (or to a
                       substitute chosen from obj.Type when new_obj
                       is None)
            user_fmt -- formatter for values that are neither
                        containers nor carry an 'indirect' attribute

        The remaining keyword parameters only bind builtins and
        module-level names as locals.

        NOTE(review): the default killobj=() is handed to iteritems();
        confirm that iteritems accepts a tuple, otherwise the default
        is unusable and callers must always pass a dict.
    '''

    def f_write(s):
        # Single choke point for output: convert, then write.
        f.write(convert_store(s))

    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.

            Returns the text to embed in the parent: the formatted
            object itself for direct objects, or an 'N 0 R'
            reference for indirect ones.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            # 'visited' holds the ids of direct objects whose formatting
            # is currently in progress (added before format_obj, removed
            # after).  Meeting one again means it is nested inside
            # itself, so a fresh copy is formatted instead.
            if objid in visited:
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            # A killed object is replaced by its substitute before
            # an object number is assigned.
            swapped = swapobj(objid)
            if swapped is not None:
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    # Substitute already numbered: alias the old id to
                    # the same number and reference it.
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            # Object numbers are 1-based.  Reserve the slot now with a
            # None placeholder; the body is formatted later by
            # format_deferred().
            objnum = len(objlist) + 1
            objlist_append(None)
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        # Format array data into semi-readable ASCII
        # Short content (total item length <= 70) stays on one line;
        # anything longer is wrapped by format_big().
        if sum([len(x) for x in myarray]) <= 70:
            return formatter % space_join(myarray)
        return format_big(myarray, formatter)

    def format_big(myarray, formatter):
        # Wrap the items over several lines.  Each item costs its
        # length plus one; a new line is started whenever the running
        # cost would exceed 71.
        bigarray = []
        # Huge initial count forces the first item to open a line, so
        # 'subarray' is always bound before it is appended to.
        count = 1000000
        for x in myarray:
            lenx = len(x) + 1
            count += lenx
            if count > 71:
                subarray = []
                bigarray.append(subarray)
                count = lenx
            subarray.append(x)
        return formatter % lf_join([space_join(x) for x in bigarray])

    def format_obj(obj):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.
        '''
        while 1:
            if isinstance(obj, (list, dict, tuple)):
                if isinstance(obj, PdfArray):
                    myarray = [add(x) for x in obj]
                    return format_array(myarray, '[%s]')
                elif isinstance(obj, PdfDict):
                    if compress and obj.stream:
                        do_compress([obj])
                    # Sort by key, preferring a key's 'encoded'
                    # attribute when it has a truthy one.
                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
                                   for (x, y) in obj.iteritems())
                    myarray = []
                    for key, value in pairs:
                        myarray.append(key)
                        myarray.append(add(value))
                    result = format_array(myarray, '<<%s>>')
                    # Read the stream only now, after do_compress() has
                    # had the chance to act on the object.
                    stream = obj.stream
                    if stream is not None:
                        result = ('%s\nstream\n%s\nendstream' %
                                  (result, stream))
                    return result
                # Plain list/tuple/dict: wrap it in the matching Pdf
                # container (the bool indexes the pair) and loop again.
                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                continue

            # We assume that an object with an indirect
            # attribute knows how to represent itself to us.
            if hasattr(obj, 'indirect'):
                return str(getattr(obj, 'encoded', None) or obj)
            return user_fmt(obj)

    def format_deferred():
        # Drain the queue of reserved-but-unformatted indirect objects.
        # Entries are taken from the end, and format_obj() may queue
        # further entries while running.
        while deferred:
            index, obj = deferred.pop()
            objlist[index] = format_obj(obj)

    # id(obj) -> object number, for indirect objects already numbered
    indirect_dict = {}
    indirect_dict_get = indirect_dict.get
    # Formatted bodies of indirect objects; index i holds object i + 1
    objlist = []
    objlist_append = objlist.append
    # ids of direct objects currently being formatted (see add())
    visited = set()
    visiting = visited.add
    leaving = visited.remove
    space_join = ' '.join
    lf_join = '\n '.join

    # (index into objlist, object) pairs awaiting formatting
    deferred = []

    # Don't reference old catalog or pages objects --
    # swap references to new ones.
    # Killed objects without an explicit replacement are mapped by
    # their Type; a Type found in none of the three keys yields None.
    type_remap = {PdfName.Catalog: trailer.Root,
                  PdfName.Pages: trailer.Root.Pages, None: trailer}.get
    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
               for objid, (obj, new_obj) in iteritems(killobj)]
    # Anything still unmapped is replaced by NullObject, so every killed
    # id has a non-None substitute.
    swapobj = dict((objid, obj is None and NullObject or obj)
                   for objid, obj in swapobj).get

    for objid in killobj:
        assert swapobj(objid) is not None

    # The first format of trailer gets all the information,
    # but we throw away the actual trailer formatting.
    format_obj(trailer)
    # Keep formatting until we're done.
    # (Used to recurse inside format_obj for this, but
    #  hit system limit.)
    format_deferred()
    # Now we know the size, so we update the trailer dict
    # and get the formatted data.
    trailer.Size = PdfObject(len(objlist) + 1)
    trailer = format_obj(trailer)

    # Now we have all the pieces to write out to the file.
    # Keep careful track of the counts while we do it so
    # we can correctly build the cross-reference.

    # Version line followed by a comment line of four characters above
    # code point 127 (presumably the binary-file marker recommended by
    # the PDF specification).
    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
    f_write(header)
    # NOTE(review): len() is used as the byte count here and below;
    # this assumes convert_store() emits exactly one byte per
    # character -- confirm.
    offset = len(header)
    # Cross-reference entries as (offset, generation, flag); the first
    # is the entry for object 0, flagged 'f'.
    offsets = [(0, 65535, 'f')]
    offsets_append = offsets.append

    for i, x in enumerate(objlist):
        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
        # Record where this object starts before advancing the offset.
        offsets_append((offset, 0, 'n'))
        offset += len(objstr)
        f_write(objstr)

    # After the loop 'offset' is the position at which 'xref' is
    # written, which is the value emitted after 'startxref'.
    f_write('xref\n0 %s\n' % len(offsets))
    for x in offsets:
        f_write('%010d %05d %s\r\n' % x)
    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
class PdfWriter(object):
    ''' Collects pages and writes them out as a PDF file.

        Pages are added with addpage()/addpages(); write() builds
        (or reuses) the trailer and hands everything to
        FormatObjects.  addpage() and addpages() return self so
        calls can be chained.
    '''

    # Cached trailer; reset to None whenever a page is added.
    _trailer = None
    # When true, make_canonical() is run the next time the trailer
    # is built.
    canonicalize = False
    # Output target supplied at construction time, if any.
    fname = None

    def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
        """
            Parameters:
                fname -- Output file name, or file-like binary object
                         with a write method
                version -- PDF version to target.  Currently only 1.3
                           supported.
                compress -- True to do compression on output.  Currently
                            compresses stream objects.
                kwargs -- class attributes to override on this
                          instance; only names listed in
                          PdfWriter.replaceable are accepted.

            Raises:
                ValueError -- if a keyword argument is not replaceable.
        """

        # Legacy support:  fname is new, was added in front
        # If the first argument parses as a number it is really the
        # version from the old (version, compress) signature, so the
        # positional arguments are shifted back by one.
        if fname is not None:
            try:
                float(fname)
            except (ValueError, TypeError):
                pass
            else:
                if version != '1.3':
                    # The second positional was really 'compress'; a
                    # third one would be ambiguous.
                    assert compress == False
                    compress = version
                version = fname
                fname = None

        self.fname = fname
        self.version = version
        self.compress = compress

        if kwargs:
            for name, value in iteritems(kwargs):
                if name not in self.replaceable:
                    raise ValueError("Cannot set attribute %s "
                                     "on PdfWriter instance" % name)
                setattr(self, name, value)

        self.pagearray = PdfArray()
        self.killobj = {}

    def addpage(self, page):
        ''' Append a page to the output document and return self.

            The page is copied into a new indirect dictionary with
            its inheritable attributes (Resources, MediaBox, CropBox,
            Rotate) set explicitly.

            Raises PdfOutputError if page.Type is not /Page.
        '''
        # Any cached trailer no longer reflects the page list.
        self._trailer = None
        if page.Type != PdfName.Page:
            raise PdfOutputError('Bad /Type: Expected %s, found %s'
                                 % (PdfName.Page, page.Type))
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources=inheritable.Resources,
                MediaBox=inheritable.MediaBox,
                CropBox=inheritable.CropBox,
                Rotate=inheritable.Rotate,
            )
        )

        # Add parents in the hierarchy to objects we
        # don't want to output
        # The original page maps to its new copy; each ancestor maps to
        # None.  The walk stops at the first ancestor already recorded.
        killobj = self.killobj
        obj, new_obj = page, self.pagearray[-1]
        while obj is not None:
            objid = id(obj)
            if objid in killobj:
                break
            killobj[objid] = obj, new_obj
            obj = obj.Parent
            new_obj = None
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Add every page in pagelist, in order, and return self. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        ''' Return the cached trailer, building it from the current
            page list when there is none.
        '''
        trailer = self._trailer
        if trailer is not None:
            return trailer

        if self.canonicalize:
            self.make_canonical()

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root=IndirectPdfDict(
                Type=PdfName.Catalog,
                Pages=IndirectPdfDict(
                    Type=PdfName.Pages,
                    Count=PdfObject(len(self.pagearray)),
                    Kids=self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
            page.indirect = True
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        ''' Install a caller-supplied trailer. '''
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
              disable_gc=True):
        ''' Write the document out.

            Parameters:
                fname -- output file name or object with a write()
                         method.  Must be given either here or to the
                         constructor, but not both.
                trailer -- trailer to write; defaults to self.trailer
                user_fmt -- formatter passed through to FormatObjects
                disable_gc -- when true, garbage collection is
                              switched off while formatting and
                              restored afterwards

            Raises PdfOutputError unless fname was specified exactly
            once.
        '''

        trailer = trailer or self.trailer

        # Support fname for legacy applications
        if (fname is not None) == (self.fname is not None):
            raise PdfOutputError(
                "PdfWriter fname must be specified exactly once")

        # Exactly one of the two is set.  Test identity rather than
        # truthiness so a file-like object that evaluates false is
        # not discarded.
        if fname is None:
            fname = self.fname

        # Dump the data.  We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        f = fname if preexisting else open(fname, 'wb')

        # Only switch the collector back on if we switched it off; a
        # caller that had already disabled it keeps it disabled.
        restore_gc = disable_gc and gc.isenabled()
        if restore_gc:
            gc.disable()

        try:
            FormatObjects(f, trailer, self.version, self.compress,
                          self.killobj, user_fmt=user_fmt)
        finally:
            # Close only files opened here; the caller owns any
            # file object it passed in.
            if not preexisting:
                f.close()
            if restore_gc:
                gc.enable()

    def make_canonical(self):
        ''' Canonicalizes a PDF.  Assumes everything
            is a Pdf object already.

            Walks every object reachable from the page array once,
            marking arrays and dictionaries indirect and every other
            object direct.
        '''
        visited = set()
        workitems = list(self.pagearray)
        while workitems:
            obj = workitems.pop()
            objid = id(obj)
            # Objects are tracked by identity; skip repeats.
            if objid in visited:
                continue
            visited.add(objid)
            obj.indirect = False
            if isinstance(obj, (PdfArray, PdfDict)):
                obj.indirect = True
                # Queue the container's contents for the same treatment.
                if isinstance(obj, PdfArray):
                    workitems += obj
                else:
                    workitems += obj.values()

    # Names that may be overridden through __init__ keyword arguments.
    # Must stay the last statement of the class body so that it sees
    # every attribute defined above.
    replaceable = set(vars())