Coverage for pdfrw/pdfrw/buildxobj.py: 65%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''

This module contains code to build PDF "Form XObjects".

A Form XObject allows a fragment from one PDF file to be cleanly
included in another PDF file.

Reference for syntax: "Parameters for opening PDF files" from SDK 8.1

    http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf

    supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>'

    Also supported by this, but not by Adobe:
        'rotate=xxx' where xxx in [0, 90, 180, 270]

    Units are in points


Reference for content: Adobe PDF reference, sixth edition, version 1.7

    http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf

    Form xobjects discussed chapter 4.9, page 355
'''
31from .objects import PdfDict, PdfArray, PdfName
32from .pdfreader import PdfReader
33from .errors import log, PdfNotImplementedError
34from .py23_diffs import iteritems
35from .uncompress import uncompress
36from .compress import compress
class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        The uri has the form ``<filename>#<opt>=<val>&<opt>=<val>...``
        where the filename part is optional and the recognised
        options are ``page``, ``viewrect`` and ``rotate``.

        Note 1:
            Viewrects follow the adobe definition.  (See reference
            above). They are arrays of 4 numbers:

            - Distance from left of document in points
            - Distance from top (NOT bottom) of document in points
            - Width of rectangle in points
            - Height of rectangle in points

        Note 2:
            For simplicity, Viewrects can also be specified
            in fractions of the document.  If every number in
            the viewrect is between 0 and 1 inclusive, then
            viewrect elements 0 and 2 are multiplied by the
            mediabox width before use, and viewrect elements
            1 and 3 are multiplied by the mediabox height before
            use.

        Note 3:
            By default, an XObject based on the view will be
            cacheable.  It should not be cacheable if the XObject
            will be subsequently modified.
    '''
    # Class-level defaults; instances override only what was supplied.
    doc = None
    docname = None
    page = None
    viewrect = None
    rotate = None
    cacheable = True

    # Prefixes that mark the first uri element as an option, not a filename.
    _option_prefixes = ('page=', 'viewrect=', 'rotate=')

    def __init__(self, pageinfo='', **kw):
        ''' Parse the ``pageinfo`` uri, then apply keyword overrides.

            ``pageinfo`` is split on the first '#'; the remainder is
            split on '&' or '#' into ``key=value`` options.  ``page``
            and ``rotate`` take one integer, ``viewrect`` takes four
            numbers separated by commas and/or whitespace.  Unknown
            options are logged as errors and otherwise ignored.

            Every keyword in ``kw`` must name an existing attribute
            and is assigned verbatim.
        '''
        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')

        # The leading element is the document name unless it is
        # itself an option (e.g. a bare 'page=3' or 'rotate=90').
        if not pageinfo[0].startswith(self._option_prefixes):
            self.docname = pageinfo.pop(0)

        for item in pageinfo:
            if not item.strip():
                # Tolerate empty fragments such as 'file.pdf#' or 'a=1&&b=2'
                continue
            key, value = item.split('=')
            key = key.strip()
            value = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(value) == 1, item
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4, item
                setattr(self, key, [float(x) for x in value])
            else:
                log.error('Unknown option: %s', key)

        for key, value in kw.items():
            assert hasattr(self, key), key
            setattr(self, key, value)
def get_rotation(rotate):
    ''' Return clockwise rotation code:
          0 = unrotated
          1 = 90 degrees
          2 = 180 degrees
          3 = 270 degrees

        ``rotate`` is an angle in degrees, given as anything int()
        accepts.  Values that cannot be converted, or that are not a
        multiple of 90, yield 0.  Whole turns and negative angles are
        folded into the range 0..3 (360 -> 0, -90 -> 3).
    '''
    try:
        rotate = int(rotate)
    except (ValueError, TypeError):
        return 0
    if rotate % 90 != 0:
        return 0
    # Python's % is non-negative for a positive modulus, so this
    # also maps counter-clockwise angles onto the clockwise codes.
    return (rotate // 90) % 4
def rotate_point(point, rotation):
    ''' Rotate an (x,y) coordinate clockwise by a
        rotation code specifying a multiple of 90 degrees.

        Only the two low bits of ``rotation`` are examined, so
        negative codes and codes above 3 behave as their value
        modulo 4.  With a code of 0 the point is returned unchanged.
    '''
    if rotation & 1:
        # Quarter turn clockwise: (x, y) -> (y, -x)
        point = point[1], -point[0]
    if rotation & 2:
        # Half turn: (x, y) -> (-x, -y)
        point = -point[0], -point[1]
    return point
def rotate_rect(rect, rotation):
    ''' Rotate both points within the rectangle, then normalize
        the rectangle by returning the new lower left, then new
        upper right.

        ``rect`` is a 4-element sequence (x0, y0, x1, y1) and
        ``rotation`` is a rotation code as accepted by rotate_point.
        Returns a 4-tuple (left, bottom, right, top).
    '''
    rect = rotate_point(rect[:2], rotation) + rotate_point(rect[2:], rotation)
    return (min(rect[0], rect[2]), min(rect[1], rect[3]),
            max(rect[0], rect[2]), max(rect[1], rect[3]))
def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        ``inheritable`` must provide MediaBox and CropBox (the crop
        box falls back to the media box when it is missing/empty).
        ``pageinfo.viewrect`` is either None, meaning the whole crop
        box, or [left, top, width, height] measured from the top-left
        corner of the page as the user sees it.  ``rotation`` is the
        page's rotation code.

        Returns (mbox, cbox), each a 4-tuple of floats.
    '''
    mbox = tuple(float(x) for x in inheritable.MediaBox)
    cbox = tuple(float(x) for x in (inheritable.CropBox or mbox))
    vrect = pageinfo.viewrect
    if vrect is not None:
        # Rotate the crop box to match what the user sees,
        # figure out the clipping box, then rotate back
        mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
        x, y, w, h = vrect

        # Support operations in fractions of a page
        if 0 <= min(vrect) < max(vrect) <= 1:
            mw = mright - mleft
            mh = mtop - mbot
            x *= mw
            w *= mw
            y *= mh
            h *= mh

        # viewrect is measured down from the top edge, whereas PDF
        # coordinates grow upwards from the bottom edge.
        cleft = mleft + x
        ctop = mtop - y
        cright = cleft + w
        cbot = ctop - h
        # Never let the clip box extend past the visible page
        cbox = (max(mleft, cleft), max(mbot, cbot),
                min(mright, cright), min(mtop, ctop))
        cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        ``contents`` is either a single stream dictionary or an
        array of them.  The resulting copy is stored on the source
        object as ``xobj_copy`` so later calls return it directly,
        and it carries an empty ``xobj_cachedict`` for _cache_xobj.

        Raises PdfNotImplementedError if streams need to be
        decompressed and uncompress() cannot handle them.

        Also, the spec says nothing about nested arrays,
        so we assume those don't exist until we see one
        in the wild.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # If we don't allow compressed objects, OR if we have multiple compressed
    # objects, we try to decompress them, and fail if we cannot do that.

    was_compressed = False
    if not allow_compressed or len(array) > 1:
        keys = set(x[0] for cdict in array for x in iteritems(cdict))
        # NOTE(review): presumably an uncompressed stream carries only
        # one key (its length), so any extra key implies a filter -- confirm.
        was_compressed = len(keys) > 1
        if was_compressed:
            # Make copies of the objects before we uncompress them.
            array = [PdfDict(x) for x in array]
            if not uncompress(array):
                raise PdfNotImplementedError(
                    'Xobjects with these compression parameters not supported: %s' %
                    keys)

    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    if len(array) > 1:
        # Merge all content streams into one, separated by newlines
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream
        if was_compressed and allow_compressed:
            compress(xobj_copy)

    return xobj_copy
def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h

        ``contents`` must carry an ``xobj_cachedict`` mapping.  The
        cache key is (mbox, bbox, rotation).  When ``cacheable`` is
        false the cache is neither consulted nor updated, and a new
        object is always built.
    '''
    cachedict = contents.xobj_cachedict
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey) if cacheable else None
    if result is None:
        # If we are not getting a full page, or if we are going to
        # modify the results, first retrieve an underlying Form XObject
        # that represents the entire page, so that we are not copying
        # the full page data into the new file multiple times
        if mbox != bbox or not cacheable:
            func = _get_subpage
        else:
            func = _get_fullpage
        result = PdfDict(
            func(contents, resources, mbox),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # Rotating the two unit vectors gives the a, b, c, d entries
            # of the form matrix; the translation entries stay at zero.
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        private = result.private
        private.x = rect[0]
        private.y = rect[1]
        private.w = rect[2] - rect[0]
        private.h = rect[3] - rect[1]
        if cacheable:
            cachedict[cachekey] = result
    return result
def _get_fullpage(contents, resources, mbox):
    ''' fullpage is easy.  Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.

        ``mbox`` is unused; it is accepted so that this function
        has the same signature as _get_subpage.
    '''
    return PdfDict(contents, Resources=resources)
def _get_subpage(contents, resources, mbox):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.

        Returns a dictionary whose stream simply draws the
        unrotated full-page XObject, registered in its resources
        under the name FullPage.
    '''
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(
            XObject=PdfDict(
                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
            )
        )
    )
def pagexobj(page, viewinfo=None, allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)

        pagexobj is passed a page and, optionally, a ViewInfo
        describing the viewrect, extra rotation and cacheability.
        When ``viewinfo`` is omitted a default ViewInfo is used.
        ``allow_compressed`` is forwarded to _build_cache.
    '''
    if viewinfo is None:
        # Build the default per call rather than sharing one
        # instance created when the function was defined.
        viewinfo = ViewInfo()
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    # The view rectangle is interpreted against the page's own rotation;
    # the rotation requested by the view is applied on top afterwards.
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    rotation += get_rotation(viewinfo.rotate)
    contents = _build_cache(page.Contents, allow_compressed)
    return _cache_xobj(contents, resources, mbox, bbox, rotation,
                       viewinfo.cacheable)
def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' docxobj reads a page out of a document and uses
        pagexobj to create the Form XObject based on
        the page.

        This is a convenience function for things like
        rst2pdf that want to be able to pass in textual
        filename/location descriptors and don't want to
        know about using PdfReader.

        ``pageinfo`` is a ViewInfo or a uri string from which one is
        built.  ``doc`` is an optional, already opened PdfReader.
        The page number is 1-based and defaults to the first page.

        Can work standalone, or in conjunction with
        the CacheXObj class (below).

    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    # If we're explicitly passed a document,
    # make sure we don't have one implicitly as well.
    # If no implicit or explicit doc, then read one in
    # from the filename.
    if doc is not None:
        assert pageinfo.doc is None
        pageinfo.doc = doc
    elif pageinfo.doc is not None:
        doc = pageinfo.doc
    else:
        doc = pageinfo.doc = PdfReader(pageinfo.docname,
                                       decompress=not allow_compressed)
    assert isinstance(doc, PdfReader)

    sourcepage = doc.pages[(pageinfo.page or 1) - 1]
    return pagexobj(sourcepage, pageinfo, allow_compressed)
class CacheXObj(object):
    ''' Use to keep from reparsing files over and over,
        and to keep from making the output too much
        bigger than it ought to be by replicating
        unnecessary object copies.

        This is a convenience class for things like
        rst2pdf that want to be able to pass in textual
        filename/location descriptors and don't want to
        know about using PdfReader.
    '''
    def __init__(self, decompress=False):
        ''' Set decompress true if you need
            the Form XObjects to be decompressed.
            Will decompress what it can and scream
            about the rest.
        '''
        # Maps document name -> PdfReader, so each file is parsed once
        self.cached_pdfs = {}
        self.decompress = decompress

    def load(self, sourcename):
        ''' Load a Form XObject from a uri

            The uri is parsed by ViewInfo.  The underlying document
            is read on first use and reused for later calls that
            name the same file.
        '''
        info = ViewInfo(sourcename)
        fname = info.docname
        pcache = self.cached_pdfs
        doc = pcache.get(fname)
        if doc is None:
            doc = pcache[fname] = PdfReader(fname, decompress=self.decompress)
        return docxobj(info, doc, allow_compressed=not self.decompress)