Coverage for pdfrw/pdfrw/findobjs.py: 0%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# A part of pdfrw (https://github.com/pmaupin/pdfrw)
2# Copyright (C) 2015 Patrick Maupin, Austin, Texas
3# MIT license -- See LICENSE.txt for details
5''' This module contains a function to find all the XObjects
6 in a document, and another function that will wrap them
7 in page objects.
8'''
10from .objects import PdfDict, PdfArray, PdfName
def find_objects(source, valid_types=(PdfName.XObject, None),
                 valid_subtypes=(PdfName.Form, PdfName.Image),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    ''' Yield every matching dictionary reachable from source.

        source is either a single PdfDict or an iterable of
        objects (for example a list of pages).  A dictionary is
        yielded when its Type is in valid_types and its Subtype is
        in valid_subtypes; with the defaults that selects Form and
        Image XObjects.  Dictionary keys listed in no_follow are
        not descended into.

        The remaining keyword parameters just bind builtins (and
        PdfDict) as local names; callers normally leave them alone.

        The walk uses an explicit stack rather than recursion,
        because some PDFs are nested deeply.  Each container is
        visited at most once (tracked by id()), so cycles are safe.

        Ordering is meant to be reproducible and roughly in
        document order: arrays are walked in array order and
        dictionaries in sorted key order.
    '''
    containers = (PdfDict, PdfArray)

    # Accept either one dict or any iterable of starting objects.
    pending = [source] if isinstance(source, PdfDict) else list(source)
    # The stack pops from the end, so store the items back to front.
    pending.reverse()

    seen = set()
    while pending:
        node = pending.pop()
        if not isinstance(node, containers):
            continue
        key = id(node)
        if key in seen:
            continue
        seen.add(key)

        if isinstance(node, PdfDict):
            if node.Type in valid_types and node.Subtype in valid_subtypes:
                yield node
            children = []
            for name, value in sorted(node.iteritems()):
                if name not in no_follow:
                    children.append(value)
        else:
            # TODO: Touching the first element forces resolution of
            #       any indirect objects in the array.  It may not be
            #       necessary -- it is unclear whether reversed()
            #       would do that by itself.  It is cheap enough for
            #       now, but might be removable.
            if node:
                node[0]
            children = node
        # Push in reverse so that the first child is popped first.
        pending.extend(reversed(children))
def wrap_object(obj, width, margin):
    ''' Build a page object that shows a single XObject.

        A Form XObject is turned into the page contents directly:
        its stream, Length, Filter and DecodeParms are copied, its
        Resources are reused and its BBox becomes the MediaBox.

        An Image XObject is drawn by a small generated content
        stream.  The image is scaled to the page width minus
        margin[0] and margin[2], keeping its aspect ratio, and is
        offset by margin[0] horizontally and margin[1] vertically.
        The page height is the scaled image height plus margin[1]
        and margin[3].

        Raises TypeError for any other Subtype.
    '''
    def compact(number):
        # Fixed-point text with trailing zeros (and a then-dangling
        # decimal point) removed.
        text = '%.9f' % number
        return text.rstrip('0').rstrip('.')

    page_contents = PdfDict(indirect=True)
    kind = obj.Subtype

    if kind == PdfName.Form:
        page_contents._stream = obj.stream
        page_contents.Length = obj.Length
        page_contents.Filter = obj.Filter
        page_contents.DecodeParms = obj.DecodeParms
        page_resources = obj.Resources
        media_box = obj.BBox
    elif kind == PdfName.Image:
        x_offset, y_offset = margin[0], margin[1]
        draw_width = width - margin[0] - margin[2]
        image_width = float(obj.Width)
        image_height = float(obj.Height)
        # Scale the height by the same factor as the width.
        draw_height = 1.0 * draw_width / image_width * image_height
        page_height = draw_height + margin[1] + margin[3]
        operands = tuple(
            compact(value)
            for value in (draw_width, draw_height, x_offset, y_offset))
        page_contents.stream = 'q %s 0 0 %s %s %s cm /MyImage Do Q' % operands
        page_resources = PdfDict(XObject=PdfDict(MyImage=obj))
        media_box = PdfArray((0, 0, width, page_height))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=media_box,
        Resources=page_resources,
        Contents=page_contents,
    )
def trivial_xobjs(maxignore=300):
    ''' Return a predicate that spots XObjects which do nothing
        except place other XObjects.

        The predicate returns False for Image XObjects and True
        for an object whose stream is shorter than maxignore and
        consists only of names (tokens starting with '/'), the
        operators q, Q, cm and Do, and plain numbers.  For anything
        else it returns None.
    '''
    operators = set(('q', 'Q', 'cm', 'Do'))
    image_subtype = PdfName.Image

    def check(obj):
        if obj.Subtype == image_subtype:
            return False
        stream = obj.stream
        if len(stream) >= maxignore:
            return
        for token in stream.split():
            if token.startswith('/') or token in operators:
                continue
            # Drop the sign and decimal point; what is left of a
            # number must be digits only.
            if not token.replace('.', '').replace('-', '').isdigit():
                return
        return True
    return check
118def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
119 image_only=False, ignore=trivial_xobjs(),
120 wrap_object=wrap_object):
121 ''' page_per_xobj wraps every XObj found
122 in its own page object.
123 width and margin are used to set image sizes.
124 '''
125 try:
126 iter(margin)
127 except:
128 margin = [margin]
129 while len(margin) < 4:
130 margin *= 2
132 if isinstance(xobj_iter, (list, dict)):
133 xobj_iter = find_objects(xobj_iter)
134 for obj in xobj_iter:
135 if not ignore(obj):
136 if not image_only or obj.Subtype == PdfName.IMage:
137 yield wrap_object(obj, width, margin)