Coverage for pdfrw/pdfrw/buildxobj.py: 65%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

158 statements  

1# A part of pdfrw (https://github.com/pmaupin/pdfrw) 

2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 

3# MIT license -- See LICENSE.txt for details 

4 

5''' 

6 

7This module contains code to build PDF "Form XObjects". 

8 

9A Form XObject allows a fragment from one PDF file to be cleanly 

10included in another PDF file. 

11 

12Reference for syntax: "Parameters for opening PDF files" from SDK 8.1 

13 

14 http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf 

15 

16 supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>' 

17 

18 Also supported by this, but not by Adobe: 

19 'rotate=xxx' where xxx in [0, 90, 180, 270] 

20 

21 Units are in points 

22 

23 

24Reference for content: Adobe PDF reference, sixth edition, version 1.7 

25 

26 http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf 

27 

28 Form xobjects discussed chapter 4.9, page 355 

29''' 

30 

31from .objects import PdfDict, PdfArray, PdfName 

32from .pdfreader import PdfReader 

33from .errors import log, PdfNotImplementedError 

34from .py23_diffs import iteritems 

35from .uncompress import uncompress 

36from .compress import compress 

37 

38 

class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, viewrect and rotate options into
        object attributes.

        The uri has the form ``<docname>#<opt>=<val>&<opt>=<val>...``.
        The docname may be omitted, in which case the uri consists
        only of options (separated by '#' or '&').  Recognized
        options are 'page', 'viewrect' and 'rotate'; any other
        option name is reported through log.error and ignored.

        Keyword arguments override/set attributes directly, but
        only attributes that already exist on the class (doc,
        docname, page, viewrect, rotate, cacheable) are accepted.

        Note 1:
            Viewrects follow the adobe definition.  (See reference
            above). They are arrays of 4 numbers:

            - Distance from left of document in points
            - Distance from top (NOT bottom) of document in points
            - Width of rectangle in points
            - Height of rectangle in points

        Note 2:
            For simplicity, Viewrects can also be specified
            in fractions of the document.  If every number in
            the viewrect is between 0 and 1 inclusive, then
            viewrect elements 0 and 2 are multiplied by the
            mediabox width before use, and viewrect elements
            1 and 3 are multiplied by the mediabox height before
            use.

        Note 3:
            By default, an XObject based on the view will be
            cacheable.  It should not be cacheable if the XObject
            will be subsequently modified.
    '''
    doc = None
    docname = None
    page = None
    viewrect = None
    rotate = None
    cacheable = True

    def __init__(self, pageinfo='', **kw):
        # Everything after the first '#' is a list of options that
        # may be separated by either '#' or '&'.
        parts = pageinfo.split('#', 1)
        if len(parts) == 2:
            parts[1:] = parts[1].replace('&', '#').split('#')

        # If the first segment already looks like an option, there is
        # no document name.  'rotate' must be recognized here as well,
        # otherwise 'rotate=90' would be mistaken for a file name, and
        # the segment itself may still hold several '&'-joined options.
        if parts[0].startswith(('page=', 'viewrect=', 'rotate=')):
            parts[:1] = parts[0].split('&')
        else:
            self.docname = parts.pop(0)

        for item in parts:
            key, value = item.split('=')
            key = key.strip()
            # Values may be comma and/or whitespace separated.
            value = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                log.error('Unknown option: %s', key)

        # Only allow keywords that name an existing attribute.
        for key, value in kw.items():
            assert hasattr(self, key), key
            setattr(self, key, value)

97 

98 

def get_rotation(rotate):
    ''' Return clockwise rotation code:
          0 = unrotated
          1 = 90 degrees
          2 = 180 degrees
          3 = 270 degrees

        ``rotate`` is an angle in degrees (an int, or anything int()
        accepts).  Values that cannot be converted, or that are not a
        multiple of 90, yield 0.  Multiples of 90 outside 0..270
        (e.g. 360, 450 or -90) are folded into the 0..3 range so the
        result always matches the table above.
    '''
    try:
        rotate = int(rotate)
    except (ValueError, TypeError):
        return 0
    quarter_turns, remainder = divmod(rotate, 90)
    if remainder:
        return 0
    # Python's % always returns a non-negative result for a positive
    # modulus, so negative angles map onto the equivalent positive code.
    return quarter_turns % 4

113 

114 

def rotate_point(point, rotation):
    ''' Turn an (x, y) pair clockwise by ``rotation`` quarter turns.

        Only the two low bits of ``rotation`` are examined: bit 0
        contributes a 90 degree turn and bit 1 a 180 degree turn.
        When neither bit is set the point is returned untouched.
    '''
    quarter = rotation & 1
    half = rotation & 2
    if quarter:
        # 90 degrees clockwise: (x, y) -> (y, -x)
        px, py = point[0], point[1]
        point = (py, -px)
    if half:
        # 180 degrees: (x, y) -> (-x, -y)
        px, py = point[0], point[1]
        point = (-px, -py)
    return point

124 

125 

def rotate_rect(rect, rotation):
    ''' Rotate the two corner points of a rectangle and return the
        result in normalized order: (min x, min y, max x, max y),
        i.e. the new lower left followed by the new upper right.
    '''
    first_corner = rotate_point(rect[:2], rotation)
    second_corner = rotate_point(rect[2:], rotation)
    corners = first_corner + second_corner
    xs = (corners[0], corners[2])
    ys = (corners[1], corners[3])
    return (min(xs), min(ys), max(xs), max(ys))

134 

135 

def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        ``inheritable`` supplies MediaBox and (optionally) CropBox,
        ``pageinfo.viewrect`` is either None or a sequence of
        (left, top, width, height), and ``rotation`` is a rotation
        code as used by rotate_rect.

        Returns ``(mbox, cbox)``.  Without a viewrect, cbox is the
        crop box (or the media box if there is no crop box).  With a
        viewrect, cbox is the viewrect clipped to the crop box.

        A viewrect whose numbers all lie between 0 and 1 inclusive is
        taken as fractions of the (rotated) crop box size.  The only
        exception is a rect of four identical whole numbers (all 0 or
        all 1), which keeps its historical interpretation in points.
    '''
    mbox = tuple([float(x) for x in inheritable.MediaBox])
    cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
    vrect = pageinfo.viewrect
    if vrect is None:
        return mbox, cbox

    # Rotate the box to match what the user sees,
    # figure out the clipping box, then rotate back
    mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
    x, y, w, h = vrect

    # Support operations in fractions of a page.  Requiring
    # min < max alone would wrongly treat e.g. (.5, .5, .5, .5)
    # -- the lower right quadrant -- as a rectangle in points.
    lowest, highest = min(vrect), max(vrect)
    if 0 <= lowest and highest <= 1 and (lowest < highest or
                                         0 < lowest < 1):
        mw = mright - mleft
        mh = mtop - mbot
        x *= mw
        w *= mw
        y *= mh
        h *= mh

    # The viewrect is measured from the top left corner.
    cleft = mleft + x
    ctop = mtop - y
    cright = cleft + w
    cbot = ctop - h
    cbox = (max(mleft, cleft), max(mbot, cbot),
            min(mright, cright), min(mtop, ctop))
    cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox

167 

168 

def _build_cache(contents, allow_compressed):
    ''' Return a dictionary holding the (possibly merged) content
        stream, creating it and remembering it on the source
        object on first use.

        ``contents`` is either a single dictionary-like object
        exposing ``xobj_copy`` / ``private``, or an array of
        stream dictionaries (which has no ``xobj_copy`` attribute
        until this function stores one on it).

        Validity is assumed to have been checked already when a
        non-None xobj_copy is found.  Nested arrays are not
        handled; the spec says nothing about them.

        Raises PdfNotImplementedError when decompression is needed
        but uncompress() reports failure.
    '''
    try:
        cached = contents.xobj_copy
    except AttributeError:
        # Should be a PdfArray: the array itself carries the cache.
        streams = contents
        holder = contents
    else:
        # Should be a PdfDict -- reuse the cached copy if present.
        if cached is not None:
            return cached
        streams = [contents]
        holder = contents.private

    several = len(streams) > 1

    # Streams must be decompressed when the caller refuses
    # compressed data, or when several streams have to be
    # concatenated.  Failure to decompress is an error.
    if several or not allow_compressed:
        keys = set(x[0] for cdict in streams for x in iteritems(cdict))
        was_compressed = len(keys) > 1
        if was_compressed:
            # Work on copies so the originals stay untouched.
            streams = [PdfDict(x) for x in streams]
            if not uncompress(streams):
                raise PdfNotImplementedError(
                    'Xobjects with these compression parameters not supported: %s' %
                    keys)

    xobj_copy = PdfDict(streams[0])
    xobj_copy.private.xobj_cachedict = {}
    holder.xobj_copy = xobj_copy

    if several:
        merged = '\n'.join(x.stream for x in streams)
        # One newline is inserted between each pair of streams.
        expected = sum(int(x.Length) for x in streams) + len(streams) - 1
        assert expected == len(merged)
        xobj_copy.stream = merged
        if was_compressed and allow_compressed:
            # NOTE(review): uncompress() above is handed a list while
            # compress() gets a single dict here -- confirm that
            # compress() accepts this form.
            compress(xobj_copy)

    return xobj_copy

219 

220 

def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
    ''' Look up the Form XObject for (mbox, bbox, rotation) in the
        cache attached to ``contents``; build and (if cacheable)
        store a new one when it is missing.

        The returned object carries private members x, y, w, h
        describing its (rotated) bounding box.
    '''
    cachedict = contents.xobj_cachedict
    cachekey = mbox, bbox, rotation
    if cacheable:
        hit = cachedict.get(cachekey)
        if hit is not None:
            return hit

    # For a partial page, or a result that is going to be modified,
    # go through an underlying Form XObject that represents the whole
    # page, so the full page data is not copied into the new file
    # multiple times.
    if mbox != bbox or not cacheable:
        builder = _get_subpage
    else:
        builder = _get_fullpage

    result = PdfDict(
        builder(contents, resources, mbox),
        Type=PdfName.XObject,
        Subtype=PdfName.Form,
        FormType=1,
        BBox=PdfArray(bbox),
    )

    rect = bbox
    if rotation:
        x_axis = rotate_point((1, 0), rotation)
        y_axis = rotate_point((0, 1), rotation)
        result.Matrix = PdfArray(x_axis + y_axis + (0, 0))
        rect = rotate_rect(rect, rotation)

    left, bottom = rect[0], rect[1]
    private = result.private
    private.x = left
    private.y = bottom
    private.w = rect[2] - left
    private.h = rect[3] - bottom

    if cacheable:
        cachedict[cachekey] = result
    return result

256 

257 

def _get_fullpage(contents, resources, mbox):
    ''' Build the base dictionary for a whole page: a copy of the
        contents with the resources attached.  ``mbox`` is unused
        here; it is accepted so the signature matches _get_subpage.
        _cache_xobj fills in everything else.
    '''
    fullpage = PdfDict(contents, Resources=resources)
    return fullpage

264 

265 

def _get_subpage(contents, resources, mbox):
    ''' Build the base dictionary for part of a page.

        Rather than copying the page contents, the stream merely
        invokes a Form XObject named FullPage, which is the
        (cached) unrotated whole-page XObject.  That way several
        views of the same page can share one copy of its data.
    '''
    fullpage = _cache_xobj(contents, resources, mbox, mbox, 0)
    xobjects = PdfDict(FullPage=fullpage)
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(XObject=xobjects),
    )

281 

282 

def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Create and return a Form XObject for a view of a page.

        ``page`` is the source page, ``viewinfo`` describes the
        view (viewrect, rotate, cacheable) and defaults to the
        entire page, and ``allow_compressed`` is passed on to
        _build_cache.
    '''
    inherited = page.inheritable
    page_resources = inherited.Resources
    page_turns = get_rotation(inherited.Rotate)
    # The boxes are computed from the page's own rotation only;
    # the rotation requested by the view is added afterwards.
    mbox, bbox = getrects(inherited, viewinfo, page_turns)
    total_turns = page_turns + get_rotation(viewinfo.rotate)
    cached_contents = _build_cache(page.Contents, allow_compressed)
    return _cache_xobj(cached_contents, page_resources, mbox, bbox,
                       total_turns, viewinfo.cacheable)

297 

298 

def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' docxobj reads a page out of a document and uses
        pagexobj to create the Form XObject based on
        the page.

        ``pageinfo`` is a ViewInfo, or a uri string from which one
        is built.  ``doc`` is an optional, already opened
        PdfReader; it must not be given when pageinfo already
        carries a document.

        This is a convenience function for things like
        rst2pdf that want to be able to pass in textual
        filename/location descriptors and don't want to
        know about using PdfReader.

        Can work standalone, or in conjunction with
        the CacheXObj class (below).
    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    if doc is None:
        # No explicit document: use the one attached to the view,
        # or else read it in from the file name.
        doc = pageinfo.doc
        if doc is None:
            doc = PdfReader(pageinfo.docname,
                            decompress=not allow_compressed)
            pageinfo.doc = doc
    else:
        # An explicit document must not compete with an implicit one.
        assert pageinfo.doc is None
        pageinfo.doc = doc
    assert isinstance(doc, PdfReader)

    # Page numbers are 1-based; a missing (or zero) page means page 1.
    pagenum = pageinfo.page or 1
    return pagexobj(doc.pages[pagenum - 1], pageinfo, allow_compressed)

332 

333 

class CacheXObj(object):
    ''' Use to keep from reparsing files over and over,
        and to keep from making the output too much
        bigger than it ought to be by replicating
        unnecessary object copies.

        This is a convenience class for things like
        rst2pdf that want to be able to pass in textual
        filename/location descriptors and don't want to
        know about using PdfReader.
    '''
    def __init__(self, decompress=False):
        ''' Set decompress true if you need
            the Form XObjects to be decompressed.
            Will decompress what it can and scream
            about the rest.
        '''
        self.decompress = decompress
        # Maps document name -> PdfReader already opened for it.
        self.cached_pdfs = {}

    def load(self, sourcename):
        ''' Load a Form XObject from a uri, opening each
            distinct document only once.
        '''
        info = ViewInfo(sourcename)
        fname = info.docname
        doc = self.cached_pdfs.get(fname)
        if doc is None:
            doc = PdfReader(fname, decompress=self.decompress)
            self.cached_pdfs[fname] = doc
        return docxobj(info, doc, allow_compressed=not self.decompress)