Coverage for pdfrw/pdfrw/uncompress.py: 78%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# A part of pdfrw (https://github.com/pmaupin/pdfrw)
2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
3# Copyright (C) 2012-2015 Nerijus Mika
4# MIT license -- See LICENSE.txt for details
5# Copyright (c) 2006, Mathieu Fenniak
6# BSD license -- see LICENSE.txt for details
7'''
8A small subset of decompression filters. Should add more later.
10I believe, after looking at the code, that portions of the flate
11PNG predictor were originally transcribed from PyPDF2, which is
12probably an excellent source of additional filters.
13'''
14import array
15from .objects import PdfDict, PdfName, PdfArray
16from .errors import log
17from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
18import math
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    ''' Yield, in their original order, the items of mylist that
        are PdfDict instances with a stream attached (stream is not
        None).  Everything else is skipped silently.

        isinstance and PdfDict are bound as default arguments;
        callers normally leave them alone.
    '''
    for candidate in mylist:
        if not isinstance(candidate, PdfDict):
            continue
        if candidate.stream is None:
            continue
        yield candidate
# Hack so this module can still be imported when zlib is not available:
# in that case the zlib name is None, and decompressobj is None as well.
decompressobj = None if zlib is None else zlib.decompressobj
def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Inflate, in place, the FlateDecode streams found in mylist.

        Every object yielded by streamobjects(mylist) that has a
        Filter is examined.  A stream whose filter is anything other
        than `flate` is left untouched and a warning is logged; each
        distinct message is logged only once per `warnings` set, and
        the default set is created once, so it is shared by all calls
        that do not pass their own.

        A flate stream is decompressed and, if its decode parameters
        (DecodeParms or DP) ask for a predictor in the range 10-15,
        passed through flate_png.  Any predictor other than 1 or
        10-15 is reported as an error.

        On success the object's Filter is set to None and its stream
        is replaced by the decoded data -- unchanged when leave_raw is
        true, otherwise after convert_load.  On failure the object is
        not modified and the error is logged together with
        obj.indirect.

        Returns True only if no stream was skipped or failed.

        The remaining keyword parameters bind module-level and builtin
        names as defaults; callers normally leave them alone.
    '''
    all_ok = True
    for obj in streamobjects(mylist):
        filter_name = obj.Filter
        if filter_name is None:
            continue
        if isinstance(filter_name, list) and len(filter_name) == 1:
            # todo: multiple filters
            filter_name = filter_name[0]
        parms = obj.DecodeParms or obj.DP

        if filter_name != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(filter_name), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            all_ok = False
            continue

        dco = decompress()
        error = None
        try:
            data = dco.decompress(convert_store(obj.stream))
        except Exception as exc:
            error = str(exc)

        if error is None:
            if isinstance(parms, PdfArray):
                # Fold an array of parameter dicts into a single dict.
                merged = PdfDict()
                for entry in parms:
                    merged.update(entry)
                parms = merged
            if parms:
                predictor = int(parms.Predictor or 1)
                columns = int(parms.Columns or 1)
                colors = int(parms.Colors or 1)
                bpc = int(parms.BitsPerComponent or 8)
                if 10 <= predictor <= 15:
                    data, error = flate_png(data, predictor, columns,
                                            colors, bpc)
                elif predictor != 1:
                    error = ('Unsupported flatedecode predictor %s' %
                             repr(predictor))

        if error is None:
            assert not dco.unconsumed_tail
            # Anything but whitespace after the compressed data is an error.
            if dco.unused_data.strip():
                error = ('Unconsumed compression data: %s' %
                         repr(dco.unused_data[:20]))

        if error is not None:
            log.error('%s %s' % (error, repr(obj.indirect)))
            all_ok = False
            continue

        obj.Filter = None
        obj.stream = data if leave_raw else convert_load(data)
    return all_ok
def flate_png_impl(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Undo PNG row prediction on already-inflated stream data.

        data is treated as a sequence of rows, each made of one
        filter-type byte followed by
        columnbytes = (columns * colors * bpc + 7) // 8 data bytes.
        Every row is reconstructed according to its own filter-type
        byte (0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth), and the
        filter-type bytes are then removed.

        predictor is only consulted for padding: with predictor 15 a
        short final row is zero-padded to full length.  For any other
        value, data that is not a whole number of rows is reported as
        an error.

        Returns (rows, None) on success, where rows is a new
        array.array('B') holding the reconstructed bytes (the input is
        not modified), or (None, message) if the data cannot be
        decoded.
    '''
    # http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
    # https://www.w3.org/TR/2003/REC-PNG-20031110/#9Filters
    # Reconstruction functions
    #   x: the byte being filtered;
    #   a: the byte corresponding to x in the pixel immediately before
    #      the pixel containing x (or the byte immediately before x,
    #      when the bit depth is less than 8);
    #   b: the byte corresponding to x in the previous scanline;
    #   c: the byte corresponding to b in the pixel immediately before
    #      the pixel containing b (or the byte immediately before b,
    #      when the bit depth is less than 8).

    columnbytes = ((columns * colors * bpc) + 7) // 8
    pixel_size = (colors * bpc + 7) // 8
    rowlen = columnbytes + 1    # one filter-type byte precedes each row

    buf = array.array('B', data)
    if predictor == 15:
        padding = (rowlen - len(buf)) % rowlen
        buf.extend([0] * padding)
    if len(buf) % rowlen != 0:
        # Report through the error return like every other decode
        # failure; an assert would vanish under -O.
        return None, ('PNG predictor data length %d is not a multiple'
                      ' of row length %d' % (len(buf), rowlen))

    def paeth_predictor(a, b, c):
        p = a + b - c
        pa = abs(p - a)
        pb = abs(p - b)
        pc = abs(p - c)
        if pa <= pb and pa <= pc:
            return a
        if pb <= pc:
            return b
        return c

    # Each helper rewrites the columnbytes bytes starting at buf[first];
    # prior_row holds the already reconstructed previous row.

    def undo_sub(first, prior_row):
        # filter type 1: Sub
        # Recon(x) = Filt(x) + Recon(a)
        for pos in range(first + pixel_size, first + columnbytes):
            buf[pos] = (buf[pos] + buf[pos - pixel_size]) % 256

    def undo_up(first, prior_row):
        # filter type 2: Up
        # Recon(x) = Filt(x) + Recon(b)
        for pos, up in enumerate(prior_row, first):
            buf[pos] = (buf[pos] + up) % 256

    def undo_average(first, prior_row):
        # filter type 3: Avg
        # Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2)
        for i, up in enumerate(prior_row):
            pos = first + i
            left = buf[pos - pixel_size] if i >= pixel_size else 0
            buf[pos] = (buf[pos] + (left + up) // 2) % 256

    def undo_paeth(first, prior_row):
        # filter type 4: Paeth
        # Recon(x) = Filt(x) + PaethPredictor(Recon(a), Recon(b), Recon(c))
        for i, up in enumerate(prior_row):
            pos = first + i
            if i >= pixel_size:
                left = buf[pos - pixel_size]
                up_left = prior_row[i - pixel_size]
            else:
                left = up_left = 0
            buf[pos] = (buf[pos] + paeth_predictor(left, up, up_left)) % 256

    reconstruct = {
        1: undo_sub,
        2: undo_up,
        3: undo_average,
        4: undo_paeth,
    }

    # The row above the first one is taken to be all zeros.
    prior_row = [0] * columnbytes
    for row_start in range(0, len(buf), rowlen):
        filter_type = buf[row_start]
        first = row_start + 1
        if filter_type != 0:    # 0 is the None filter: nothing to undo
            undo = reconstruct.get(filter_type)
            if undo is None:
                return None, 'Unsupported PNG filter %d' % filter_type
            undo(first, prior_row)
        prior_row = buf[first:first + columnbytes]  # without filter_type

    # Drop every filter-type byte in one pass.
    del buf[::rowlen]
    return buf, None
def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Reverse PNG prediction on decompressed stream data.

        PNG prediction makes certain kinds of data more compressible:
        before compression, each row is stored either unchanged or
        as deltas against neighbouring bytes (the previous byte or
        the previous row).  The choice is made per row and recorded
        in a filter-type byte prepended to the row.

        Recent PDF files commonly use this technique for Xref stream
        objects, which are quite regular.

        The decoding itself is done by flate_png_impl; this wrapper
        only passes a successful result through from_array.

        Returns (data, error).  error is None on success; data is
        None when decoding failed.
    '''
    decoded, error = flate_png_impl(data, predictor, columns, colors, bpc)
    if decoded is None:
        return None, error
    return from_array(decoded), error