Coverage for pdfrw/pdfrw/uncompress.py: 58%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

119 statements  

1# A part of pdfrw (https://github.com/pmaupin/pdfrw) 

2# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas 

3# Copyright (C) 2012-2015 Nerijus Mika 

4# MIT license -- See LICENSE.txt for details 

5# Copyright (c) 2006, Mathieu Fenniak 

6# BSD license -- see LICENSE.txt for details 

7''' 

8A small subset of decompression filters. Should add more later. 

9 

10I believe, after looking at the code, that portions of the flate 

11PNG predictor were originally transcribed from PyPDF2, which is 

12probably an excellent source of additional filters. 

13''' 

14import array 

15from .objects import PdfDict, PdfName, PdfArray 

16from .errors import log 

17from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store 

18import math 

19 

def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    ''' Generate, in their original order, the members of mylist
        that are PdfDict instances whose stream attribute is not None.
        Everything else is silently skipped.
    '''
    for candidate in mylist:
        if not isinstance(candidate, PdfDict):
            continue
        if candidate.stream is None:
            continue
        yield candidate

24 

# Hack so we can import if zlib not available: when the zlib name is
# None, decompressobj is None too; otherwise it is zlib's
# decompressobj factory.
if zlib is None:
    decompressobj = None
else:
    decompressobj = zlib.decompressobj

27 

28 

def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Inflate, in place, the FlateDecode streams found in mylist.

        Stream objects without a Filter are skipped.  A stream whose
        filter is anything other than flate is left alone and
        reported with a warning; each distinct warning text is
        recorded in the warnings set and logged only the first time
        it is seen (the default set is created once, so it is shared
        by every call that does not pass its own).

        When decoding succeeds, the object's Filter is cleared and
        its stream is replaced by the decoded data -- exactly as
        produced by the decoder if leave_raw is true, otherwise after
        passing through convert_load.  When decoding fails, an error
        is logged and the object's Filter and stream are not changed.

        Returns False if any stream used an unsupported filter or
        failed to decode, True otherwise.
    '''
    success = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP

        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            success = False
            continue

        dco = decompress()
        try:
            data = dco.decompress(convert_store(obj.stream))
        except Exception as exc:
            error = str(exc)
        else:
            error = None
            if isinstance(parms, PdfArray):
                # Fold an array of parameter dicts into a single dict.
                merged = PdfDict()
                for entry in parms:
                    merged.update(entry)
                parms = merged
            if parms:
                predictor = int(parms.Predictor or 1)
                columns = int(parms.Columns or 1)
                colors = int(parms.Colors or 1)
                bpc = int(parms.BitsPerComponent or 8)
                if 10 <= predictor <= 15:
                    data, error = flate_png(data, predictor, columns,
                                            colors, bpc)
                elif predictor != 1:
                    error = ('Unsupported flatedecode predictor %s' %
                             repr(predictor))

        if error is None:
            assert not dco.unconsumed_tail
            # Trailing whitespace after the compressed data is tolerated.
            if dco.unused_data.strip():
                error = ('Unconsumed compression data: %s' %
                         repr(dco.unused_data[:20]))

        if error is not None:
            log.error('%s %s' % (error, repr(obj.indirect)))
            success = False
            continue

        obj.Filter = None
        obj.stream = data if leave_raw else convert_load(data)
    return success

83 

def flate_png_impl(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Undo PNG row prediction on data that has already been inflated.

        data is treated as a sequence of rows, each made of one
        filter-type byte followed by columnbytes bytes of filtered
        samples, where columnbytes is columns * colors * bpc bits
        rounded up to whole bytes.  When predictor is 15, data is
        first zero-padded up to a whole number of rows.

        Returns a (result, error) pair:
          - (array.array('B'), None) on success; the array holds the
            reconstructed rows with the filter-type bytes removed.
          - (None, message) if a row uses a filter type other than
            0-4, or if data is not a whole number of rows.
    '''

    # http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
    # https://www.w3.org/TR/2003/REC-PNG-20031110/#9Filters
    # Reconstruction functions
    # x: the byte being filtered;
    # a: the byte corresponding to x in the pixel immediately before
    #    the pixel containing x (or the byte immediately before x,
    #    when the bit depth is less than 8);
    # b: the byte corresponding to x in the previous scanline;
    # c: the byte corresponding to b in the pixel immediately before
    #    the pixel containing b (or the byte immediately before b,
    #    when the bit depth is less than 8).
    #
    # Each function rewrites data[start:start + length] in place.

    def subfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 1: Sub
        # Recon(x) = Filt(x) + Recon(a)
        # The first pixel has no left neighbour, so it is already final.
        for i in xrange(pixel_size, length):
            left = data[start + i - pixel_size]
            data[start + i] = (data[start + i] + left) % 256

    def upfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 2: Up
        # Recon(x) = Filt(x) + Recon(b)
        for i in xrange(length):
            data[start + i] = (data[start + i] + prior_row_data[i]) % 256

    def avgfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 3: Avg
        # Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2)
        for i in xrange(length):
            left = data[start + i - pixel_size] if i >= pixel_size else 0
            up = prior_row_data[i]
            # Operands are non-negative ints, so // is the required floor.
            data[start + i] = (data[start + i] + (left + up) // 2) % 256

    def paeth_predictor(a, b, c):
        # Pick whichever of a, b, c is closest to a + b - c,
        # preferring a, then b, on ties.
        p = a + b - c
        pa = abs(p - a)
        pb = abs(p - b)
        pc = abs(p - c)
        if pa <= pb and pa <= pc:
            return a
        elif pb <= pc:
            return b
        else:
            return c

    def paethfilter(data, prior_row_data, start, length, pixel_size):
        # filter type 4: Paeth
        # Recon(x) = Filt(x) + PaethPredictor(Recon(a), Recon(b), Recon(c))
        for i in xrange(length):
            if i >= pixel_size:
                left = data[start + i - pixel_size]
                up_left = prior_row_data[i - pixel_size]
            else:
                left = up_left = 0
            up = prior_row_data[i]
            predicted = paeth_predictor(left, up, up_left)
            data[start + i] = (data[start + i] + predicted) % 256

    # Filter type 0 (None) needs no work and is handled in the loop.
    reconstructors = {
        1: subfilter,
        2: upfilter,
        3: avgfilter,
        4: paethfilter,
    }

    columnbytes = ((columns * colors * bpc) + 7) // 8
    pixel_size = (colors * bpc + 7) // 8
    data = array.array('B', data)
    rowlen = columnbytes + 1
    if predictor == 15:
        padding = (rowlen - len(data)) % rowlen
        data.extend([0] * padding)
    if len(data) % rowlen != 0:
        # Report malformed input through the error return instead of
        # an assert, which disappears under -O.
        return None, ('PNG predictor data length %d is not a multiple'
                      ' of the row length %d' % (len(data), rowlen))

    # The row above the first one is defined to be all zeros.
    prior_row_data = [0] * columnbytes
    for row_index in xrange(0, len(data), rowlen):
        filter_type = data[row_index]
        start = row_index + 1
        if filter_type != 0:
            reconstruct = reconstructors.get(filter_type)
            if reconstruct is None:
                return None, 'Unsupported PNG filter %d' % filter_type
            reconstruct(data, prior_row_data, start, columnbytes,
                        pixel_size)
        # Reconstructed row, without its filter_type byte.
        prior_row_data = data[start:start + columnbytes]

    # Drop every filter-type byte in one pass; popping them one at a
    # time would shift the rest of the array once per row.
    del data[::rowlen]

    return data, None

176 

def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Reverse PNG prediction on inflated stream data.

        PNG prediction makes certain kinds of data compress
        better.  Ahead of compression, every row gets a leading
        type byte saying how that row was transformed: its bytes
        may be stored unchanged, as differences from the
        preceding byte, or as differences from the row above.

        Recent PDF files commonly apply this to Xref stream
        objects, whose contents are very regular.

        The work is delegated to flate_png_impl; a successful
        result is passed through from_array before being handed
        back.  Returns a (data, error) pair, with data None when
        flate_png_impl produced no data.
    '''
    decoded, error = flate_png_impl(data, predictor, columns, colors, bpc)
    if decoded is None:
        return decoded, error
    return from_array(decoded), error

194