Coverage for pdfrw/pdfrw/tokens.py: 62% (133 statements)


# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
A tokenizer for PDF streams.

In general, documentation used was "PDF reference",
sixth edition, for PDF version 1.7, dated November 2006.

'''

import re
import itertools
from .objects import PdfString, PdfObject
from .objects.pdfname import BasePdfName
from .errors import log, PdfParseError
from .py23_diffs import nextattr, intern


def linepos(fdata, loc):
    line = fdata.count('\n', 0, loc) + 1
    line += fdata.count('\r', 0, loc) - fdata.count('\r\n', 0, loc)
    col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
    return line, col
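
# Illustrative example (not part of the original source): in the buffer
# 'abc\ndef', offset 5 points at the 'e' on the second line, so
# linepos('abc\ndef', 5) returns (2, 2) -- 1-based line and column.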


class PdfTokens(object):

    # Table 3.1, page 50 of reference, defines whitespace
    eol = '\n\r'
    whitespace = '\x00 \t\f' + eol

    # Text on page 50 defines delimiter characters
    # Escape the ]
    delimiters = r'()<>{}[\]/%'

    # "normal" stuff is all but delimiters or whitespace.

    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
                                             whitespace)

    p_comment = r'\%%[^%s]*' % eol

    # This will get the bulk of literal strings.
    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'

    # This will get more pieces of literal strings
    # (Don't ask me why, but it hangs without the trailing ?.)
    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'

    # A hex string.  This one's easy.
    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace

    p_dictdelim = r'\<\<|\>\>'
    p_name = r'/[^%s%s]*' % (delimiters, whitespace)

    p_catchall = '[^%s]' % whitespace

    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
                        p_literal_string, p_comment, p_catchall])
    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
                         re.DOTALL).finditer
    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                          whitespace), re.DOTALL).finditer
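
    # Illustrative note (not in the original source): findtok splits a
    # content fragment such as '/Type /Page 612 0 R' into the tokens
    # '/Type', '/Page', '612', '0', 'R'; the trailing '[%s]*' whitespace
    # class in each compiled pattern consumes whitespace after the token.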

    def _gettoks(self, startloc, intern=intern,
                 delimiters=delimiters, findtok=findtok,
                 findparen=findparen, PdfString=PdfString,
                 PdfObject=PdfObject, BasePdfName=BasePdfName):
        ''' Given a source data string and a location inside it,
            _gettoks generates tokens.  Each token is yielded as a
            string subclass; the (starting file loc, ending file loc)
            span of the current token is kept in self.current[0], and
            the ending file loc is past any trailing whitespace.

            The main complication here is the literal strings, which
            can contain nested parentheses.  In order to cope with these
            we can discard the current iterator and loop back to the
            top to get a fresh one.

            We could use re.search instead of re.finditer, but that's slower.
        '''
        fdata = self.fdata
        current = self.current = [(startloc, startloc)]
        cache = {}
        get_cache = cache.get
        while 1:
            for match in findtok(fdata, current[0][1]):
                current[0] = tokspan = match.span()
                token = match.group(1)
                firstch = token[0]
                toktype = intern
                if firstch not in delimiters:
                    toktype = PdfObject
                elif firstch in '/<(%':
                    if firstch == '/':
                        # PDF Name
                        toktype = BasePdfName
                    elif firstch == '<':
                        # << dict delim, or < hex string >
                        if token[1:2] != '<':
                            toktype = PdfString
                    elif firstch == '(':
                        # Literal string
                        # It's probably simple, but maybe not
                        # Nested parentheses are a bear, and if
                        # they are present, we exit the for loop
                        # and get back in with a new starting location.
                        ends = None  # For broken strings
                        if fdata[match.end(1) - 1] != ')':
                            nest = 2
                            m_start, loc = tokspan
                            for match in findparen(fdata, loc):
                                loc = match.end(1)
                                ending = fdata[loc - 1] == ')'
                                nest += 1 - ending * 2
                                if not nest:
                                    break
                                if ending and ends is None:
                                    ends = loc, match.end(), nest
                            token = fdata[m_start:loc]
                            current[0] = m_start, match.end()
                            if nest:
                                # There is one possible recoverable error
                                # seen in the wild -- some stupid generators
                                # don't escape (.  If this happens, just
                                # terminate on first unescaped ).  The string
                                # won't be quite right, but that's a science
                                # fair project for another time.
                                (self.error, self.exception)[not ends](
                                    'Unterminated literal string')
                                loc, ends, nest = ends
                                token = fdata[m_start:loc] + ')' * nest
                                current[0] = m_start, ends
                        toktype = PdfString
                    elif firstch == '%':
                        # Comment
                        if self.strip_comments:
                            continue
                else:
                    self.exception(('Tokenizer logic incorrect -- '
                                    'should never get here'))

                newtok = get_cache(token)
                if newtok is None:
                    newtok = cache[token] = toktype(token)
                yield newtok
                if current[0] is not tokspan:
                    break
            else:
                if self.strip_comments:
                    break
                # PEP 479: raising StopIteration inside a generator is a
                # RuntimeError on Python 3.7+, so end the generator with
                # a plain return instead.
                return
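
    # Note on nested literal strings (illustrative, not in the original
    # source): for input '(a (b) c)', the first findtok match is '(a ('
    # because p_literal_string stops at the nested paren.  Since that match
    # does not end in ')', the findparen loop extends it piece by piece --
    # 'b)' then ' c)' -- bumping nest by +1 for '(' and -1 for ')', and
    # recovers the full token '(a (b) c)' once nest reaches zero.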

    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.iterator = iterator = self._gettoks(startloc)
        self.msgs_dumped = None if verbose else set()
        self.next = getattr(iterator, nextattr)
        self.current = [(startloc, startloc)]

    def setstart(self, startloc):
        ''' Change the starting location.
        '''
        current = self.current
        if startloc != current[0][1]:
            current[0] = startloc, startloc

    def floc(self):
        ''' Return the current file position
            (where the next token will be retrieved)
        '''
        return self.current[0][1]
    floc = property(floc, setstart)

    def tokstart(self):
        ''' Return the file position of the most
            recently retrieved token.
        '''
        return self.current[0][0]
    tokstart = property(tokstart, setstart)

    def __iter__(self):
        return self.iterator

    def multiple(self, count, islice=itertools.islice, list=list):
        ''' Retrieve multiple tokens
        '''
        return list(islice(self, count))
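    # e.g. (illustrative): PdfTokens('1 0 obj').multiple(3) returns the
    # next three tokens, ['1', '0', 'obj'], each wrapped as a PdfObject.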

    def next_default(self, default='nope'):
        for result in self:
            return result
        return default

    def msg(self, msg, *arg):
        dumped = self.msgs_dumped
        if dumped is not None:
            if msg in dumped:
                return
            dumped.add(msg)
        if arg:
            msg %= arg
        fdata = self.fdata
        begin, end = self.current[0]
        if begin >= len(fdata):
            return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
        line, col = linepos(fdata, begin)
        if end > begin:
            tok = fdata[begin:end].rstrip()
            if len(tok) > 30:
                tok = tok[:26] + ' ...'
            return ('%s (line=%d, col=%d, token=%s)' %
                    (msg, line, col, repr(tok)))
        return '%s (line=%d, col=%d)' % (msg, line, col)

    def warning(self, *arg):
        s = self.msg(*arg)
        if s:
            log.warning(s)

    def error(self, *arg):
        s = self.msg(*arg)
        if s:
            log.error(s)

    def exception(self, *arg):
        raise PdfParseError(self.msg(*arg))
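

# Usage sketch (illustrative; not part of the original module).  Driving
# the tokenizer over a small content fragment; each token comes back as
# the appropriate pdfrw type (BasePdfName, PdfString, or PdfObject):
#
#     source = PdfTokens('/Type /Catalog (a (nested) string) 42')
#     for token in source:
#         print(type(token).__name__, repr(token))
#
# The floc property is read/write: reading it gives the position where the
# next token will be fetched, and assigning to it repositions the tokenizer.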