Coverage for pdfrw/pdfrw/tokens.py: 63%
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
A tokenizer for PDF streams.

In general, the documentation used was the "PDF reference",
sixth edition, for PDF version 1.7, dated November 2006.
'''

import re
import itertools
from .objects import PdfString, PdfObject
from .objects.pdfname import BasePdfName
from .errors import log, PdfParseError
from .py23_diffs import nextattr, intern


def linepos(fdata, loc):
    # Return the 1-based (line, column) of offset loc within fdata;
    # '\r', '\n', and '\r\n' each count as a single line ending.
    line = fdata.count('\n', 0, loc) + 1
    line += fdata.count('\r', 0, loc) - fdata.count('\r\n', 0, loc)
    col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
    return line, col
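
# For illustration (an assumed example, not part of the original source):
# with fdata = 'ab\ncd', linepos(fdata, 4) returns (2, 2) -- offset 4 is
# the 'd', the second character on the second line.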


class PdfTokens(object):

    # Table 3.1, page 50 of reference, defines whitespace
    eol = '\n\r'
    whitespace = '\x00 \t\f' + eol

    # Text on page 50 defines delimiter characters
    # Escape the ]
    delimiters = r'()<>{}[\]/%'

    # "normal" stuff is all but delimiters or whitespace.

    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
                                             whitespace)

    p_comment = r'\%%[^%s]*' % eol

    # This will get the bulk of literal strings.
    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'

    # This will get more pieces of literal strings
    # (Don't ask me why, but it hangs without the trailing ?.)
    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'

    # A hex string.  This one's easy.
    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace

    p_dictdelim = r'\<\<|\>\>'
    p_name = r'/[^%s%s]*' % (delimiters, whitespace)

    p_catchall = '[^%s]' % whitespace

    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
                        p_literal_string, p_comment, p_catchall])
    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
                         re.DOTALL).finditer
    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                          whitespace), re.DOTALL).finditer
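
    # A hedged illustration (not in the original source): given the fragment
    #     '/Name (lit) <AB> << >> 42 %c'
    # findtok matches the group(1) values '/Name', '(lit)', '<AB>', '<<',
    # '>>', '42', and '%c' in turn, with each match's span extended past
    # any trailing whitespace.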

    def _gettoks(self, startloc, intern=intern,
                 delimiters=delimiters, findtok=findtok,
                 findparen=findparen, PdfString=PdfString,
                 PdfObject=PdfObject, BasePdfName=BasePdfName):
        ''' Given a source data string and a location inside it,
            gettoks generates tokens.  Each token is yielded as a string
            (a PdfObject, PdfString, or BasePdfName, or a plain interned
            string for dictionary delimiters); the span of the most
            recently retrieved token is kept in self.current[0] as
            (<starting file loc>, <ending file loc>), where the ending
            file loc is past any trailing whitespace.

            The main complication here is the literal strings, which
            can contain nested parentheses.  In order to cope with these
            we can discard the current iterator and loop back to the
            top to get a fresh one.

            We could use re.search instead of re.finditer, but that's slower.
        '''
        fdata = self.fdata
        current = self.current = [(startloc, startloc)]
        cache = {}
        get_cache = cache.get
        while 1:
            for match in findtok(fdata, current[0][1]):
                current[0] = tokspan = match.span()
                token = match.group(1)
                firstch = token[0]
                toktype = intern
                if firstch not in delimiters:
                    toktype = PdfObject
                elif firstch in '/<(%':
                    if firstch == '/':
                        # PDF Name
                        toktype = BasePdfName
                    elif firstch == '<':
                        # << dict delim, or < hex string >
                        if token[1:2] != '<':
                            toktype = PdfString
                    elif firstch == '(':
                        # Literal string
                        # It's probably simple, but maybe not
                        # Nested parentheses are a bear, and if
                        # they are present, we exit the for loop
                        # and get back in with a new starting location.
                        ends = None  # For broken strings
                        if fdata[match.end(1) - 1] != ')':
                            # Nested: scan ahead chunk by chunk until the
                            # parentheses balance (nest counts open parens).
                            nest = 2
                            m_start, loc = tokspan
                            for match in findparen(fdata, loc):
                                loc = match.end(1)
                                ending = fdata[loc - 1] == ')'
                                # A chunk ending in ')' closes one level;
                                # otherwise another level was opened.
                                nest += 1 - ending * 2
                                if not nest:
                                    break
                                if ending and ends is None:
                                    # Remember the first close paren as a
                                    # recovery point for broken strings.
                                    ends = loc, match.end(), nest
                            token = fdata[m_start:loc]
                            current[0] = m_start, match.end()
                            if nest:
                                # There is one possible recoverable error
                                # seen in the wild -- some stupid generators
                                # don't escape (.  If this happens, just
                                # terminate on first unescaped ).  The string
                                # won't be quite right, but that's a science
                                # fair project for another time.
                                # (Log an error if a recovery point exists;
                                # raise a fatal exception otherwise.)
                                (self.error, self.exception)[not ends](
                                    'Unterminated literal string')
                                loc, ends, nest = ends
                                token = fdata[m_start:loc] + ')' * nest
                                current[0] = m_start, ends
                        toktype = PdfString
                    elif firstch == '%':
                        # Comment
                        if self.strip_comments:
                            continue
                    else:
                        self.exception(('Tokenizer logic incorrect -- '
                                        'should never get here'))

                newtok = get_cache(token)
                if newtok is None:
                    newtok = cache[token] = toktype(token)
                yield newtok
                if current[0] is not tokspan:
                    break
            else:
                if self.strip_comments:
                    break
                # PEP 479: end the generator by returning rather than
                # raising StopIteration (an error on Python 3.7+).
                return
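
    # A worked example of the literal-string recovery above (illustrative,
    # not from the original source): for fdata = '(a(b)c)', p_literal_string
    # initially matches only '(a(' (nest == 2); findparen then consumes
    # 'b)' (nest 1) and 'c)' (nest 0), so the token yielded is the complete
    # string '(a(b)c)'.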

    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
        self.fdata = fdata
        self.strip_comments = strip_comments
        self.iterator = iterator = self._gettoks(startloc)
        self.msgs_dumped = None if verbose else set()
        self.next = getattr(iterator, nextattr)
        self.current = [(startloc, startloc)]

    def setstart(self, startloc):
        ''' Change the starting location.
        '''
        current = self.current
        if startloc != current[0][1]:
            current[0] = startloc, startloc

    def floc(self):
        ''' Return the current file position
            (where the next token will be retrieved)
        '''
        return self.current[0][1]
    floc = property(floc, setstart)

    def tokstart(self):
        ''' Return the file position of the most
            recently retrieved token.
        '''
        return self.current[0][0]
    tokstart = property(tokstart, setstart)

    def __iter__(self):
        return self.iterator

    def multiple(self, count, islice=itertools.islice, list=list):
        ''' Retrieve multiple tokens
        '''
        return list(islice(self, count))

    def next_default(self, default='nope'):
        for result in self:
            return result
        return default

    def msg(self, msg, *arg):
        dumped = self.msgs_dumped
        if dumped is not None:
            if msg in dumped:
                return
            dumped.add(msg)
        if arg:
            msg %= arg
        fdata = self.fdata
        begin, end = self.current[0]
        if begin >= len(fdata):
            return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
        line, col = linepos(fdata, begin)
        if end > begin:
            tok = fdata[begin:end].rstrip()
            if len(tok) > 30:
                tok = tok[:26] + ' ...'
            return ('%s (line=%d, col=%d, token=%s)' %
                    (msg, line, col, repr(tok)))
        return '%s (line=%d, col=%d)' % (msg, line, col)

    def warning(self, *arg):
        s = self.msg(*arg)
        if s:
            log.warning(s)

    def error(self, *arg):
        s = self.msg(*arg)
        if s:
            log.error(s)

    def exception(self, *arg):
        raise PdfParseError(self.msg(*arg))
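

# A minimal usage sketch (hedged; this demo block and its sample fragment
# are not part of the original module).  Because of the relative imports
# above, run it as a module, e.g. "python -m pdfrw.tokens":
if __name__ == '__main__':
    source = '1 0 obj << /Type /Page >> endobj'
    tokens = PdfTokens(source)
    print(tokens.multiple(3))   # the first three tokens: 1, 0, obj
    for token in tokens:        # remaining tokens: <<, /Type, /Page, >>, endobj
        print(token)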