Coverage for casanova/casanova/reader.py: 91%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# =============================================================================
2# Casanova Reader
3# =============================================================================
4#
5# A fast but comfortable CSV reader based upon csv.reader to avoid dealing
6# with csv.DictReader which is nice but very slow.
7#
8import csv
9from collections import deque
10from collections.abc import Iterable
11from io import IOBase
12from operator import itemgetter
14from casanova.defaults import DEFAULTS
15from casanova.utils import ensure_open, suppress_BOM, size_of_row_in_file
16from casanova.exceptions import EmptyFileError, MissingColumnError, NoHeadersError
def validate_multiplex_tuple(multiplex):
    """Tell whether `multiplex` is a well-formed multiplexing specification.

    A valid specification is a tuple of 2 or 3 items that are all strings.

    Args:
        multiplex: The candidate value to check.

    Returns:
        bool: True when the value is a 2-tuple or 3-tuple of strings,
            False otherwise.
    """
    if not isinstance(multiplex, tuple):
        return False

    if not 2 <= len(multiplex) <= 3:
        return False

    for item in multiplex:
        if not isinstance(item, str):
            return False

    return True
class DictLikeRow(object):
    """Read-only view giving access to the cells of a row by column name.

    The view stores a reference to a `name -> position` mapping and to the
    row itself (nothing is copied), and resolves `view[name]` or `view.name`
    to `row[mapping[name]]`.

    Args:
        mapping: Mapping from column name to position in the row.
        row: Indexable sequence of cells.
    """
    __slots__ = ('__mapping', '__row')

    def __init__(self, mapping, row):
        self.__mapping = mapping
        self.__row = row

    def __getitem__(self, key):
        """Return the cell of column `key`; raise KeyError if it is unknown."""
        return self.__row[self.__mapping[key]]

    def __getattr__(self, key):
        """Attribute-style access to a cell (`row.name`).

        Only called when regular attribute lookup failed. Unknown column
        names keep raising KeyError, as with item access.

        Raises:
            AttributeError: For the instance's own slots when they are not
                set yet, and for dunder names that are not columns.
            KeyError: For any other name that is not a column.
        """
        # Our own slots end up here when they are not set yet (e.g. on an
        # instance being rebuilt by `copy` or `pickle`). Forwarding them to
        # `__getitem__` would read the same unset slots again and recurse
        # forever.
        if key in ('_DictLikeRow__mapping', '_DictLikeRow__row'):
            raise AttributeError(key)

        # Protocol probes (`__setstate__`, `__deepcopy__`...) expect an
        # AttributeError when the hook is absent, not a KeyError.
        if (
            key.startswith('__') and
            key.endswith('__') and
            key not in self.__mapping
        ):
            raise AttributeError(key)

        return self.__getitem__(key)
class Headers(object):
    """Index of the column names of a CSV file.

    Maps each column name to its position in a row. When the same name
    appears several times in `fieldnames`, the last position wins since the
    index is a plain dict.

    Args:
        fieldnames: Iterable of column names, in row order.
    """

    def __init__(self, fieldnames):
        self.__mapping = {h: i for i, h in enumerate(fieldnames)}

    def rename(self, old_name, new_name):
        """Rename a column in place, keeping its position.

        Raises:
            TypeError: If both names are identical.
            KeyError: If `old_name` is not a known column.
        """
        if old_name == new_name:
            raise TypeError

        self.__mapping[new_name] = self[old_name]
        del self.__mapping[old_name]

    def __len__(self):
        return len(self.__mapping)

    def __getitem__(self, key):
        """Return the position of column `key`; raise KeyError if unknown."""
        return self.__mapping[key]

    def __getattr__(self, key):
        """Attribute-style access to a column position (`headers.name`).

        Only called when regular attribute lookup failed. Unknown column
        names keep raising KeyError, as with item access.

        Raises:
            AttributeError: For the instance's own mapping when it is not
                set yet, and for dunder names that are not columns.
            KeyError: For any other name that is not a column.
        """
        # The mapping itself ends up here when it is not set yet (e.g. on an
        # instance being rebuilt by `copy` or `pickle`). Forwarding it to
        # `__getitem__` would read the missing mapping again and recurse
        # forever.
        if key == '_Headers__mapping':
            raise AttributeError(key)

        # Protocol probes (`__setstate__`, `__deepcopy__`...) expect an
        # AttributeError when the hook is absent, not a KeyError.
        if (
            key.startswith('__') and
            key.endswith('__') and
            key not in self.__mapping
        ):
            raise AttributeError(key)

        return self.__getitem__(key)

    def __contains__(self, key):
        return key in self.__mapping

    def __iter__(self):
        """Yield `(name, position)` pairs ordered by position."""
        yield from sorted(self.__mapping.items(), key=itemgetter(1))

    def as_dict(self):
        """Return a shallow copy of the `name -> position` mapping."""
        return self.__mapping.copy()

    def get(self, key, default=None):
        """Return the position of column `key`, or `default` if unknown."""
        return self.__mapping.get(key, default)

    def collect(self, keys):
        """Return the positions of the given columns, as a list.

        Raises:
            KeyError: If one of the columns is unknown.
        """
        return [self[k] for k in keys]

    def wrap(self, row):
        """Return a `DictLikeRow` view over `row` using the current index."""
        return DictLikeRow(self.__mapping, row)

    def __repr__(self):
        class_name = self.__class__.__name__

        parts = []

        for h, i in self:
            # Names that are not valid identifiers are quoted for legibility
            if h.isidentifier():
                parts.append(' %s=%s' % (h, i))
            else:
                parts.append(' "%s"=%s' % (h, i))

        return '<' + class_name + ''.join(parts) + '>'
class Reader(object):
    """CSV reader built on top of `csv.reader`, yielding rows as lists.

    Args:
        input_file: A file object (`IOBase`), a path (str) or an iterable of
            already-parsed rows.
        no_headers (bool): When True the first row is treated as data and
            `headers` stays None. Otherwise the first row is consumed and
            indexed as headers.
        encoding (str): Passed to `ensure_open` when `input_file` is a path.
        dialect, quotechar, delimiter: Forwarded to `csv.reader` when given.
            Not used when `input_file` is an iterable of rows.
        prebuffer_bytes (int): Rows are read ahead into `buffered_rows`
            until this budget (measured by `size_of_row_in_file`) is
            reached. Defaults to `DEFAULTS['prebuffer_bytes']`. Skipped when
            `total` is given.
        total (int): Known number of rows, stored as-is in `total`.
        multiplex (tuple): `(column, split_char)` or
            `(column, split_char, new_name)`. Rows whose `column` cell
            contains `split_char` are emitted once per split value.

    Raises:
        TypeError: On an unsupported `input_file`, an invalid `multiplex`
            or an invalid `prebuffer_bytes`.
        EmptyFileError: If no row at all can be read.
        MissingColumnError: If the multiplexed column is not in the headers.
    """
    namespace = 'casanova.reader'

    def __init__(self, input_file, no_headers=False, encoding='utf-8',
                 dialect=None, quotechar=None, delimiter=None, prebuffer_bytes=None,
                 total=None, multiplex=None):

        # Resolving global defaults
        if prebuffer_bytes is None:
            prebuffer_bytes = DEFAULTS['prebuffer_bytes']

        # Detecting input type. Nothing is opened yet so that the argument
        # checks below cannot leave a dangling file handle behind.
        if isinstance(input_file, IOBase):
            input_type = 'file'

        elif isinstance(input_file, str):
            input_type = 'path'

        elif isinstance(input_file, Iterable):
            input_type = 'iterable'

        else:
            raise TypeError('expecting a file, a path or an iterable of rows')

        if multiplex is not None and not validate_multiplex_tuple(multiplex):
            raise TypeError('`multiplex` should be a 2-tuple or 3-tuple containing the column to split, the split character and optionally a new name for the column')

        reader_kwargs = {}

        if dialect is not None:
            reader_kwargs['dialect'] = dialect
        if quotechar is not None:
            reader_kwargs['quotechar'] = quotechar
        if delimiter is not None:
            reader_kwargs['delimiter'] = delimiter

        if input_type == 'path':
            input_file = ensure_open(input_file, encoding=encoding)
        elif input_type == 'iterable':
            input_file = iter(input_file)

        self.input_type = input_type
        self.input_file = input_file

        self.buffered_rows = deque()
        self.was_completely_buffered = False
        self.total = total
        self.headers = None
        self.expected_row_length = None

        try:
            if input_type == 'iterable':
                self.reader = input_file
            else:
                self.reader = csv.reader(input_file, **reader_kwargs)

            # Reading headers
            self.__read_first_row(no_headers)

            # Multiplexing
            if multiplex is not None:
                self.__setup_multiplexing(multiplex)

            # Prebuffering
            if prebuffer_bytes is not None and self.total is None:
                self.__prebuffer(prebuffer_bytes)

        except BaseException:
            # The caller never gets a reader to close when construction
            # fails, so a file we opened ourselves must be released here.
            # A handle given by the caller remains the caller's business.
            if input_type == 'path':
                input_file.close()

            raise

    def __read_first_row(self, no_headers):
        # Consume the first row, either as headers or as a first data row.
        try:
            first_row = next(self.reader)
        except StopIteration:
            raise EmptyFileError

        if no_headers:
            # The row is data: keep it for iteration and remember its length
            self.buffered_rows.append(first_row)
            self.expected_row_length = len(first_row)
            return

        if first_row:
            # NOTE(review): presumably strips a leading byte order mark from
            # the first header name; `suppress_BOM` is defined elsewhere.
            first_row[0] = suppress_BOM(first_row[0])

        self.headers = Headers(first_row)

    def __setup_multiplexing(self, multiplex):
        # Wrap the underlying reader to emit one row per split value.
        multiplex_column = multiplex[0]
        split_char = multiplex[1]

        if multiplex_column not in self.headers:
            raise MissingColumnError(multiplex_column)

        multiplex_pos = self.headers[multiplex_column]

        # New col
        if len(multiplex) == 3:
            self.headers.rename(multiplex_column, multiplex[2])

        original_reader = self.reader

        def reader_wrapper():
            for row in original_reader:
                cell = row[multiplex_pos]

                if not cell or split_char not in cell:
                    yield row

                else:
                    for value in cell.split(split_char):
                        # Each emitted row is a fresh copy of the original
                        copy = list(row)
                        copy[multiplex_pos] = value
                        yield copy

        self.reader = reader_wrapper()

    def __prebuffer(self, prebuffer_bytes):
        # Read rows ahead until the byte budget is spent or input is exhausted.
        if not isinstance(prebuffer_bytes, int) or prebuffer_bytes < 1:
            raise TypeError('expecting a positive integer as "prebuffer_bytes" kwarg')

        buffered_bytes = 0

        while buffered_bytes < prebuffer_bytes:
            row = next(self.reader, None)

            if row is None:
                # Everything fits in the buffer, so the row count is known
                self.was_completely_buffered = True
                self.total = len(self.buffered_rows)
                break

            buffered_bytes += size_of_row_in_file(row)
            self.buffered_rows.append(row)

    def __repr__(self):
        # A reader created with `no_headers=True` has no column to display
        if self.headers is None:
            return '<%s>' % self.namespace

        columns_info = ' '.join('%s=%s' % t for t in self.headers)

        return '<%s %s>' % (self.namespace, columns_info)

    @property
    def fieldnames(self):
        """List of column names in row order, or None without headers."""
        if self.headers is None:
            return None

        return [k for k, v in self.headers]

    @property
    def row_len(self):
        """Expected number of cells per row.

        Length of the first row when reading without headers, number of
        headers otherwise.
        """
        if self.expected_row_length is not None:
            return self.expected_row_length

        return len(self.headers)

    def iter(self):
        """Yield rows: buffered ones first, then the rest of the input.

        Buffered rows are removed from the buffer as they are yielded.
        """
        while self.buffered_rows:
            yield self.buffered_rows.popleft()

        yield from self.reader

    def wrap(self, row):
        """Return a by-name view over `row`, based on the reader's headers."""
        return self.headers.wrap(row)

    def __iter__(self):
        return self.iter()

    def __cells(self, column, with_rows=False):
        # Resolve `column` (name or position) then build the cell iterator.
        if not isinstance(column, int):
            if self.headers is None:
                raise NoHeadersError

            pos = self.headers.get(column)

            if pos is None:
                raise MissingColumnError(column)
        else:
            if column >= self.row_len:
                raise MissingColumnError(column)

            pos = column

        if with_rows:
            def iterator():
                for row in self.iter():
                    yield row, row[pos]
        else:
            def iterator():
                for row in self.iter():
                    yield row[pos]

        return iterator()

    def cells(self, column, with_rows=False):
        """Iterate over the cells of a single column.

        Args:
            column (str or int): Column name or position.
            with_rows (bool): When True, yield `(row, cell)` pairs instead
                of cells alone.

        Raises:
            TypeError: If `column` is neither a str nor an int.
            NoHeadersError: If a name is given but the reader has no headers.
            MissingColumnError: If the column cannot be found.
        """
        if not isinstance(column, (str, int)):
            raise TypeError

        return self.__cells(column, with_rows=with_rows)

    def close(self):
        """Close the underlying file.

        Applies both to a file object given by the caller and to a file the
        reader opened itself from a path. Iterables are left untouched.
        """
        if self.input_type in ('file', 'path'):
            self.input_file.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @classmethod
    def count(cls, input_file, max_rows=None, **kwargs):
        """Count the rows yielded by a reader over `input_file`.

        Args:
            input_file: Same as for the constructor.
            max_rows (int): Optional upper bound, must be > 0.
            **kwargs: Forwarded to the constructor.

        Returns:
            The number of rows, or None as soon as more than `max_rows`
            rows have been seen.
        """
        assert max_rows is None or max_rows > 0, '%s.count: expected max_rows to be `None` or > 0.' % cls.namespace

        n = 0

        with cls(input_file, **kwargs) as reader:
            for _ in reader:
                n += 1

                if max_rows is not None and n > max_rows:
                    return None

        return n