Coverage for casanova/casanova/enricher.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# =============================================================================
2# Casanova Enricher
3# =============================================================================
4#
5# A CSV reader/writer combo that can be used to read an input CSV file and
6# easily output a similar CSV file while editing, adding and filtering cells.
7#
8import csv
9from ebbe import with_is_last
11from casanova.resuming import (
12 LastCellComparisonResumer,
13 Resumer,
14 RowCountResumer,
15 ThreadSafeResumer,
16 BatchResumer
17)
18from casanova.exceptions import MissingColumnError
19from casanova.reader import (
20 Reader,
21 Headers
22)
class Enricher(Reader):
    """CSV reader/writer combo.

    Reads rows from `input_file` through the parent `Reader` and writes
    rows to `output_file`, optionally keeping only a subset of the input
    columns (`keep`) and appending extra columns (`add`).

    Args:
        input_file: forwarded to `Reader.__init__`.
        output_file: either a writable file-like object handed to
            `csv.writer`, or a `Resumer` instance whose type must be one
            of `__supported_resumers__`.
        no_headers: forwarded to `Reader.__init__`; when true no header
            row is written to the output.
        keep: optional selection of input columns to keep, resolved
            through `self.headers.collect`.
        add: optional sequence of names of the columns to append.
        **kwargs: forwarded to `Reader.__init__`.

    Raises:
        MissingColumnError: if `keep` cannot be resolved (the underlying
            lookup raised `KeyError`).
        TypeError: if `output_file` is a `Resumer` of an unsupported type.
    """

    __supported_resumers__ = (RowCountResumer, LastCellComparisonResumer)

    def __init__(self, input_file, output_file, no_headers=False,
                 keep=None, add=None, **kwargs):

        # Inheritance
        super().__init__(input_file, no_headers=no_headers, **kwargs)

        self.keep_indices = None

        # Work on a copy: extending the output field names must not
        # modify the reader's own `fieldnames` list as a side effect.
        self.output_fieldnames = (
            list(self.fieldnames) if self.fieldnames is not None else None
        )
        self.added_count = 0
        self.padding = None

        if keep is not None:
            try:
                self.keep_indices = self.headers.collect(keep)
            except KeyError:
                raise MissingColumnError

            self.output_fieldnames = self.filterrow(self.output_fieldnames)

        if add is not None:
            self.output_fieldnames = self.output_fieldnames + list(add)
            self.added_count = len(add)

            # Default cells written when a row comes without additions
            self.padding = [''] * self.added_count

        self.output_headers = None

        if self.headers is not None:
            self.output_headers = Headers(
                self.output_fieldnames
                if not no_headers
                else len(self.output_fieldnames)
            )

        # Resuming?
        self.resumer = None
        can_resume = False

        if isinstance(output_file, Resumer):
            if not isinstance(output_file, self.__class__.__supported_resumers__):
                raise TypeError('%s: does not support %s!' % (self.__class__.__name__, output_file.__class__.__name__))

            self.resumer = output_file

            can_resume = self.resumer.can_resume()

            if can_resume:
                self.resumer.get_insights_from_output(self)

                # Not every resumer exposes a `resume` hook
                if hasattr(self.resumer, 'resume'):
                    self.resumer.resume(self)

            output_file = self.resumer.open_output_file()

        # Instantiating writer
        self.writer = csv.writer(output_file)

        # Need to write headers?
        if not no_headers and not can_resume:
            self.writeheader()

    # NOTE: overriding #.iter and not #.__iter__ else other reader iterators won't work
    def iter(self):
        """Yield input rows, taking the resumer (if any) into account.

        - Without a resumer: rows of the parent reader, untouched.
        - With a resumer lacking a `filter` attribute: whatever iterating
          the resumer yields first, then the rows of the parent reader.
        - Otherwise: only the rows for which
          `resumer.filter_row(index, row)` is truthy.
        """
        if self.resumer is None:
            yield from super().iter()
            return

        # NOTE(review): the check is on `filter` while the call below is
        # `filter_row`; presumably `filter_row` wraps `filter` in the
        # resumer base class -- confirm against casanova.resuming.
        if not hasattr(self.resumer, 'filter'):
            yield from self.resumer
            yield from super().iter()
            return

        for i, row in enumerate(super().iter()):
            if self.resumer.filter_row(i, row):
                yield row

    def __repr__(self):
        # `headers` may be None (see __init__), a repr must not raise
        headers = self.headers if self.headers is not None else ()
        columns_info = ' '.join('%s=%s' % t for t in headers)

        # `resumable` is not assigned anywhere in this class: use it if
        # the instance happens to have it, else infer from the resumer.
        resumable = getattr(
            self,
            'resumable',
            getattr(self, 'resumer', None) is not None
        )

        return '<%s%s %s>' % (
            self.__class__.__name__,
            ' resumable' if resumable else '',
            columns_info
        )

    def filterrow(self, row):
        """Return `row` restricted to the kept columns.

        When no `keep` selection was given, `row` is returned as is (same
        object, not a copy).
        """
        if self.keep_indices is not None:
            row = [row[i] for i in self.keep_indices]

        return row

    def formatrow(self, row, add=None):
        """Return the output row: filtered `row` followed by `add`.

        When additional columns were declared and `add` is None, the
        empty-string padding is used instead. An AssertionError is raised
        if `add` does not match the number of declared additional
        columns, or if `add` is given while none were declared.
        """

        # Additions
        if self.added_count > 0:
            if add is None:
                add = self.padding
            else:
                assert len(add) == self.added_count, 'casanova.enricher.writerow: expected %i additional cells but got %i.' % (self.added_count, len(add))

            row = self.filterrow(row) + add

        # No additions
        else:
            assert add is None, 'casanova.enricher.writerow: expected no additions.'

            row = self.filterrow(row)

        return row

    def writeheader(self):
        """Write the output field names as a row."""
        self.writer.writerow(self.output_fieldnames)

    def writerow(self, row, add=None):
        """Format `row` (see `formatrow`) and write it to the output."""
        self.writer.writerow(self.formatrow(row, add))
class ThreadSafeEnricher(Enricher):
    """Enricher variant prepending an index column to the added columns.

    Iteration yields `(index, row)` pairs (via `enumerate`) and
    `writerow` expects that index back, writing it in the column named
    `index_column`, which is placed before the other added columns.

    Args:
        input_file: forwarded to `Enricher.__init__`.
        output_file: forwarded to `Enricher.__init__`; if a resumer, it
            must be a `ThreadSafeResumer`.
        add: optional sequence of names of the columns to append after
            the index column.
        index_column: name of the index column.
        **kwargs: forwarded to `Enricher.__init__`.
    """

    __supported_resumers__ = (ThreadSafeResumer,)

    def __init__(self, input_file, output_file, add=None,
                 index_column='index', **kwargs):

        self.index_column = index_column

        # `add` defaults to None: only the index column is added then
        extra_columns = list(add) if add is not None else []

        # Inheritance
        super().__init__(
            input_file,
            output_file,
            add=[index_column] + extra_columns,
            **kwargs
        )

    def __iter__(self):
        yield from enumerate(super().__iter__())

    def cells(self, column, with_rows=False):
        """Same as the parent's `cells`, each item prefixed by its index.

        Yields `(index, row, value)` when `with_rows` is true, else
        `(index, value)`.
        """
        if with_rows:
            for index, (row, value) in enumerate(super().cells(column, with_rows=True)):
                yield index, row, value
        else:
            yield from enumerate(super().cells(column))

    def writerow(self, index, row, add=None):
        """Write `row` followed by `index` and the `add` cells.

        When `add` is None, the non-index added columns are filled with
        empty strings.
        """
        if add is None:
            # `self.padding` has one slot per added column, index column
            # included; drop that slot since the index is prepended below.
            add = self.padding[1:]

        self.writer.writerow(self.formatrow(row, [index] + list(add)))
class BatchEnricher(Enricher):
    """Enricher variant writing several output rows per input row.

    A cursor column named `cursor_column` is placed before the other
    added columns.

    Args:
        input_file: forwarded to `Enricher.__init__`.
        output_file: forwarded to `Enricher.__init__`; if a resumer, it
            must be a `BatchResumer`.
        add: optional sequence of names of the columns to append after
            the cursor column.
        cursor_column: name of the cursor column.
        end_symbol: value written in the cursor column when `writebatch`
            is called without a cursor.
        **kwargs: forwarded to `Enricher.__init__`.
    """

    __supported_resumers__ = (BatchResumer,)

    def __init__(self, input_file, output_file, add=None, cursor_column='cursor',
                 end_symbol='end', **kwargs):

        self.cursor_column = cursor_column
        self.end_symbol = end_symbol

        # `add` defaults to None: only the cursor column is added then
        extra_columns = list(add) if add is not None else []

        # Inheritance
        super().__init__(
            input_file,
            output_file,
            add=[cursor_column] + extra_columns,
            **kwargs
        )

    def writebatch(self, row, batch, cursor=None):
        """Write one output row per item of `batch`, all based on `row`.

        Each item of `batch` holds the cells of the added columns other
        than the cursor one. The cursor cell is None for every written
        row except the last one of the batch, which gets `cursor`, or
        `self.end_symbol` when `cursor` is None. Nothing is written for
        an empty batch.
        """
        if cursor is None:
            cursor = self.end_symbol

        for is_last, addendum in with_is_last(batch):
            cursor_cell = cursor if is_last else None
            self.writerow(row, [cursor_cell] + list(addendum))