Coverage for casanova/casanova/resuming.py: 30%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# =============================================================================
2# Casanova Resuming Strategies
3# =============================================================================
4#
5# A collection of process resuming strategies acknowledged by casanova
6# enrichers.
7#
8from threading import Lock
9from os.path import isfile, getsize
10from collections import deque
12from casanova._namedtuple import future_namedtuple
13from casanova.reader import Reader
14from casanova.reverse_reader import ReverseReader
15from casanova.exceptions import (
16 ResumeError,
17 NotResumableError,
18 MissingColumnError,
19 CorruptedIndexColumn
20)
21from casanova.contiguous_range_set import ContiguousRangeSet
class Resumer(object):
    '''
    Base class of the resuming strategies.

    A resumer wraps the path of an output file, an optional listener
    notified through `emit`, and a buffer of rows that can be drained by
    iterating over the resumer. Subclasses provide
    `get_insights_from_output` and, depending on the strategy, `resume`,
    `filter`, `get_state` and `already_done_count`.
    '''

    def __init__(self, path, listener=None):
        self.path = path
        self.listener = listener

        # Guards listener calls so that events are emitted one at a time
        self.lock = Lock()

        # Rows drained by `__iter__`
        self.buffer = deque()

        self.output_file = None
        self.popped = False

    def can_resume(self):
        '''Return whether `path` is an existing file that is not empty.'''
        if not isfile(self.path):
            return False

        return getsize(self.path) > 0

    def open(self, mode='a', encoding='utf-8', newline=''):
        '''Open `path` with the builtin `open` and return the file handle.'''
        return open(self.path, mode=mode, encoding=encoding, newline=newline)

    def open_output_file(self, **kwargs):
        '''
        Open and remember the output file, appending if resuming is
        possible and truncating otherwise.

        Raises ResumeError if the output file was already opened.
        '''
        if self.output_file is not None:
            raise ResumeError('output file is already opened')

        if self.can_resume():
            mode = 'a+'
        else:
            mode = 'w'

        handle = self.open(mode=mode, **kwargs)
        self.output_file = handle

        return handle

    def emit(self, event, payload):
        '''Forward `event` and `payload` to the listener, if there is one.'''
        if self.listener is not None:
            with self.lock:
                self.listener(event, payload)

    def get_insights_from_output(self, enricher):
        '''To be implemented by subclasses.'''
        raise NotImplementedError

    def filter_row(self, i, row):
        '''
        Return the result of `self.filter(i, row)`, emitting a
        'filter.row' event when that result is falsy.
        '''
        keep = self.filter(i, row)

        if keep:
            return keep

        self.emit('filter.row', (i, row))

        return keep

    def get_state(self):
        '''To be implemented by subclasses.'''
        raise NotImplementedError

    def pop_state(self):
        '''
        Return `get_state()` on the first call and None on every
        subsequent call.
        '''
        if self.popped:
            return None

        # Flagged before calling `get_state`, so a failing `get_state`
        # still counts as a pop
        self.popped = True

        return self.get_state()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        '''Close the output file if it was opened, then forget it.'''
        handle = self.output_file

        if handle is None:
            return

        handle.close()
        self.output_file = None

    def __repr__(self):
        fields = {
            'name': type(self).__name__,
            'path': self.path,
            'can_resume': self.can_resume()
        }

        return '<{name} path={path!r} can_resume={can_resume!r}>'.format(**fields)

    def already_done_count(self):
        '''To be implemented by subclasses.'''
        raise NotImplementedError

    def __iter__(self):
        '''
        Drain the buffer from the left, one row at a time.

        Not supported by resumers exposing a `filter` method.
        '''
        if hasattr(self, 'filter'):
            raise NotImplementedError

        while len(self.buffer) > 0:
            yield self.buffer.popleft()
class RowCountResumer(Resumer):
    '''
    Resumer counting the rows already written to the output file and
    consuming as many rows from the enricher when resuming.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.row_count = 0

    def get_insights_from_output(self, enricher):
        '''
        Count the rows read from the output file, emitting an 'output.row'
        event for each of them.
        '''
        self.row_count = 0

        seen = 0

        with self.open(mode='r') as output:
            for seen, output_row in enumerate(Reader(output), 1):
                self.emit('output.row', output_row)

        # Only recorded once the whole file has been read
        self.row_count = seen

    def resume(self, enricher):
        '''
        Consume `row_count` rows from the enricher, emitting an 'input.row'
        event for each of them.

        `next` is called without a default, so StopIteration propagates if
        the enricher yields fewer rows than were counted.
        '''
        rows = iter(enricher)

        for _ in range(self.row_count):
            self.emit('input.row', next(rows))

    def already_done_count(self):
        '''Return the number of rows counted in the output file.'''
        return self.row_count
class ThreadSafeResumer(Resumer):
    '''
    Resumer recording the indices found in the index column of the output
    file, so that already processed rows can be filtered out.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.already_done = ContiguousRangeSet()

    def get_insights_from_output(self, enricher):
        '''
        Collect the integer indices stored in the enricher's index column
        of the output file, emitting an 'output.row' event for each row.

        Raises MissingColumnError if the index column is not in the output
        headers, and CorruptedIndexColumn if a cell cannot be parsed as an
        integer.
        '''
        self.already_done = ContiguousRangeSet()

        with self.open(mode='r') as output:
            reader = Reader(output)

            index_pos = reader.headers.get(enricher.index_column)

            if index_pos is None:
                raise MissingColumnError(enricher.index_column)

            for output_row in reader:
                self.emit('output.row', output_row)

                try:
                    index = int(output_row[index_pos])
                except ValueError:
                    raise CorruptedIndexColumn

                self.already_done.add(index)

    def filter(self, i, row):
        '''Return True when index `i` was not found in the output file.'''
        # NOTE(review): `stateful_contains` presumably expects increasing
        # values of `i` across calls -- confirm against ContiguousRangeSet
        seen = self.already_done.stateful_contains(i)

        return not seen

    def already_done_count(self):
        '''Return the number of indices collected from the output file.'''
        return len(self.already_done)
# State record returned by `BatchResumer.get_state`: the cursor of the last
# batch found in the output and the set of values to skip for that batch.
BatchResumerContext = future_namedtuple(
    'BatchResumerContext',
    [
        'last_cursor',
        'values_to_skip'
    ]
)
class BatchResumer(Resumer):
    '''
    Resumer relying on the last batch found at the end of the output file
    to locate the input row where processing should start again.
    '''

    def __init__(self, path, value_column, **kwargs):
        super().__init__(path, **kwargs)
        self.value_column = value_column
        self.value_pos = None
        self.last_batch = None
        self.last_cursor = None
        self.values_to_skip = None

    def get_insights_from_output(self, enricher):
        '''
        Read the last batch from the output file, resolve the position of
        the value column and reset the resuming state.
        '''
        self.last_batch = ReverseReader.last_batch(
            self.path,
            batch_value=self.value_column,
            batch_cursor=enricher.cursor_column,
            end_symbol=enricher.end_symbol
        )
        self.value_pos = enricher.output_pos[self.value_column]
        self.last_cursor = None
        self.values_to_skip = None

    def get_state(self):
        '''Return the resuming state as a BatchResumerContext.'''
        return BatchResumerContext(self.last_cursor, self.values_to_skip)

    def resume(self, enricher):
        '''
        Consume rows from the enricher, emitting an 'input.row' event for
        each, until reaching the row whose value matches the last batch.

        If that batch was finished, the matching row is simply consumed.
        Otherwise the batch cursor and the values already present in the
        batch rows are recorded, and the matching row is pushed to the
        buffer so it can be yielded again when iterating over the resumer.

        Does nothing when no last batch was found. Raises NotResumableError
        if the enricher is exhausted before a matching row is found.
        '''
        last_batch = self.last_batch

        if last_batch is None:
            return

        rows = iter(enricher)

        while True:
            row = next(rows, None)

            # None doubles as the exhaustion sentinel
            if row is None:
                raise NotResumableError

            self.emit('input.row', row)

            # We haven't reached our batch yet
            if row[self.value_pos] != last_batch.value:
                continue

            # An unfinished batch requires recording additional information
            if not last_batch.finished:
                self.last_cursor = last_batch.cursor
                self.values_to_skip = {
                    batch_row[self.value_pos] for batch_row in last_batch.rows
                }
                self.buffer.append(row)

            break
class LastCellResumer(Resumer):
    '''
    Resumer whose state is the last cell of a given column of the output
    file.
    '''

    def __init__(self, path, value_column, **kwargs):
        super().__init__(path, **kwargs)
        self.value_column = value_column
        self.last_cell = None

    def get_insights_from_output(self, enricher):
        '''Read the last cell of `value_column` from the output file.'''
        self.last_cell = ReverseReader.last_cell(self.path, column=self.value_column)

    def get_state(self):
        '''Return the last cell read from the output file.'''
        return self.last_cell
class LastCellComparisonResumer(LastCellResumer):
    '''
    Resumer consuming input rows until one matches the last cell of the
    output file.

    Warning: this resumer will not work as desired if the column read
    contains duplicate values.
    '''

    def resume(self, enricher):
        '''
        Consume rows from the enricher, emitting an 'input.row' event for
        each, and stop right after the first row matching `last_cell`.
        '''
        # NOTE(review): rows are indexed with `value_column` itself, whereas
        # BatchResumer resolves a position through `enricher.output_pos`
        # first -- confirm this is intended when `value_column` is a name.
        rows = iter(enricher)

        for row in rows:
            self.emit('input.row', row)

            reached_last_cell = row[self.value_column] == self.last_cell

            if reached_last_cell:
                break