Coverage for casanova/casanova/reverse_reader.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# =============================================================================
# Casanova Reverse Reader
# =============================================================================
#
# A reader reading the file backwards in order to read last lines in constant
# time. This is sometimes useful to be able to resume some computations
# where they were left off.
#
9import csv
10from io import DEFAULT_BUFFER_SIZE
11from file_read_backwards.file_read_backwards import FileReadBackwardsIterator
12from ebbe import with_is_last
14from casanova.reader import Reader
15from casanova.utils import ensure_open
16from casanova.exceptions import EmptyFileError, MissingColumnError
# Unique sentinel compared by identity (`is END_OF_FILE`): used below as the
# `next()` default in `ReverseReader.last_cell` and as the initial "no row seen
# yet" value of `batch` in `ReverseReader.last_batch`.
18END_OF_FILE = object()
class Batch(object):
    """Trailing group of rows sharing the same value in a given column.

    Attributes (declared through ``__slots__``):
        value: the value shared by the rows of the batch.
        finished: boolean flag, False by default.
        cursor: optional cursor attached to the batch, None by default.
        rows: list of the rows collected for this batch.

    Instances define ``__eq__`` without ``__hash__`` and are therefore
    unhashable.
    """

    __slots__ = ('value', 'finished', 'cursor', 'rows')

    def __init__(self, value, finished=False, cursor=None, rows=None):
        self.value = value
        self.finished = finished
        self.cursor = cursor

        # Any falsy `rows` (None or an empty list) results in a fresh list.
        self.rows = rows or []

    def __eq__(self, other):
        """Compare two batches attribute by attribute.

        Returns NotImplemented for anything that is not a Batch so that
        expressions such as ``batch == None`` evaluate to False instead of
        raising an AttributeError.
        """
        if not isinstance(other, Batch):
            return NotImplemented

        return (
            self.value == other.value and
            self.finished == other.finished and
            self.cursor == other.cursor and
            self.rows == other.rows
        )

    def __iter__(self):
        """Iterate over the rows of the batch."""
        return iter(self.rows)

    def collect(self, pos):
        """Return the set of distinct cells found at index `pos` of each row."""
        return {row[pos] for row in self}

    def __repr__(self):
        class_name = self.__class__.__name__

        # Only the number of rows is displayed, not their content.
        return (
            '<%(class_name)s value=%(value)s finished=%(finished)s cursor=%(cursor)s rows=%(rows)i>'
        ) % {
            'class_name': class_name,
            'value': self.value,
            'finished': self.finished,
            'cursor': self.cursor,
            'rows': len(self.rows)
        }
class ReverseReader(Reader):
    """Reader yielding the rows of a CSV file from the last one to the first.

    The file is read backwards line by line through a second, binary handle
    opened on the same path, so that the last rows can be retrieved without
    reading the whole file.
    """

    namespace = 'casanova.reverse_reader'

    def __init__(self, input_file, quotechar=None, **kwargs):
        """Set up the backwards reader on top of the regular Reader.

        Args:
            input_file: forwarded to `Reader.__init__`.
            quotechar: quote character of the file, '"' when not given.
            **kwargs: forwarded to `Reader.__init__`.
        """
        super().__init__(input_file, quotechar=quotechar, **kwargs)
        quotechar = quotechar or '"'

        # Separate binary handle on the same path, consumed backwards.
        self.backwards_file = ensure_open(self.input_file.name, mode='rb')

        backwards_iterator = FileReadBackwardsIterator(
            self.backwards_file,
            self.input_file.encoding,
            DEFAULT_BUFFER_SIZE
        )

        def correctly_escaped_backwards_iterator():
            # Physical lines are merged (the earlier line being prepended)
            # until the accumulated text holds an even number of quote
            # characters, so that a quoted cell spanning several lines is
            # handed to the csv parser as a single record.
            acc = None

            for line in backwards_iterator:
                acc = line if acc is None else line + '\n' + acc

                if acc.count(quotechar) % 2 == 0:
                    yield acc
                    acc = None

            # Leftover with unbalanced quotes: yielded as is.
            if acc is not None:
                yield acc

        # The csv parser must use the same quote character as the merging
        # logic above, else records merged on a custom quotechar would be
        # parsed with the default '"' one.
        # NOTE(review): other dialect options possibly passed through
        # `kwargs` (e.g. a delimiter) are not forwarded here -- confirm
        # against `Reader.__init__`.
        backwards_reader = csv.reader(
            correctly_escaped_backwards_iterator(),
            quotechar=quotechar
        )

        def generator():
            for is_last, row in with_is_last(backwards_reader):
                # The last row read backwards is the first one of the file:
                # it is skipped unless `self.fieldnames` is None (presumably
                # because it is then a regular row and not a header).
                if not is_last or self.fieldnames is None:
                    yield row

            # Both handles are released once the file is exhausted.
            self.close()

        self.reader = generator()

        # Presumably drops rows pre-read forwards by the parent Reader when
        # there are no headers -- TODO confirm against `Reader`.
        if self.fieldnames is None:
            self.buffered_rows = []

    def close(self):
        """Close the parent reader, then the backwards binary handle."""
        try:
            super().close()
        finally:
            # Closed even if the parent's close raises, to avoid leaking
            # the handle.
            self.backwards_file.close()

    @staticmethod
    def last_cell(input_file, column, **kwargs):
        """Return the first item yielded by `reader.cells(column)` when
        reading `input_file` backwards.

        Raises:
            EmptyFileError: if nothing is yielded.
        """
        with ReverseReader(input_file, **kwargs) as reader:
            record = next(reader.cells(column), END_OF_FILE)

            if record is END_OF_FILE:
                raise EmptyFileError

            return record

    @staticmethod
    def last_batch(input_file, batch_value, batch_cursor, end_symbol, **kwargs):
        """Build the Batch made of the trailing rows of `input_file`.

        Rows are consumed from the end of the file. The batch value is the
        `batch_value` cell of the first row read. Scanning stops, and the
        batch is returned, as soon as a row:
            - has a different `batch_value` cell,
            - has a `batch_cursor` cell equal to `end_symbol` (the batch is
              then flagged as finished),
            - has a non-empty `batch_cursor` cell (stored as the batch's
              cursor).
        Any other row is appended to the batch's rows, hence in reverse file
        order. If rows run out first, the batch is returned as is.

        Raises:
            MissingColumnError: if `batch_value` or `batch_cursor` is not
                found in the reader's headers.
            EmptyFileError: if no row could be read.
        """
        with ReverseReader(input_file, **kwargs) as reader:
            batch = END_OF_FILE

            if batch_value not in reader.headers:
                raise MissingColumnError(batch_value)

            if batch_cursor not in reader.headers:
                raise MissingColumnError(batch_cursor)

            batch_value_pos = reader.headers[batch_value]
            batch_cursor_pos = reader.headers[batch_cursor]

            for row in reader:
                value = row[batch_value_pos]
                cursor = row[batch_cursor_pos]

                # First row read: it defines the value of the batch.
                if batch is END_OF_FILE:
                    batch = Batch(value)

                # Reached the rows of a previous batch.
                if value != batch.value:
                    return batch

                if cursor == end_symbol:
                    batch.finished = True
                    return batch

                if cursor:
                    batch.cursor = cursor
                    return batch

                batch.rows.append(row)

            if batch is END_OF_FILE:
                raise EmptyFileError

            return batch