Coverage for casanova/casanova/reverse_reader.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

88 statements  

1# ============================================================================= 

2# Casanova Reverse Reader 

3# ============================================================================= 

4# 

5# A reader reading the file backwards in order to read last lines in constant 

6# time. This is sometimes useful to be able to resume some computations 

7# where they were left off. 

8# 

9import csv 

10from io import DEFAULT_BUFFER_SIZE 

11from file_read_backwards.file_read_backwards import FileReadBackwardsIterator 

12from ebbe import with_is_last 

13 

14from casanova.reader import Reader 

15from casanova.utils import ensure_open 

16from casanova.exceptions import EmptyFileError, MissingColumnError 

17 

# Unique sentinel: used as the `next()` default and as the initial batch
# placeholder so that "nothing was read" can be told apart from any real value.
END_OF_FILE = object()

19 

20 

class Batch(object):
    """Container describing the last batch of rows found at the end of a file.

    A batch carries the value shared by its rows, whether it was marked as
    finished, the cursor it can be resumed from (if any), and the rows
    collected so far.
    """

    __slots__ = ('value', 'finished', 'cursor', 'rows')

    # Batches are mutable and compared by content, hence not hashable. This
    # only makes explicit what defining `__eq__` already implies.
    __hash__ = None

    def __init__(self, value, finished=False, cursor=None, rows=None):
        """Create a batch.

        Args:
            value: Value identifying the batch.
            finished: Whether the batch is complete. Defaults to False.
            cursor: Resume cursor, or None when there is none.
            rows: Rows belonging to the batch; a falsy value yields a new
                empty list.
        """
        self.value = value
        self.finished = finished
        self.cursor = cursor
        self.rows = rows or []

    def __eq__(self, other):
        """Return True when `other` is a Batch with identical attributes."""
        # Let Python fall back to its default handling for foreign types
        # instead of raising AttributeError on a missing attribute.
        if not isinstance(other, Batch):
            return NotImplemented

        return (
            self.value == other.value and
            self.finished == other.finished and
            self.cursor == other.cursor and
            self.rows == other.rows
        )

    def __iter__(self):
        """Iterate over the rows of the batch."""
        return iter(self.rows)

    def collect(self, pos):
        """Return the set of distinct cells found at index `pos` of each row."""
        return {row[pos] for row in self.rows}

    def __repr__(self):
        class_name = self.__class__.__name__

        # Only the number of rows is displayed, not their content.
        return (
            '<%(class_name)s value=%(value)s finished=%(finished)s cursor=%(cursor)s rows=%(rows)i>'
        ) % {
            'class_name': class_name,
            'value': self.value,
            'finished': self.finished,
            'cursor': self.cursor,
            'rows': len(self.rows)
        }

56 

57 

class ReverseReader(Reader):
    """Reader yielding the rows of a CSV file from the last one to the first.

    The file is read backwards through a second, binary handle so the last
    rows can be reached without reading the whole file.
    """

    namespace = 'casanova.reverse_reader'

    def __init__(self, input_file, quotechar=None, **kwargs):
        """Set up the backwards reading machinery.

        Args:
            input_file: Forwarded to `Reader`; the resulting
                `self.input_file` must expose `name` and `encoding`.
            quotechar: Quote character, defaulting to '"' when falsy.
            **kwargs: Forwarded untouched to `Reader`.
        """
        super().__init__(input_file, quotechar=quotechar, **kwargs)
        quotechar = quotechar or '"'

        # Separate binary handle on the same path, consumed from the end.
        self.backwards_file = ensure_open(self.input_file.name, mode='rb')

        backwards_iterator = FileReadBackwardsIterator(
            self.backwards_file,
            self.input_file.encoding,
            DEFAULT_BUFFER_SIZE
        )

        def correctly_escaped_backwards_iterator():
            # A quoted cell may span several physical lines. Lines are
            # prepended to the accumulator until it holds an even number of
            # quote characters, i.e. until every opened quote is closed.
            acc = None

            for line in backwards_iterator:
                if acc is not None:
                    acc = line + '\n' + acc
                else:
                    acc = line

                if acc.count(quotechar) % 2 == 0:
                    yield acc
                    acc = None

            # Flush whatever is left if the quotes never balanced.
            if acc is not None:
                yield acc

        # Parse with the same quote character used above to join the lines,
        # so that counting and parsing agree with each other.
        # NOTE(review): other dialect options possibly carried by `kwargs`
        # (e.g. a delimiter) are not visible here and are not forwarded.
        backwards_reader = csv.reader(
            correctly_escaped_backwards_iterator(),
            quotechar=quotechar
        )

        def generator():
            # The last row produced when reading backwards is the first line
            # of the file; it is skipped unless the reader has no fieldnames
            # (presumably because that line is the header row).
            for is_last, row in with_is_last(backwards_reader):
                if not is_last or self.fieldnames is None:
                    yield row

            # Release both handles once the file has been fully consumed.
            self.close()

        self.reader = generator()

        if self.fieldnames is None:
            # NOTE(review): presumably drops rows buffered by the parent
            # reader, which would be in forward order; confirm in `Reader`.
            self.buffered_rows = []

    def close(self):
        """Close the parent reader's resources and the backwards handle."""
        try:
            super().close()
        finally:
            # Always release the binary handle, even if the parent fails.
            self.backwards_file.close()

    @staticmethod
    def last_cell(input_file, column, **kwargs):
        """Return the cell of `column` found in the last row of the file.

        Args:
            input_file: File to read, forwarded to `ReverseReader`.
            column: Column to read, forwarded to `reader.cells`.
            **kwargs: Forwarded to `ReverseReader`.

        Raises:
            EmptyFileError: If the reader yields no cell at all.
        """
        with ReverseReader(input_file, **kwargs) as reader:
            record = next(reader.cells(column), END_OF_FILE)

            if record is END_OF_FILE:
                raise EmptyFileError

            return record

    @staticmethod
    def last_batch(input_file, batch_value, batch_cursor, end_symbol, **kwargs):
        """Rebuild the last batch of rows written at the end of the file.

        Rows are scanned from the end. The first row read sets the batch
        value, and scanning stops as soon as one of these happens:

        * a row holds a different batch value;
        * a row's cursor equals `end_symbol` (batch flagged as finished);
        * a row holds a non-empty cursor (stored as the batch cursor).

        Rows with an empty cursor are accumulated in the batch, in reverse
        file order. The row that stops the scan is not added.

        Args:
            input_file: File to read, forwarded to `ReverseReader`.
            batch_value: Name of the column holding the batch value.
            batch_cursor: Name of the column holding the cursor.
            end_symbol: Cursor value marking a finished batch.
            **kwargs: Forwarded to `ReverseReader`.

        Returns:
            The reconstructed `Batch`.

        Raises:
            MissingColumnError: If either column is not in the headers.
            EmptyFileError: If no row could be read.
        """
        with ReverseReader(input_file, **kwargs) as reader:
            batch = END_OF_FILE

            if batch_value not in reader.headers:
                raise MissingColumnError(batch_value)

            if batch_cursor not in reader.headers:
                raise MissingColumnError(batch_cursor)

            batch_value_pos = reader.headers[batch_value]
            batch_cursor_pos = reader.headers[batch_cursor]

            for row in reader:
                value = row[batch_value_pos]
                cursor = row[batch_cursor_pos]

                # The very first row read (last of the file) names the batch.
                if batch is END_OF_FILE:
                    batch = Batch(value)

                if value != batch.value:
                    return batch

                if cursor == end_symbol:
                    batch.finished = True
                    return batch

                if cursor:
                    batch.cursor = cursor
                    return batch

                batch.rows.append(row)

            if batch is END_OF_FILE:
                raise EmptyFileError

            return batch