Coverage for casanova/casanova/enricher.py: 93%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

101 statements  

1# ============================================================================= 

2# Casanova Enricher 

3# ============================================================================= 

4# 

5# A CSV reader/writer combo that can be used to read an input CSV file and 

6# easily output a similar CSV file while editing, adding and filtering cells. 

7# 

8import csv 

9from ebbe import with_is_last 

10 

11from casanova.resuming import ( 

12 LastCellComparisonResumer, 

13 Resumer, 

14 RowCountResumer, 

15 ThreadSafeResumer, 

16 BatchResumer 

17) 

18from casanova.exceptions import MissingColumnError 

19from casanova.reader import ( 

20 Reader, 

21 Headers 

22) 

23 

24 

class Enricher(Reader):
    """
    CSV reader/writer combo.

    Rows are read from `input_file` through the inherited `Reader` machinery
    and written to `output_file` through a `csv.writer`, optionally keeping
    only a subset of the input columns (`keep`) and appending new columns
    (`add`).

    Args:
        input_file: Forwarded to `Reader.__init__`.
        output_file: Either a writable file object, or a `Resumer` instance
            of one of the `__supported_resumers__` classes, in which case the
            actual output file is obtained from `open_output_file()`.
        no_headers: Forwarded to `Reader.__init__`; when true, no header row
            is written to the output.
        keep: Optional selection of input columns to retain in the output,
            resolved through `self.headers.collect`.
        add: Optional iterable of names of the columns appended to each row.
        **kwargs: Forwarded to `Reader.__init__`.

    Raises:
        MissingColumnError: If `keep` references an unknown column.
        TypeError: If `output_file` is a resumer this class does not support.
    """

    __supported_resumers__ = (RowCountResumer, LastCellComparisonResumer)

    def __init__(self, input_file, output_file, no_headers=False,
                 keep=None, add=None, **kwargs):

        # Inheritance
        super().__init__(input_file, no_headers=no_headers, **kwargs)

        self.keep_indices = None
        self.output_fieldnames = self.fieldnames
        self.added_count = 0
        self.padding = None

        if keep is not None:
            try:
                self.keep_indices = self.headers.collect(keep)
            except KeyError:
                raise MissingColumnError

            self.output_fieldnames = self.filterrow(self.output_fieldnames)

        if add is not None:
            add = list(add)

            # Build a fresh list: without `keep`, `output_fieldnames` is the
            # very same object as `self.fieldnames`, and an in-place `+=`
            # would leak the added columns into the reader's field names.
            self.output_fieldnames = list(self.output_fieldnames) + add
            self.added_count = len(add)

            # Default cells written when a row is emitted without additions
            self.padding = [''] * self.added_count

        self.output_headers = None

        if self.headers is not None:
            self.output_headers = Headers(self.output_fieldnames if not no_headers else len(self.output_fieldnames))

        # Resuming?
        self.resumer = None
        can_resume = False

        if isinstance(output_file, Resumer):
            if not isinstance(output_file, self.__class__.__supported_resumers__):
                raise TypeError('%s: does not support %s!' % (self.__class__.__name__, output_file.__class__.__name__))

            self.resumer = output_file

            can_resume = self.resumer.can_resume()

            if can_resume:
                self.resumer.get_insights_from_output(self)

                # `resume` is optional on resumers
                if hasattr(self.resumer, 'resume'):
                    self.resumer.resume(self)

            output_file = self.resumer.open_output_file()

        # Instantiating writer
        self.writer = csv.writer(output_file)

        # Need to write headers? (not when appending to a resumed output)
        if not no_headers and not can_resume:
            self.writeheader()

    # NOTE: overriding #.iter and not #.__iter__ else other reader iterators won't work
    def iter(self):
        """
        Yield input rows, delegating row skipping to the resumer when one
        was given.

        Without a resumer this is exactly `Reader.iter`. With a resumer that
        exposes `filter`, each row is passed with its enumeration index to
        `resumer.filter_row` and only yielded when that call returns a truthy
        value.
        """
        if self.resumer is None:
            yield from super().iter()
            return

        # NOTE(review): the guard checks for `filter` while the loop below
        # calls `filter_row` — confirm resumers always define both together.
        if not hasattr(self.resumer, 'filter'):
            yield from self.resumer
            yield from super().iter()
            return

        iterator = enumerate(super().iter())

        for i, row in iterator:
            if self.resumer.filter_row(i, row):
                yield row

    def __repr__(self):
        # The constructor treats `headers` as possibly None, so guard here
        # too instead of failing while building a debug representation.
        headers = self.headers if self.headers is not None else ()
        columns_info = ' '.join('%s=%s' % t for t in headers)

        # `resumable` is never assigned by this class; when the attribute is
        # absent, fall back on whether a resumer was provided.
        resumable = getattr(self, 'resumable', getattr(self, 'resumer', None) is not None)

        return '<%s%s %s>' % (
            self.__class__.__name__,
            ' resumable' if resumable else '',
            columns_info
        )

    def filterrow(self, row):
        """
        Return `row` restricted to the kept columns, or `row` itself
        (same object) when no `keep` selection was given.
        """
        if self.keep_indices is not None:
            row = [row[i] for i in self.keep_indices]

        return row

    def formatrow(self, row, add=None):
        """
        Build the output row: kept cells of `row` followed by `add`.

        When columns were added at construction and `add` is None, the
        padding of empty strings is used instead. The length of `add` is
        checked by assertion against the number of added columns.
        """

        # Additions
        if self.added_count > 0:
            if add is None:
                add = self.padding
            else:
                assert len(add) == self.added_count, 'casanova.enricher.writerow: expected %i additional cells but got %i.' % (self.added_count, len(add))

            row = self.filterrow(row) + add

        # No additions
        else:
            assert add is None, 'casanova.enricher.writerow: expected no additions.'

            row = self.filterrow(row)

        return row

    def writeheader(self):
        """Write the output field names as a row."""
        self.writer.writerow(self.output_fieldnames)

    def writerow(self, row, add=None):
        """Format `row` (see `formatrow`) and write it to the output."""
        self.writer.writerow(self.formatrow(row, add))

140 

141 

class ThreadSafeEnricher(Enricher):
    """
    Enricher whose iteration yields every item together with its enumeration
    index, and whose output rows carry that index in an extra column placed
    before the user-supplied additions.

    Args:
        input_file: Forwarded to `Enricher.__init__`.
        output_file: Forwarded to `Enricher.__init__`.
        add: Optional iterable of names of the columns to append after the
            index column.
        index_column: Name of the added index column.
        **kwargs: Forwarded to `Enricher.__init__`.
    """

    __supported_resumers__ = (ThreadSafeResumer,)

    def __init__(self, input_file, output_file, add=None,
                 index_column='index', **kwargs):

        self.index_column = index_column

        # `add` defaults to None, which `list(add)` cannot handle: in that
        # case the index column is the only addition.
        additions = [index_column]

        if add is not None:
            additions.extend(add)

        # Inheritance
        super().__init__(
            input_file,
            output_file,
            add=additions,
            **kwargs
        )

    def __iter__(self):
        """Yield `(index, row)` pairs."""
        yield from enumerate(super().__iter__())

    def cells(self, column, with_rows=False):
        """
        Yield `(index, value)` pairs for `column`, or `(index, row, value)`
        triples when `with_rows` is true.
        """
        if with_rows:
            for index, (row, value) in enumerate(super().cells(column, with_rows=True)):
                yield index, row, value
        else:
            yield from enumerate(super().cells(column))

    def writerow(self, index, row, add=None):
        """
        Write `row` followed by `index` and then the `add` cells.

        When `add` is None, the user-supplied columns are filled with empty
        strings.
        """
        if add is None:
            # `self.padding` has one cell per added column, index column
            # included. The index is supplied explicitly, so drop one cell;
            # otherwise the row is one cell too long for `formatrow`.
            add = self.padding[1:]

        self.writer.writerow(self.formatrow(row, [index] + list(add)))

175 

176 

class BatchEnricher(Enricher):
    """
    Enricher writing several output rows per input row, with an extra
    cursor column placed before the user-supplied additions.

    Args:
        input_file: Forwarded to `Enricher.__init__`.
        output_file: Forwarded to `Enricher.__init__`.
        add: Optional iterable of names of the columns to append after the
            cursor column.
        cursor_column: Name of the added cursor column.
        end_symbol: Value written in the cursor column of the last row of a
            batch when no cursor is given to `writebatch`.
        **kwargs: Forwarded to `Enricher.__init__`.
    """

    __supported_resumers__ = (BatchResumer,)

    def __init__(self, input_file, output_file, add=None, cursor_column='cursor',
                 end_symbol='end', **kwargs):

        self.cursor_column = cursor_column
        self.end_symbol = end_symbol

        # `add` defaults to None, which `list(add)` cannot handle: in that
        # case the cursor column is the only addition.
        additions = [cursor_column]

        if add is not None:
            additions.extend(add)

        # Inheritance
        super().__init__(
            input_file,
            output_file,
            add=additions,
            **kwargs
        )

    def writebatch(self, row, batch, cursor=None):
        """
        Write one output row per item of `batch`, each made of `row`, a
        cursor cell and the item's cells.

        Only the last row of the batch carries `cursor` (or `end_symbol`
        when `cursor` is None); the cursor cell of the other rows is None.
        An empty `batch` writes nothing.
        """
        if cursor is None:
            cursor = self.end_symbol

        for is_last, addendum in with_is_last(batch):
            # `list(...)` lets addenda be tuples or other iterables, not
            # only lists.
            self.writerow(row, [cursor if is_last else None] + list(addendum))

199 self.writerow(row, [cursor if is_last else None] + addendum)