Coverage for casanova/casanova/resuming.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

155 statements  

1# ============================================================================= 

2# Casanova Resuming Strategies 

3# ============================================================================= 

4# 

5# A collection of process resuming strategies acknowledged by casanova 

6# enrichers. 

7# 

8from threading import Lock 

9from os.path import isfile, getsize 

10from collections import deque 

11 

12from casanova._namedtuple import future_namedtuple 

13from casanova.reader import Reader 

14from casanova.reverse_reader import ReverseReader 

15from casanova.exceptions import ( 

16 ResumeError, 

17 NotResumableError, 

18 MissingColumnError, 

19 CorruptedIndexColumn 

20) 

21from casanova.contiguous_range_set import ContiguousRangeSet 

22 

23 

class Resumer:
    '''
    Base class of the resuming strategies.

    A resumer is bound to the path of an output file. It knows whether that
    file holds something to resume from, opens it in the relevant mode and
    forwards events to an optional listener.

    Subclasses are expected to implement `get_insights_from_output` and,
    depending on the strategy, `get_state`, `already_done_count` and/or a
    `filter(i, row)` method.

    Args:
        path: path of the output file to resume from.
        listener: optional callable invoked as `listener(event, payload)`.
    '''

    def __init__(self, path, listener=None):
        self.path = path
        self.listener = listener

        # Handle returned by `open_output_file`, or None when not opened
        self.output_file = None

        # Serializes the calls made to the listener
        self.lock = Lock()

        # Whether `pop_state` was already called once
        self.popped = False

        # Rows held back by the strategy, drained when iterating the resumer
        self.buffer = deque()

    def can_resume(self):
        '''Return whether the output file exists and is not empty.'''
        return isfile(self.path) and getsize(self.path) > 0

    def open(self, mode='a', encoding='utf-8', newline=''):
        '''Open the file at `self.path` and return the file handle.'''
        return open(
            self.path,
            mode=mode,
            encoding=encoding,
            newline=newline
        )

    def open_output_file(self, **kwargs):
        '''
        Open the output file, keep a reference to it and return it.

        The file is opened in 'a+' mode when there is something to resume
        from, and in 'w' mode otherwise. Keyword arguments are forwarded
        to `open`.

        Raises:
            ResumeError: if the output file is already opened.
        '''
        if self.output_file is not None:
            raise ResumeError('output file is already opened')

        mode = 'a+' if self.can_resume() else 'w'

        self.output_file = self.open(mode=mode, **kwargs)
        return self.output_file

    def emit(self, event, payload):
        '''
        Call the listener, if any, with the given event and payload.

        The listener runs while `self.lock` is held. The lock is a plain
        `threading.Lock`, so a listener must not emit through the same
        resumer again.
        '''
        if self.listener is None:
            return

        with self.lock:
            self.listener(event, payload)

    def get_insights_from_output(self, enricher):
        '''Inspect the existing output. Must be implemented by subclasses.'''
        raise NotImplementedError

    def filter_row(self, i, row):
        '''
        Return the result of `self.filter(i, row)`, emitting a 'filter.row'
        event with `(i, row)` as payload when that result is falsy.

        Only usable on subclasses defining a `filter` method.
        '''
        result = self.filter(i, row)

        if not result:
            self.emit('filter.row', (i, row))

        return result

    def get_state(self):
        '''Return the resuming state. Must be implemented by subclasses.'''
        raise NotImplementedError

    def pop_state(self):
        '''
        Return the result of `get_state` on the first call and None on any
        subsequent call.
        '''
        if not self.popped:
            self.popped = True
            return self.get_state()

        return None

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        '''Close the output file if it was opened, and forget about it.'''
        if self.output_file is not None:
            self.output_file.close()
            self.output_file = None

    def __repr__(self):
        return '<{name} path={path!r} can_resume={can_resume!r}>'.format(
            name=self.__class__.__name__,
            path=self.path,
            can_resume=self.can_resume()
        )

    def already_done_count(self):
        '''Return the number of items already done. Must be implemented by subclasses.'''
        raise NotImplementedError

    def __iter__(self):
        '''
        Drain and yield the buffered rows, oldest first.

        Resumers defining a `filter` method cannot be iterated. Since this is
        a generator, the NotImplementedError is raised when the first item is
        requested, not when `iter()` is called.
        '''
        if hasattr(self, 'filter'):
            raise NotImplementedError

        while self.buffer:
            yield self.buffer.popleft()

108 

109 

class RowCountResumer(Resumer):
    '''
    Resuming strategy counting the rows of the existing output and consuming
    as many rows from the enricher before carrying on.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Number of rows read from the existing output
        self.row_count = 0

    def get_insights_from_output(self, enricher):
        '''
        Count the rows yielded by a `Reader` over the existing output,
        emitting an 'output.row' event for each of them.

        `row_count` is reset first and only updated once the whole file has
        been read.
        '''
        self.row_count = 0

        with self.open(mode='r') as f:
            reader = Reader(f)

            count = 0

            for row in reader:
                self.emit('output.row', row)
                count += 1

        self.row_count = count

    def resume(self, enricher):
        '''
        Consume `row_count` rows from the enricher, emitting an 'input.row'
        event for each of them.

        Raises:
            NotResumableError: if the enricher is exhausted before
                `row_count` rows could be consumed.
        '''
        iterator = iter(enricher)

        for _ in range(self.row_count):
            try:
                row = next(iterator)
            except StopIteration:
                # The input is shorter than the output: surface a meaningful
                # error rather than leaking a bare StopIteration to the caller
                raise NotResumableError from None

            self.emit('input.row', row)

    def already_done_count(self):
        '''Return the number of rows counted in the existing output.'''
        return self.row_count

140 

141 

class ThreadSafeResumer(Resumer):
    '''
    Resuming strategy relying on an index column of the output: the indices
    found in the existing output are collected, then used to filter rows.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Indices found in the index column of the existing output
        self.already_done = ContiguousRangeSet()

    def get_insights_from_output(self, enricher):
        '''
        Read the existing output and record the integer found in the column
        named `enricher.index_column` for every row, emitting an 'output.row'
        event for each row.

        Raises:
            MissingColumnError: if the output headers have no such column.
            CorruptedIndexColumn: if the index cell of a row is missing or
                cannot be parsed as an integer.
        '''
        self.already_done = ContiguousRangeSet()

        with self.open(mode='r') as f:
            reader = Reader(f)

            pos = reader.headers.get(enricher.index_column)

            if pos is None:
                raise MissingColumnError(enricher.index_column)

            for row in reader:
                self.emit('output.row', row)

                try:
                    current_index = int(row[pos])
                except (ValueError, IndexError):
                    # IndexError: the row is too short to even hold the index
                    # column, e.g. a last row truncated by an interruption
                    raise CorruptedIndexColumn

                self.already_done.add(current_index)

    def filter(self, i, row):
        '''Return False when index `i` was already done, True otherwise.'''
        return not self.already_done.stateful_contains(i)

    def already_done_count(self):
        '''Return the number of indices recorded as already done.'''
        return len(self.already_done)

173 

174 

# State returned by `BatchResumer.get_state`: the cursor of the last batch
# found in the output and the set of values collected from its rows.
BatchResumerContext = future_namedtuple(
    'BatchResumerContext',
    ['last_cursor', 'values_to_skip']
)

176 

177 

class BatchResumer(Resumer):
    '''
    Resuming strategy relying on the last batch found in the existing output.

    Args:
        path: path of the output file to resume from.
        value_column: name of the column holding the batch value.
        **kwargs: forwarded to `Resumer.__init__`.
    '''

    def __init__(self, path, value_column, **kwargs):
        super().__init__(path, **kwargs)
        self.last_batch = None
        self.value_column = value_column
        self.value_pos = None
        self.last_cursor = None
        self.values_to_skip = None

    def get_insights_from_output(self, enricher):
        '''
        Fetch the last batch of the existing output through
        `ReverseReader.last_batch`, resolve the position of the value column
        from `enricher.output_pos` and reset the resuming state.
        '''
        self.last_batch = ReverseReader.last_batch(
            self.path,
            batch_value=self.value_column,
            batch_cursor=enricher.cursor_column,
            end_symbol=enricher.end_symbol
        )
        self.value_pos = enricher.output_pos[self.value_column]
        self.last_cursor = None
        self.values_to_skip = None

    def get_state(self):
        '''Return the resuming state as a `BatchResumerContext`.'''
        return BatchResumerContext(
            self.last_cursor,
            self.values_to_skip
        )

    def resume(self, enricher):
        '''
        Consume rows from the enricher up to the one matching the last batch,
        emitting an 'input.row' event for each consumed row.

        Nothing is consumed when no last batch was found. When the last batch
        was not finished, its cursor and the values of its rows are recorded,
        and the matching row is pushed to the buffer.

        Raises:
            NotResumableError: if the enricher is exhausted before a row
                matching the last batch is found.
        '''
        last_batch = self.last_batch

        if last_batch is None:
            return

        iterator = iter(enricher)

        while True:
            row = next(iterator, None)

            if row is None:
                raise NotResumableError

            self.emit('input.row', row)

            value = row[self.value_pos]

            # We haven't reached our batch yet
            if value != last_batch.value:
                continue

            # Last batch was completely finished
            elif last_batch.finished:
                break

            # Here we need to record additional information
            self.last_cursor = last_batch.cursor
            self.values_to_skip = {
                batch_row[self.value_pos] for batch_row in last_batch.rows
            }
            self.buffer.append(row)

            break

236 

237 

class LastCellResumer(Resumer):
    '''
    Resuming strategy whose state is the last cell of a given column of the
    existing output.

    Args:
        path: path of the output file to resume from.
        value_column: column passed to `ReverseReader.last_cell`.
        **kwargs: forwarded to `Resumer.__init__`.
    '''

    def __init__(self, path, value_column, **kwargs):
        super().__init__(path, **kwargs)
        self.last_cell = None
        self.value_column = value_column

    def get_insights_from_output(self, enricher):
        '''Fetch the last cell of the value column from the existing output.'''
        self.last_cell = ReverseReader.last_cell(
            self.path,
            column=self.value_column
        )

    def get_state(self):
        '''Return the last cell found in the existing output.'''
        return self.last_cell

252 

253 

class LastCellComparisonResumer(LastCellResumer):
    '''
    Resuming strategy consuming rows from the enricher until one whose value
    equals the last cell found in the existing output.

    Warning: this resumer will not work as desired if the column read
    contains duplicate values.
    '''

    def resume(self, enricher):
        '''
        Consume rows from the enricher, emitting an 'input.row' event for
        each of them, and stop right after the first row whose value equals
        `self.last_cell`.
        '''
        for row in enricher:
            self.emit('input.row', row)

            # NOTE(review): rows are indexed with `value_column` directly,
            # whereas BatchResumer resolves a position first - confirm that
            # callers pass something usable as a row index here.
            if row[self.value_column] == self.last_cell:
                break