Coverage for casanova/casanova/reader.py: 91%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

191 statements  

1# ============================================================================= 

2# Casanova Reader 

3# ============================================================================= 

4# 

5# A fast but comfortable CSV reader based upon csv.reader to avoid dealing 

6# with csv.DictReader which is nice but very slow. 

7# 

8import csv 

9from collections import deque 

10from collections.abc import Iterable 

11from io import IOBase 

12from operator import itemgetter 

13 

14from casanova.defaults import DEFAULTS 

15from casanova.utils import ensure_open, suppress_BOM, size_of_row_in_file 

16from casanova.exceptions import EmptyFileError, MissingColumnError, NoHeadersError 

17 

18 

def validate_multiplex_tuple(multiplex):
    """Tell whether `multiplex` is a well-formed multiplexing specification.

    A valid specification is a tuple holding exactly two or three items, all
    of which are strings.

    Args:
        multiplex: The candidate value to check.

    Returns:
        bool: True when the value is a valid specification, False otherwise.
    """
    if not isinstance(multiplex, tuple):
        return False

    if len(multiplex) not in (2, 3):
        return False

    for item in multiplex:
        if not isinstance(item, str):
            return False

    return True

25 

26 

class DictLikeRow(object):
    """Read-only view exposing the cells of a row by column name.

    A cell can be fetched either with item access (`row['name']`) or with
    attribute access (`row.name`). The name is first translated to a position
    through `mapping`, then that position is read from `row`.
    """

    __slots__ = ('__mapping', '__row')

    def __init__(self, mapping, row):
        """Store the name -> position `mapping` and the underlying `row`."""
        self.__mapping = mapping
        self.__row = row

    def __getitem__(self, key):
        """Return the cell of the column named `key`.

        Raises:
            KeyError: if `key` is not a known column name.
        """
        return self.__row[self.__mapping[key]]

    def __getattr__(self, key):
        """Return the cell of the column named `key` (attribute flavour).

        Python only calls this hook after regular attribute lookup failed.

        Raises:
            KeyError: if `key` is not a known column name (kept for backward
                compatibility with existing callers).
            AttributeError: if `key` is an unknown dunder name or one of the
                instance's own, not yet populated, slots.
        """
        # Reaching this hook for our own slots means the instance is not
        # initialised yet (e.g. it is being rebuilt by `copy` or `pickle`).
        # Delegating to `__getitem__` would read those very slots again and
        # recurse forever.
        if key in ('_DictLikeRow__mapping', '_DictLikeRow__row'):
            raise AttributeError(key)

        try:
            return self.__getitem__(key)
        except KeyError:
            # Protocol probes such as `__deepcopy__` or `__setstate__` must
            # fail with AttributeError, else `hasattr`/`getattr(..., None)`
            # used by the standard library blow up instead of falling back.
            if key.startswith('__') and key.endswith('__'):
                raise AttributeError(key) from None

            raise

39 

40 

class Headers(object):
    """Mapping between column names and their position in a row.

    The mapping is built by enumerating `fieldnames`. Since names are used as
    dict keys, a name appearing several times is only kept once and points to
    its last position.

    Positions can be fetched with item access (`headers['name']`) or with
    attribute access (`headers.name`).
    """

    def __init__(self, fieldnames):
        """Index the given iterable of column names by position."""
        self.__mapping = {h: i for i, h in enumerate(fieldnames)}

    def rename(self, old_name, new_name):
        """Make `new_name` point to the position of `old_name`, then drop it.

        Raises:
            TypeError: if both names are equal.
            KeyError: if `old_name` is not a known column.
        """
        if old_name == new_name:
            raise TypeError('cannot rename a column to its current name')

        self.__mapping[new_name] = self[old_name]
        del self.__mapping[old_name]

    def __len__(self):
        """Return the number of distinct column names."""
        return len(self.__mapping)

    def __getitem__(self, key):
        """Return the position of the column named `key`.

        Raises:
            KeyError: if `key` is not a known column.
        """
        return self.__mapping[key]

    def __getattr__(self, key):
        """Return the position of the column named `key` (attribute flavour).

        Python only calls this hook after regular attribute lookup failed.

        Raises:
            KeyError: if `key` is not a known column (kept for backward
                compatibility with existing callers).
            AttributeError: if `key` is an unknown dunder name or the
                instance's own, not yet populated, mapping.
        """
        # Reaching this hook for the mapping itself means the instance was
        # created without `__init__` (e.g. by `copy` or `pickle`). Delegating
        # to `__getitem__` would look the mapping up again and recurse forever.
        if key == '_Headers__mapping':
            raise AttributeError(key)

        try:
            return self.__getitem__(key)
        except KeyError:
            # Protocol probes such as `__deepcopy__` or `__setstate__` must
            # fail with AttributeError, else `hasattr`/`getattr(..., None)`
            # used by the standard library blow up instead of falling back.
            if key.startswith('__') and key.endswith('__'):
                raise AttributeError(key) from None

            raise

    def __contains__(self, key):
        """Tell whether `key` is a known column name."""
        return key in self.__mapping

    def __iter__(self):
        """Yield `(name, position)` pairs ordered by position."""
        yield from sorted(self.__mapping.items(), key=itemgetter(1))

    def as_dict(self):
        """Return a shallow copy of the name -> position mapping."""
        return self.__mapping.copy()

    def get(self, key, default=None):
        """Return the position of `key`, or `default` if it is unknown."""
        return self.__mapping.get(key, default)

    def collect(self, keys):
        """Return the list of positions of every name in `keys`.

        Raises:
            KeyError: if one of the names is not a known column.
        """
        return [self[k] for k in keys]

    def wrap(self, row):
        """Return a `DictLikeRow` giving access to `row` by column name."""
        return DictLikeRow(self.__mapping, row)

    def __repr__(self):
        """Return `<ClassName name=pos ...>`, quoting non-identifier names."""
        class_name = self.__class__.__name__

        parts = [class_name]

        for h, i in self:
            # Names that are not valid identifiers (spaces, dashes, ...) are
            # quoted so the representation stays unambiguous.
            template = '%s=%s' if h.isidentifier() else '"%s"=%s'
            parts.append(template % (h, i))

        return '<' + ' '.join(parts) + '>'

93 

94 

class Reader(object):
    """Row-oriented CSV reader built on top of `csv.reader`.

    Rows are yielded as plain sequences. When the input has headers, a
    `Headers` instance mapping column names to positions is available as
    `self.headers`.

    Attributes set by the constructor:
        input_type: 'file', 'path' or 'iterable'.
        input_file: the file object (or row iterator) being read.
        reader: the iterator rows are pulled from.
        buffered_rows: deque of rows already read but not yet yielded.
        was_completely_buffered: True when prebuffering exhausted the input.
        total: number of rows, when known.
        headers: a `Headers` instance, or None when `no_headers` is set.
        expected_row_length: length of the first row when `no_headers` is
            set, None otherwise.
    """

    namespace = 'casanova.reader'

    def __init__(self, input_file, no_headers=False, encoding='utf-8',
                 dialect=None, quotechar=None, delimiter=None, prebuffer_bytes=None,
                 total=None, multiplex=None):
        """Create the reader and consume the header row (if any).

        Args:
            input_file: an `IOBase` file object, a path given as a string
                (opened through `ensure_open`) or an iterable of rows.
            no_headers: when true, the first row is treated as data.
            encoding: encoding forwarded to `ensure_open` for paths.
            dialect: forwarded to `csv.reader` when not None.
            quotechar: forwarded to `csv.reader` when not None.
            delimiter: forwarded to `csv.reader` when not None.
            prebuffer_bytes: amount of data to read eagerly; falls back to
                `DEFAULTS['prebuffer_bytes']` when None.
            total: known number of rows; disables prebuffering when given.
            multiplex: optional `(column, split_char)` or
                `(column, split_char, new_name)` tuple. Rows whose `column`
                cell contains `split_char` are emitted once per split value.

        Raises:
            TypeError: on unsupported `input_file`, invalid `multiplex` or
                invalid `prebuffer_bytes`.
            EmptyFileError: if the input yields no row at all.
            NoHeadersError: if `multiplex` is combined with `no_headers`.
            MissingColumnError: if the multiplexed column does not exist.
        """

        # Resolving global defaults
        if prebuffer_bytes is None:
            prebuffer_bytes = DEFAULTS['prebuffer_bytes']

        # Detecting input type
        if isinstance(input_file, IOBase):
            input_type = 'file'

        elif isinstance(input_file, str):
            input_type = 'path'
            input_file = ensure_open(input_file, encoding=encoding)

        elif isinstance(input_file, Iterable):
            input_type = 'iterable'
            input_file = iter(input_file)

        else:
            raise TypeError('expecting a file, a path or an iterable of rows')

        if multiplex is not None and not validate_multiplex_tuple(multiplex):
            raise TypeError('`multiplex` should be a 2-tuple or 3-tuple containing the column to split, the split character and optionally a new name for the column')

        # Only forwarding the csv options that were explicitly given
        reader_kwargs = {}

        if dialect is not None:
            reader_kwargs['dialect'] = dialect
        if quotechar is not None:
            reader_kwargs['quotechar'] = quotechar
        if delimiter is not None:
            reader_kwargs['delimiter'] = delimiter

        self.input_type = input_type
        self.input_file = input_file

        # An iterable already yields parsed rows, no csv parsing needed
        if self.input_type == 'iterable':
            self.reader = self.input_file
        else:
            self.reader = csv.reader(input_file, **reader_kwargs)

        self.buffered_rows = deque()
        self.was_completely_buffered = False
        self.total = total
        self.headers = None
        self.expected_row_length = None

        # Reading headers
        if no_headers:
            # The first row is data: keep it around so it is yielded later
            try:
                self.buffered_rows.append(next(self.reader))
            except StopIteration:
                raise EmptyFileError

            self.expected_row_length = len(self.buffered_rows[0])
        else:
            try:
                # Copied into a list so that the in-place BOM removal below
                # neither fails on immutable rows (e.g. tuples coming from an
                # iterable) nor mutates a row owned by the caller.
                fieldnames = list(next(self.reader))
            except StopIteration:
                raise EmptyFileError

            if fieldnames:
                # presumably strips a leading byte order mark -- see
                # casanova.utils.suppress_BOM
                fieldnames[0] = suppress_BOM(fieldnames[0])

            self.headers = Headers(fieldnames)

        # Multiplexing
        if multiplex is not None:
            # Multiplexing targets a column by name, hence requires headers
            if self.headers is None:
                raise NoHeadersError

            multiplex_column = multiplex[0]
            split_char = multiplex[1]

            if multiplex_column not in self.headers:
                raise MissingColumnError(multiplex_column)

            multiplex_pos = self.headers[multiplex_column]

            # New col
            if len(multiplex) == 3:
                self.headers.rename(multiplex_column, multiplex[2])

            original_reader = self.reader

            def reader_wrapper():
                for row in original_reader:
                    cell = row[multiplex_pos]

                    if not cell or split_char not in cell:
                        yield row

                    else:
                        # One copy of the row per split value
                        for value in cell.split(split_char):
                            copy = list(row)
                            copy[multiplex_pos] = value
                            yield copy

            self.reader = reader_wrapper()

        # Prebuffering
        if prebuffer_bytes is not None and self.total is None:
            if not isinstance(prebuffer_bytes, int) or prebuffer_bytes < 1:
                raise TypeError('expecting a positive integer as "prebuffer_bytes" kwarg')

            buffered_bytes = 0

            while buffered_bytes < prebuffer_bytes:
                row = next(self.reader, None)

                # Input exhausted before the budget: the total is now known
                if row is None:
                    self.was_completely_buffered = True
                    self.total = len(self.buffered_rows)
                    break

                buffered_bytes += size_of_row_in_file(row)
                self.buffered_rows.append(row)

    def __repr__(self):
        """Return `<namespace name=pos ...>`, or `<namespace>` without headers."""
        # `headers` is None in `no_headers` mode and cannot be iterated
        if self.headers is None:
            return '<%s>' % self.namespace

        columns_info = ' '.join('%s=%s' % t for t in self.headers)

        return '<%s %s>' % (self.namespace, columns_info)

    @property
    def fieldnames(self):
        """List of column names ordered by position, or None without headers."""
        if self.headers is None:
            return None

        return [k for k, v in self.headers]

    @property
    def row_len(self):
        """Expected number of cells per row.

        This is the length of the first row in `no_headers` mode, the number
        of headers otherwise.
        """
        if self.expected_row_length is not None:
            return self.expected_row_length

        return len(self.headers)

    def iter(self):
        """Yield every remaining row, buffered ones first."""
        while self.buffered_rows:
            yield self.buffered_rows.popleft()

        yield from self.reader

    def wrap(self, row):
        """Return a view of `row` whose cells are accessible by column name."""
        return self.headers.wrap(row)

    def __iter__(self):
        return self.iter()

    def __cells(self, column, with_rows=False):
        """Resolve `column` to a position and return the matching iterator.

        Raises:
            NoHeadersError: if a name is given but the reader has no headers.
            MissingColumnError: if the name is unknown or the index is not
                lower than `row_len`.
        """
        if not isinstance(column, int):
            if self.headers is None:
                raise NoHeadersError

            pos = self.headers.get(column)

            if pos is None:
                raise MissingColumnError(column)
        else:
            if column >= self.row_len:
                # Reporting the offending column, as done for names above
                raise MissingColumnError(column)

            pos = column

        if with_rows:
            def iterator():
                for row in self.iter():
                    yield row, row[pos]
        else:
            def iterator():
                for row in self.iter():
                    yield row[pos]

        return iterator()

    def cells(self, column, with_rows=False):
        """Iterate over the cells of a single column.

        Args:
            column: column name (str) or position (int).
            with_rows: when true, yield `(row, cell)` pairs instead of cells.

        Raises:
            TypeError: if `column` is neither a str nor an int.
            NoHeadersError: if a name is given but the reader has no headers.
            MissingColumnError: if the column cannot be found.
        """
        if not isinstance(column, (str, int)):
            raise TypeError

        return self.__cells(column, with_rows=with_rows)

    def close(self):
        """Close the underlying file, if any.

        Files opened by the reader itself from a path are closed too, so the
        handle does not leak. File objects handed over by the caller keep
        being closed as before. Iterables are left untouched.
        """
        if self.input_type in ('file', 'path'):
            self.input_file.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @classmethod
    def count(cls, input_file, max_rows=None, **kwargs):
        """Count the rows of `input_file`.

        Args:
            input_file: anything accepted by the constructor.
            max_rows: optional positive limit.
            **kwargs: forwarded to the constructor.

        Returns:
            The number of rows, or None as soon as more than `max_rows` rows
            were seen.
        """
        assert max_rows is None or max_rows > 0, '%s.count: expected max_rows to be `None` or > 0.' % cls.namespace

        n = 0

        with cls(input_file, **kwargs) as reader:
            for _ in reader:
                n += 1

                if max_rows is not None and n > max_rows:
                    return None

        return n