Coverage for flair/flair/datasets/document_classification.py: 10% (745 statements)

1import csv 

2import json 

3import os 

4 

5from pathlib import Path 

6from typing import List, Dict, Union, Callable 

7 

8import flair 

9from flair.data import ( 

10 Sentence, 

11 Corpus, 

12 Token, 

13 FlairDataset, 

14 Tokenizer, DataPair 

15) 

16from flair.tokenization import SegtokTokenizer, SpaceTokenizer 

17from flair.datasets.base import find_train_dev_test_files 

18from flair.file_utils import cached_path, unzip_file, unpack_file 

19 

20import logging 

21log = logging.getLogger("flair") 

22 

23 

24class ClassificationCorpus(Corpus): 

25 """ 

26 A classification corpus from FastText-formatted text files. 

27 """ 

28 

29 def __init__( 

30 self, 

31 data_folder: Union[str, Path], 

32 label_type: str = 'class', 

33 train_file=None, 

34 test_file=None, 

35 dev_file=None, 

36 truncate_to_max_tokens: int = -1, 

37 truncate_to_max_chars: int = -1, 

38 filter_if_longer_than: int = -1, 

39 tokenizer: Tokenizer = SegtokTokenizer(), 

40 memory_mode: str = "partial", 

41 label_name_map: Dict[str, str] = None, 

42 skip_labels: List[str] = None, 

43 allow_examples_without_labels=False, 

44 sample_missing_splits: bool = True, 

45 encoding: str = 'utf-8', 

46 ): 

47 """ 

48 Instantiates a Corpus from text classification-formatted task data 

49 

50 :param data_folder: base folder with the task data 

51 :param label_type: name of the label 

52 :param train_file: the name of the train file 

53 :param test_file: the name of the test file 

54 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

55 :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens 

56 :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars 

57 :param filter_if_longer_than: If set, filters documents that are longer than the specified number of tokens.

58 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer 

59 :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'

60 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if

61 even this is too much for your memory, use 'disk'.

62 :param label_name_map: Optionally map label names to different schema. 

63 :param allow_examples_without_labels: set to True to allow Sentences without label in the corpus. 

64 :param encoding: Default is 'utf-8' but some datasets are in 'latin-1'

65 :return: a Corpus with annotated train, dev and test data 

66 """ 

67 

68 # find train, dev and test files if not specified 

69 dev_file, test_file, train_file = \ 

70 find_train_dev_test_files(data_folder, dev_file, test_file, train_file) 

71 

72 train: FlairDataset = ClassificationDataset( 

73 train_file, 

74 label_type=label_type, 

75 tokenizer=tokenizer, 

76 truncate_to_max_tokens=truncate_to_max_tokens, 

77 truncate_to_max_chars=truncate_to_max_chars, 

78 filter_if_longer_than=filter_if_longer_than, 

79 memory_mode=memory_mode, 

80 label_name_map=label_name_map, 

81 skip_labels=skip_labels, 

82 allow_examples_without_labels=allow_examples_without_labels, 

83 encoding=encoding, 

84 ) 

85 

86 # use test_file to create test split if available 

87 test: FlairDataset = ClassificationDataset( 

88 test_file, 

89 label_type=label_type, 

90 tokenizer=tokenizer, 

91 truncate_to_max_tokens=truncate_to_max_tokens, 

92 truncate_to_max_chars=truncate_to_max_chars, 

93 filter_if_longer_than=filter_if_longer_than, 

94 memory_mode=memory_mode, 

95 label_name_map=label_name_map, 

96 skip_labels=skip_labels, 

97 allow_examples_without_labels=allow_examples_without_labels, 

98 encoding=encoding, 

99 ) if test_file is not None else None 

100 

101 # use dev_file to create dev split if available

102 dev: FlairDataset = ClassificationDataset( 

103 dev_file, 

104 label_type=label_type, 

105 tokenizer=tokenizer, 

106 truncate_to_max_tokens=truncate_to_max_tokens, 

107 truncate_to_max_chars=truncate_to_max_chars, 

108 filter_if_longer_than=filter_if_longer_than, 

109 memory_mode=memory_mode, 

110 label_name_map=label_name_map, 

111 skip_labels=skip_labels, 

112 allow_examples_without_labels=allow_examples_without_labels, 

113 encoding=encoding, 

114 ) if dev_file is not None else None 

115 

116 super(ClassificationCorpus, self).__init__( 

117 train, dev, test, name=str(data_folder), sample_missing_splits=sample_missing_splits 

118 ) 

119 

120 log.info(f"Initialized corpus {self.name} (label type name is '{label_type}')") 

121 
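
# Illustrative usage sketch (added for clarity, not part of the original module): loading a
# FastText-formatted corpus with ClassificationCorpus. The folder name and label_type below are
# assumptions; the loader only expects one document per line in the form "__label__<class> <text>".
#
#   from flair.datasets import ClassificationCorpus
#   # hypothetical folder containing train.txt, dev.txt and test.txt in FastText format
#   corpus = ClassificationCorpus(
#       "resources/tasks/my_task",
#       label_type="topic",
#       memory_mode="partial",   # keep raw lines in memory, parse Sentences on access
#   )
#   print(len(corpus.train), len(corpus.dev), len(corpus.test))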

122 

123class ClassificationDataset(FlairDataset): 

124 """ 

125 Dataset for classification instantiated from a single FastText-formatted file. 

126 """ 

127 

128 def __init__( 

129 self, 

130 path_to_file: Union[str, Path], 

131 label_type: str, 

132 truncate_to_max_tokens=-1, 

133 truncate_to_max_chars=-1, 

134 filter_if_longer_than: int = -1, 

135 tokenizer: Tokenizer = SegtokTokenizer(), 

136 memory_mode: str = "partial", 

137 label_name_map: Dict[str, str] = None, 

138 skip_labels: List[str] = None, 

139 allow_examples_without_labels=False, 

140 encoding: str = 'utf-8', 

141 ): 

142 """ 

143 Reads a data file for text classification. The file should contain one document/text per line. 

144 The line should have the following format: 

145 __label__<class_name> <text> 

146 If you have a multi-label task, you can have as many labels as you want at the beginning of the line, e.g.,

147 __label__<class_name_1> __label__<class_name_2> <text> 

148 :param path_to_file: the path to the data file 

149 :param label_type: name of the label 

150 :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens 

151 :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars 

152 :param filter_if_longer_than: If set, filters documents that are longer than the specified number of tokens.

153 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

154 :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'

155 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if

156 even this is too much for your memory, use 'disk'.

157 :param label_name_map: Optionally map label names to different schema. 

158 :param allow_examples_without_labels: set to True to allow Sentences without label in the Dataset. 

159 :param encoding: Default is 'utf-8' but some datasets are in 'latin-1'

160 :return: list of sentences 

161 """ 

162 if type(path_to_file) == str: 

163 path_to_file: Path = Path(path_to_file) 

164 

165 assert path_to_file.exists() 

166 

167 self.label_prefix = "__label__" 

168 self.label_type = label_type 

169 

170 self.memory_mode = memory_mode 

171 self.tokenizer = tokenizer 

172 

173 if self.memory_mode == 'full': 

174 self.sentences = [] 

175 if self.memory_mode == 'partial': 

176 self.lines = [] 

177 if self.memory_mode == 'disk': 

178 self.indices = [] 

179 

180 self.total_sentence_count: int = 0 

181 self.truncate_to_max_chars = truncate_to_max_chars 

182 self.truncate_to_max_tokens = truncate_to_max_tokens 

183 self.filter_if_longer_than = filter_if_longer_than 

184 self.label_name_map = label_name_map 

185 self.allow_examples_without_labels = allow_examples_without_labels 

186 

187 self.path_to_file = path_to_file 

188 

189 with open(str(path_to_file), encoding=encoding) as f: 

190 line = f.readline() 

191 position = 0 

192 while line: 

193 if ("__label__" not in line and not allow_examples_without_labels) or (" " not in line and "\t" not in line): 

194 position = f.tell() 

195 line = f.readline() 

196 continue 

197 

198 if 0 < self.filter_if_longer_than < len(line.split(' ')): 

199 position = f.tell() 

200 line = f.readline() 

201 continue 

202 

203 # if data point contains black-listed label, do not use 

204 if skip_labels: 

205 skip = False 

206 for skip_label in skip_labels: 

207 if "__label__" + skip_label in line: 

208 skip = True 

209 if skip: 

210 line = f.readline() 

211 continue 

212 

213 if self.memory_mode == 'full': 

214 sentence = self._parse_line_to_sentence( 

215 line, self.label_prefix, tokenizer 

216 ) 

217 if sentence is not None and len(sentence.tokens) > 0: 

218 self.sentences.append(sentence) 

219 self.total_sentence_count += 1 

220 

221 if self.memory_mode == 'partial' or self.memory_mode == 'disk': 

222 

223 # first check if valid sentence 

224 words = line.split() 

225 l_len = 0 

226 label = False 

227 for i in range(len(words)): 

228 if words[i].startswith(self.label_prefix): 

229 l_len += len(words[i]) + 1 

230 label = True 

231 else: 

232 break 

233 text = line[l_len:].strip() 

234 

235 # if so, add to indices 

236 if text and (label or allow_examples_without_labels): 

237 

238 if self.memory_mode == 'partial': 

239 self.lines.append(line) 

240 self.total_sentence_count += 1 

241 

242 if self.memory_mode == 'disk': 

243 self.indices.append(position) 

244 self.total_sentence_count += 1 

245 

246 position = f.tell() 

247 line = f.readline() 

248 

249 def _parse_line_to_sentence( 

250 self, line: str, label_prefix: str, tokenizer: Union[Callable[[str], List[Token]], Tokenizer] 

251 ): 

252 words = line.split() 

253 

254 labels = [] 

255 l_len = 0 

256 

257 for i in range(len(words)): 

258 if words[i].startswith(label_prefix): 

259 l_len += len(words[i]) + 1 

260 label = words[i].replace(label_prefix, "") 

261 

262 if self.label_name_map and label in self.label_name_map.keys(): 

263 label = self.label_name_map[label] 

264 

265 labels.append(label) 

266 else: 

267 break 

268 

269 text = line[l_len:].strip() 

270 

271 if self.truncate_to_max_chars > 0: 

272 text = text[: self.truncate_to_max_chars] 

273 

274 if text and (labels or self.allow_examples_without_labels): 

275 sentence = Sentence(text, use_tokenizer=tokenizer) 

276 

277 for label in labels: 

278 sentence.add_label(self.label_type, label) 

279 

280 if ( 

281 sentence is not None 

282 and 0 < self.truncate_to_max_tokens < len(sentence) 

283 ): 

284 sentence.tokens = sentence.tokens[: self.truncate_to_max_tokens] 

285 

286 return sentence 

287 return None 

288 

289 def is_in_memory(self) -> bool: 

290 if self.memory_mode == 'disk': return False 

291 if self.memory_mode == 'partial': return False 

292 return True 

293 

294 def __len__(self): 

295 return self.total_sentence_count 

296 

297 def __getitem__(self, index: int = 0) -> Sentence: 

298 

299 if self.memory_mode == 'full': 

300 return self.sentences[index] 

301 

302 if self.memory_mode == 'partial': 

303 sentence = self._parse_line_to_sentence( 

304 self.lines[index], self.label_prefix, self.tokenizer 

305 ) 

306 return sentence 

307 

308 if self.memory_mode == 'disk': 

309 with open(str(self.path_to_file), encoding="utf-8") as file: 

310 file.seek(self.indices[index]) 

311 line = file.readline() 

312 sentence = self._parse_line_to_sentence( 

313 line, self.label_prefix, self.tokenizer 

314 ) 

315 return sentence 

316 
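
# Illustrative sketch (assumed file path): using ClassificationDataset directly with
# memory_mode='disk', where only byte offsets are stored and each access seeks into the
# file and re-parses the line.
#
#   from flair.datasets import ClassificationDataset
#   dataset = ClassificationDataset(
#       "resources/tasks/my_task/train.txt",   # hypothetical FastText-formatted file
#       label_type="topic",
#       memory_mode="disk",
#   )
#   first = dataset[0]                          # seek + parse on demand
#   print(first.to_plain_string(), first.get_labels("topic"))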

317 

318class CSVClassificationCorpus(Corpus): 

319 """ 

320 Classification corpus instantiated from CSV data files. 

321 """ 

322 

323 def __init__( 

324 self, 

325 data_folder: Union[str, Path], 

326 column_name_map: Dict[int, str], 

327 label_type: str, 

328 train_file=None, 

329 test_file=None, 

330 dev_file=None, 

331 max_tokens_per_doc=-1, 

332 max_chars_per_doc=-1, 

333 tokenizer: Tokenizer = SegtokTokenizer(), 

334 in_memory: bool = False, 

335 skip_header: bool = False, 

336 encoding: str = 'utf-8', 

337 no_class_label=None, 

338 **fmtparams, 

339 ): 

340 """ 

341 Instantiates a Corpus for text classification from CSV column formatted data 

342 

343 :param data_folder: base folder with the task data 

344 :param column_name_map: a column name map that indicates which column holds the text and which hold the label(s)

345 :param label_type: name of the label 

346 :param train_file: the name of the train file 

347 :param test_file: the name of the test file 

348 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

349 :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens 

350 :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars 

351 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer 

352 :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings 

353 :param skip_header: If True, skips first line because it is header 

354 :param encoding: Default is 'uft-8' but some datasets are in 'latin-1 

355 :param fmtparams: additional parameters for the CSV file reader 

356 :return: a Corpus with annotated train, dev and test data 

357 """ 

358 

359 # find train, dev and test files if not specified 

360 dev_file, test_file, train_file = \ 

361 find_train_dev_test_files(data_folder, dev_file, test_file, train_file) 

362 

363 train: FlairDataset = CSVClassificationDataset( 

364 train_file, 

365 column_name_map, 

366 label_type=label_type, 

367 tokenizer=tokenizer, 

368 max_tokens_per_doc=max_tokens_per_doc, 

369 max_chars_per_doc=max_chars_per_doc, 

370 in_memory=in_memory, 

371 skip_header=skip_header, 

372 encoding=encoding, 

373 no_class_label=no_class_label, 

374 **fmtparams, 

375 ) 

376 

377 test: FlairDataset = CSVClassificationDataset( 

378 test_file, 

379 column_name_map, 

380 label_type=label_type, 

381 tokenizer=tokenizer, 

382 max_tokens_per_doc=max_tokens_per_doc, 

383 max_chars_per_doc=max_chars_per_doc, 

384 in_memory=in_memory, 

385 skip_header=skip_header, 

386 encoding=encoding, 

387 no_class_label=no_class_label, 

388 **fmtparams, 

389 ) if test_file is not None else None 

390 

391 dev: FlairDataset = CSVClassificationDataset( 

392 dev_file, 

393 column_name_map, 

394 label_type=label_type, 

395 tokenizer=tokenizer, 

396 max_tokens_per_doc=max_tokens_per_doc, 

397 max_chars_per_doc=max_chars_per_doc, 

398 in_memory=in_memory, 

399 skip_header=skip_header, 

400 encoding=encoding, 

401 no_class_label=no_class_label, 

402 **fmtparams, 

403 ) if dev_file is not None else None 

404 

405 super(CSVClassificationCorpus, self).__init__( 

406 train, dev, test, name=str(data_folder) 

407 ) 

408 
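
# Illustrative sketch: a CSV corpus where column 0 holds the label and column 1 the text.
# The folder name, delimiter and label_type are assumptions; any extra keyword arguments
# (fmtparams) are passed straight through to csv.reader.
#
#   from flair.datasets import CSVClassificationCorpus
#   corpus = CSVClassificationCorpus(
#       "resources/tasks/my_csv_task",            # hypothetical folder with train.csv / dev.csv / test.csv
#       column_name_map={0: "label", 1: "text"},
#       label_type="topic",
#       skip_header=True,
#       delimiter=",",                            # forwarded to csv.reader
#   )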

409 

410class CSVClassificationDataset(FlairDataset): 

411 """ 

412 Dataset for text classification from CSV column formatted data. 

413 """ 

414 

415 def __init__( 

416 self, 

417 path_to_file: Union[str, Path], 

418 column_name_map: Dict[int, str], 

419 label_type: str, 

420 max_tokens_per_doc: int = -1, 

421 max_chars_per_doc: int = -1, 

422 tokenizer: Tokenizer = SegtokTokenizer(), 

423 in_memory: bool = True, 

424 skip_header: bool = False, 

425 encoding: str = 'utf-8', 

426 no_class_label=None, 

427 **fmtparams, 

428 ): 

429 """ 

430 Instantiates a Dataset for text classification from CSV column formatted data 

431 

432 :param path_to_file: path to the file with the CSV data 

433 :param column_name_map: a column name map that indicates which column holds the text and which hold the label(s)

434 :param label_type: name of the label 

435 :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens 

436 :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars 

437 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer

438 :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings 

439 :param skip_header: If True, skips first line because it is header 

440 :param encoding: Most datasets are 'utf-8' but some are 'latin-1' 

441 :param fmtparams: additional parameters for the CSV file reader 

442 :return: a Dataset with annotated data points

443 """ 

444 

445 if type(path_to_file) == str: 

446 path_to_file: Path = Path(path_to_file) 

447 

448 assert path_to_file.exists() 

449 

450 # variables 

451 self.path_to_file = path_to_file 

452 self.in_memory = in_memory 

453 self.tokenizer = tokenizer 

454 self.column_name_map = column_name_map 

455 self.max_tokens_per_doc = max_tokens_per_doc 

456 self.max_chars_per_doc = max_chars_per_doc 

457 self.no_class_label = no_class_label 

458 

459 self.label_type = label_type 

460 

461 # different handling of in_memory data than streaming data 

462 if self.in_memory: 

463 self.sentences = [] 

464 else: 

465 self.raw_data = [] 

466 

467 self.total_sentence_count: int = 0 

468 

469 # determine from column_name_map which columns hold the text (mapped to 'text') and which hold a paired sentence (mapped to 'pair')

470 self.text_columns: List[int] = [] 

471 self.pair_columns: List[int] = [] 

472 for column in column_name_map: 

473 if column_name_map[column] == "text": 

474 self.text_columns.append(column) 

475 if column_name_map[column] == "pair": 

476 self.pair_columns.append(column) 

477 

478 with open(self.path_to_file, encoding=encoding) as csv_file: 

479 

480 csv_reader = csv.reader(csv_file, **fmtparams) 

481 

482 if skip_header: 

483 next(csv_reader, None) # skip the headers 

484 

485 for row in csv_reader: 

486 

487 # test if format is OK 

488 wrong_format = False 

489 for text_column in self.text_columns: 

490 if text_column >= len(row): 

491 wrong_format = True 

492 

493 if wrong_format: 

494 continue 

495 

496 # test if at least one label given 

497 has_label = False 

498 for column in self.column_name_map: 

499 if self.column_name_map[column].startswith("label") and row[column]: 

500 has_label = True 

501 break 

502 

503 if not has_label: 

504 continue 

505 

506 if self.in_memory: 

507 

508 sentence = self._make_labeled_data_point(row) 

509 

510 self.sentences.append(sentence) 

511 

512 else: 

513 self.raw_data.append(row) 

514 

515 self.total_sentence_count += 1 

516 

517 def _make_labeled_data_point(self, row): 

518 

519 # make sentence from text (and filter for length) 

520 text = " ".join( 

521 [row[text_column] for text_column in self.text_columns] 

522 ) 

523 

524 if self.max_chars_per_doc > 0: 

525 text = text[: self.max_chars_per_doc] 

526 

527 sentence = Sentence(text, use_tokenizer=self.tokenizer) 

528 

529 if 0 < self.max_tokens_per_doc < len(sentence): 

530 sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] 

531 

532 # if a pair column is defined, make a sentence pair object 

533 if len(self.pair_columns) > 0: 

534 

535 text = " ".join( 

536 [row[pair_column] for pair_column in self.pair_columns] 

537 ) 

538 

539 if self.max_chars_per_doc > 0: 

540 text = text[: self.max_chars_per_doc] 

541 

542 pair = Sentence(text, use_tokenizer=self.tokenizer) 

543 

544 if 0 < self.max_tokens_per_doc < len(sentence): 

545 pair.tokens = pair.tokens[: self.max_tokens_per_doc] 

546 

547 data_point = DataPair(first=sentence, second=pair) 

548 

549 else: 

550 data_point = sentence 

551 

552 for column in self.column_name_map: 

553 column_value = row[column] 

554 if ( 

555 self.column_name_map[column].startswith("label") 

556 and column_value 

557 ): 

558 if column_value != self.no_class_label: 

559 data_point.add_label(self.label_type, column_value) 

560 

561 return data_point 

562 

563 def is_in_memory(self) -> bool: 

564 return self.in_memory 

565 

566 def __len__(self): 

567 return self.total_sentence_count 

568 

569 def __getitem__(self, index: int = 0) -> Sentence: 

570 if self.in_memory: 

571 return self.sentences[index] 

572 else: 

573 row = self.raw_data[index] 

574 

575 sentence = self._make_labeled_data_point(row) 

576 

577 return sentence 

578 
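
# Illustrative sketch: if a column is mapped to "pair", each row is turned into a DataPair
# of two Sentences (useful for sentence-pair tasks). File path, column layout and label_type
# below are assumptions.
#
#   from flair.datasets import CSVClassificationDataset
#   dataset = CSVClassificationDataset(
#       "resources/tasks/pairs/train.csv",        # hypothetical file
#       column_name_map={0: "label", 1: "text", 2: "pair"},
#       label_type="entailment",
#       skip_header=True,
#   )
#   data_point = dataset[0]    # a DataPair; data_point.first and data_point.second are Sentences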

579 

580class AMAZON_REVIEWS(ClassificationCorpus): 

581 """ 

582 A very large corpus of Amazon reviews with positivity ratings. Corpus is downloaded from and documented at 

583 https://nijianmo.github.io/amazon/index.html. We download the 5-core subset which is still tens of millions of 

584 reviews. 

585 """ 

586 

587 # noinspection PyDefaultArgument 

588 def __init__( 

589 self, 

590 split_max: int = 30000, 

591 label_name_map: Dict[str, str] = { 

592 '1.0': 'NEGATIVE', 

593 '2.0': 'NEGATIVE', 

594 '3.0': 'NEGATIVE', 

595 '4.0': 'POSITIVE', 

596 '5.0': 'POSITIVE', 

597 }, 

598 skip_labels=['3.0', '4.0'], 

599 fraction_of_5_star_reviews: int = 10, 

600 tokenizer: Tokenizer = SegtokTokenizer(), 

601 memory_mode='partial', 

602 **corpusargs 

603 ): 

604 """ 

605 Constructs corpus object. Split_max indicates how many data points from each of the 28 splits are used, so 

606 set this higher or lower to increase/decrease corpus size. 

607 :param label_name_map: Map label names to different schema. By default, the 5-star rating is mapped onto 2

608 classes (POSITIVE, NEGATIVE); 3- and 4-star reviews are skipped by default.

609 :param split_max: Split_max indicates how many data points from each of the 28 splits are used, so 

610 set this higher or lower to increase/decrease corpus size. 

611 :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'

612 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if

613 even this is too much for your memory, use 'disk'.

614 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

615 :param corpusargs: Arguments for ClassificationCorpus 

616 """ 

617 

618 # dataset name includes the split size 

619 dataset_name = self.__class__.__name__.lower() + '_' + str(split_max) + '_' + str(fraction_of_5_star_reviews) 

620 

621 # default dataset folder is the cache root 

622 data_folder = flair.cache_root / "datasets" / dataset_name 

623 

624 # download data if necessary 

625 if not (data_folder / "train.txt").is_file(): 

626 # download each of the 28 splits 

627 self.download_and_prepare_amazon_product_file(data_folder, "AMAZON_FASHION_5.json.gz", split_max, 

628 fraction_of_5_star_reviews) 

629 self.download_and_prepare_amazon_product_file(data_folder, "All_Beauty_5.json.gz", split_max, 

630 fraction_of_5_star_reviews) 

631 self.download_and_prepare_amazon_product_file(data_folder, "Appliances_5.json.gz", split_max, 

632 fraction_of_5_star_reviews) 

633 self.download_and_prepare_amazon_product_file(data_folder, "Arts_Crafts_and_Sewing_5.json.gz", split_max, 

634 fraction_of_5_star_reviews) 

637 self.download_and_prepare_amazon_product_file(data_folder, "Automotive_5.json.gz", split_max, 

638 fraction_of_5_star_reviews) 

639 self.download_and_prepare_amazon_product_file(data_folder, "Books_5.json.gz", split_max, 

640 fraction_of_5_star_reviews) 

641 self.download_and_prepare_amazon_product_file(data_folder, "CDs_and_Vinyl_5.json.gz", split_max, 

642 fraction_of_5_star_reviews) 

643 self.download_and_prepare_amazon_product_file(data_folder, "Cell_Phones_and_Accessories_5.json.gz", 

644 split_max, fraction_of_5_star_reviews) 

645 self.download_and_prepare_amazon_product_file(data_folder, "Clothing_Shoes_and_Jewelry_5.json.gz", 

646 split_max, fraction_of_5_star_reviews) 

647 self.download_and_prepare_amazon_product_file(data_folder, "Digital_Music_5.json.gz", split_max, 

648 fraction_of_5_star_reviews) 

649 self.download_and_prepare_amazon_product_file(data_folder, "Electronics_5.json.gz", split_max, 

650 fraction_of_5_star_reviews) 

651 self.download_and_prepare_amazon_product_file(data_folder, "Gift_Cards_5.json.gz", split_max, 

652 fraction_of_5_star_reviews) 

653 self.download_and_prepare_amazon_product_file(data_folder, "Grocery_and_Gourmet_Food_5.json.gz", split_max, 

654 fraction_of_5_star_reviews) 

655 self.download_and_prepare_amazon_product_file(data_folder, "Home_and_Kitchen_5.json.gz", split_max, 

656 fraction_of_5_star_reviews) 

657 self.download_and_prepare_amazon_product_file(data_folder, "Industrial_and_Scientific_5.json.gz", split_max, 

658 fraction_of_5_star_reviews) 

659 self.download_and_prepare_amazon_product_file(data_folder, "Kindle_Store_5.json.gz", split_max, 

660 fraction_of_5_star_reviews) 

661 self.download_and_prepare_amazon_product_file(data_folder, "Luxury_Beauty_5.json.gz", split_max, 

662 fraction_of_5_star_reviews) 

663 self.download_and_prepare_amazon_product_file(data_folder, "Magazine_Subscriptions_5.json.gz", split_max, 

664 fraction_of_5_star_reviews) 

665 self.download_and_prepare_amazon_product_file(data_folder, "Movies_and_TV_5.json.gz", split_max, 

666 fraction_of_5_star_reviews) 

667 self.download_and_prepare_amazon_product_file(data_folder, "Musical_Instruments_5.json.gz", split_max, 

668 fraction_of_5_star_reviews) 

669 self.download_and_prepare_amazon_product_file(data_folder, "Office_Products_5.json.gz", split_max, 

670 fraction_of_5_star_reviews) 

671 self.download_and_prepare_amazon_product_file(data_folder, "Patio_Lawn_and_Garden_5.json.gz", split_max, 

672 fraction_of_5_star_reviews) 

673 self.download_and_prepare_amazon_product_file(data_folder, "Pet_Supplies_5.json.gz", split_max, 

674 fraction_of_5_star_reviews) 

675 self.download_and_prepare_amazon_product_file(data_folder, "Prime_Pantry_5.json.gz", split_max, 

676 fraction_of_5_star_reviews) 

677 self.download_and_prepare_amazon_product_file(data_folder, "Software_5.json.gz", split_max, 

678 fraction_of_5_star_reviews) 

679 self.download_and_prepare_amazon_product_file(data_folder, "Sports_and_Outdoors_5.json.gz", split_max, 

680 fraction_of_5_star_reviews) 

681 self.download_and_prepare_amazon_product_file(data_folder, "Tools_and_Home_Improvement_5.json.gz", 

682 split_max, fraction_of_5_star_reviews) 

683 self.download_and_prepare_amazon_product_file(data_folder, "Toys_and_Games_5.json.gz", split_max, 

684 fraction_of_5_star_reviews) 

685 self.download_and_prepare_amazon_product_file(data_folder, "Video_Games_5.json.gz", split_max, 

686 fraction_of_5_star_reviews) 

687 

688 super(AMAZON_REVIEWS, self).__init__( 

689 data_folder, 

690 label_type='sentiment', 

691 label_name_map=label_name_map, 

692 skip_labels=skip_labels, 

693 tokenizer=tokenizer, 

694 memory_mode=memory_mode, 

695 **corpusargs 

696 ) 

697 

698 def download_and_prepare_amazon_product_file(self, data_folder, part_name, max_data_points=None, 

699 fraction_of_5_star_reviews=None): 

700 amazon__path = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall" 

701 cached_path(f"{amazon__path}/{part_name}", Path("datasets") / 'Amazon_Product_Reviews') 

702 import gzip 

703 # create dataset directory if necessary 

704 if not os.path.exists(data_folder): 

705 os.makedirs(data_folder) 

706 with open(data_folder / "train.txt", "a") as train_file: 

707 

708 write_count = 0 

709 review_5_count = 0 

710 # read the downloaded gzip file and convert reviews to FastText format

711 with gzip.open(flair.cache_root / "datasets" / 'Amazon_Product_Reviews' / part_name, "rb", ) as f_in: 

712 for line in f_in: 

713 parsed_json = json.loads(line) 

714 if 'reviewText' not in parsed_json: 

715 continue 

716 if parsed_json['reviewText'].strip() == '': 

717 continue 

718 text = parsed_json['reviewText'].replace('\n', '') 

719 

720 if fraction_of_5_star_reviews and str(parsed_json['overall']) == '5.0': 

721 review_5_count += 1 

722 if review_5_count != fraction_of_5_star_reviews: 

723 continue 

724 else: 

725 review_5_count = 0 

726 

727 train_file.write(f"__label__{parsed_json['overall']} {text}\n") 

728 

729 write_count += 1 

730 if max_data_points and write_count >= max_data_points: 

731 break 

732 
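
# Illustrative sketch: building a smaller Amazon corpus. split_max caps how many reviews are
# written per category file, and fraction_of_5_star_reviews keeps roughly every n-th 5-star
# review to reduce class imbalance; the values below are arbitrary.
#
#   corpus = AMAZON_REVIEWS(split_max=5000, fraction_of_5_star_reviews=10, memory_mode="disk")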

733 

734class IMDB(ClassificationCorpus): 

735 """ 

736 Corpus of IMDB movie reviews labeled by sentiment (POSITIVE, NEGATIVE). Downloaded from and documented at 

737 http://ai.stanford.edu/~amaas/data/sentiment/. 

738 """ 

739 

740 def __init__(self, 

741 base_path: Union[str, Path] = None, 

742 rebalance_corpus: bool = True, 

743 tokenizer: Tokenizer = SegtokTokenizer(), 

744 memory_mode='partial', 

745 **corpusargs): 

746 """ 

747 

748 :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default. 

749 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

750 :param rebalance_corpus: The default 50/50 train/test split of this corpus is impractical.

751 With rebalance_corpus=True (default setting), the corpus is rebalanced to an 80/10/10 train/dev/test split. If you

752 want to use the original splits, set this to False.

753 :param memory_mode: Set to 'partial' by default because this is a huge corpus, but you can also set it to 'full' for faster

754 processing or 'disk' for less memory.

755 :param corpusargs: Other args for ClassificationCorpus. 

756 """ 

757 

758 if type(base_path) == str: 

759 base_path: Path = Path(base_path) 

760 

761 # this dataset name 

762 dataset_name = self.__class__.__name__.lower() + '_v4' 

763 

764 # default dataset folder is the cache root 

765 if not base_path: 

766 base_path = flair.cache_root / "datasets" 

767 

768 # download data if necessary 

769 imdb_acl_path = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" 

770 

771 if rebalance_corpus: 

772 dataset_name = dataset_name + '-rebalanced' 

773 data_folder = base_path / dataset_name 

774 data_path = flair.cache_root / "datasets" / dataset_name 

775 train_data_file = data_path / "train.txt" 

776 test_data_file = data_path / "test.txt" 

777 

778 if not train_data_file.is_file() or (not rebalance_corpus and not test_data_file.is_file()):

779 [os.remove(file_path) for file_path in [train_data_file, test_data_file] if file_path.is_file()] 

780 

781 cached_path(imdb_acl_path, Path("datasets") / dataset_name) 

782 import tarfile 

783 

784 with tarfile.open( 

785 flair.cache_root 

786 / "datasets" 

787 / dataset_name 

788 / "aclImdb_v1.tar.gz", 

789 "r:gz", 

790 ) as f_in: 

791 datasets = ["train", "test"] 

792 labels = ["pos", "neg"] 

793 

794 for label in labels: 

795 for dataset in datasets: 

796 f_in.extractall( 

797 data_path, 

798 members=[ 

799 m 

800 for m in f_in.getmembers() 

801 if f"{dataset}/{label}" in m.name 

802 ], 

803 ) 

804 data_file = train_data_file 

805 if not rebalance_corpus and dataset == "test":

806 data_file = test_data_file 

807 

808 with open(data_file, "at") as f_p: 

809 current_path = data_path / "aclImdb" / dataset / label 

810 for file_name in current_path.iterdir(): 

811 if file_name.is_file() and file_name.name.endswith( 

812 ".txt" 

813 ): 

814 if label == "pos": sentiment_label = 'POSITIVE' 

815 if label == "neg": sentiment_label = 'NEGATIVE' 

816 f_p.write( 

817 f"__label__{sentiment_label} " 

818 + file_name.open("rt", encoding="utf-8").read() 

819 + "\n" 

820 ) 

821 

822 super(IMDB, self).__init__( 

823 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs 

824 ) 

825 
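
# Illustrative sketch: the default (rebalanced) corpus merges the original 50/50 train/test
# data into one train file and samples dev/test splits from it; with rebalance_corpus=False
# the original aclImdb train/test files are kept.
#
#   corpus = IMDB()                                  # 80/10/10 rebalanced splits
#   corpus_original = IMDB(rebalance_corpus=False)   # original train/test splits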

826 

827class NEWSGROUPS(ClassificationCorpus): 

828 """ 

829 20 newsgroups corpus available at "http://qwone.com/~jason/20Newsgroups", classifying 

830 news items into one of 20 categories. Each data point is a full news article so documents may be very long. 

831 """ 

832 

833 def __init__(self, 

834 base_path: Union[str, Path] = None, 

835 tokenizer: Tokenizer = SegtokTokenizer(), 

836 memory_mode: str = 'partial', 

837 **corpusargs 

838 ): 

839 """ 

840 Instantiates 20 newsgroups corpus. 

841 :param base_path: Provide this only if you store the NEWSGROUPS corpus in a specific folder, otherwise use default.

842 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)

843 :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster

844 processing or 'disk' for less memory.

845 :param corpusargs: Other args for ClassificationCorpus. 

846 """ 

847 

848 if type(base_path) == str: 

849 base_path: Path = Path(base_path) 

850 

851 # this dataset name 

852 dataset_name = self.__class__.__name__.lower() 

853 

854 # default dataset folder is the cache root 

855 if not base_path: 

856 base_path = flair.cache_root / "datasets" 

857 data_folder = base_path / dataset_name 

858 

859 # download data if necessary 

860 twenty_newsgroups_path = ( 

861 "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz" 

862 ) 

863 data_path = flair.cache_root / "datasets" / dataset_name 

864 data_file = data_path / "20news-bydate-train.txt" 

865 if not data_file.is_file(): 

866 cached_path( 

867 twenty_newsgroups_path, Path("datasets") / dataset_name / "original" 

868 ) 

869 

870 import tarfile 

871 

872 with tarfile.open( 

873 flair.cache_root 

874 / "datasets" 

875 / dataset_name 

876 / "original" 

877 / "20news-bydate.tar.gz", 

878 "r:gz", 

879 ) as f_in: 

880 datasets = ["20news-bydate-test", "20news-bydate-train"] 

881 labels = [ 

882 "alt.atheism", 

883 "comp.graphics", 

884 "comp.os.ms-windows.misc", 

885 "comp.sys.ibm.pc.hardware", 

886 "comp.sys.mac.hardware", 

887 "comp.windows.x", 

888 "misc.forsale", 

889 "rec.autos", 

890 "rec.motorcycles", 

891 "rec.sport.baseball", 

892 "rec.sport.hockey", 

893 "sci.crypt", 

894 "sci.electronics", 

895 "sci.med", 

896 "sci.space", 

897 "soc.religion.christian", 

898 "talk.politics.guns", 

899 "talk.politics.mideast", 

900 "talk.politics.misc", 

901 "talk.religion.misc", 

902 ] 

903 

904 for label in labels: 

905 for dataset in datasets: 

906 f_in.extractall( 

907 data_path / "original", 

908 members=[ 

909 m 

910 for m in f_in.getmembers() 

911 if f"{dataset}/{label}" in m.name 

912 ], 

913 ) 

914 with open( 

915 f"{data_path}/{dataset}.txt", "at", encoding="utf-8" 

916 ) as f_p: 

917 current_path = data_path / "original" / dataset / label 

918 for file_name in current_path.iterdir(): 

919 if file_name.is_file(): 

920 f_p.write( 

921 f"__label__{label} " 

922 + file_name.open("rt", encoding="latin1") 

923 .read() 

924 .replace("\n", " <n> ") 

925 + "\n" 

926 ) 

927 

928 super(NEWSGROUPS, self).__init__( 

929 data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

930 ) 

931 

932 

933class SENTIMENT_140(ClassificationCorpus): 

934 """ 

935 Twitter sentiment corpus downloaded from and documented at http://help.sentiment140.com/for-students. Two sentiments 

936 in train data (POSITIVE, NEGATIVE) and three sentiments in test data (POSITIVE, NEGATIVE, NEUTRAL). 

937 """ 

938 

939 def __init__( 

940 self, 

941 label_name_map=None, 

942 tokenizer: Tokenizer = SegtokTokenizer(), 

943 memory_mode: str = 'partial', 

944 **corpusargs, 

945 ): 

946 """ 

947 Instantiates twitter sentiment corpus. 

948 :param label_name_map: By default, the numeric values are mapped to ('NEGATIVE', 'POSITIVE' and 'NEUTRAL') 

949 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

950 :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster

951 processing or 'disk' for less memory.

952 :param corpusargs: Other args for ClassificationCorpus. 

953 """ 

954 

955 # by default, map polarity score to NEGATIVE / NEUTRAL / POSITIVE values

956 if label_name_map is None: 

957 label_name_map = {'0': 'NEGATIVE', 

958 '2': 'NEUTRAL', 

959 '4': 'POSITIVE'} 

960 

961 # this dataset name 

962 dataset_name = self.__class__.__name__.lower() 

963 

964 # default dataset folder is the cache root 

965 data_folder = flair.cache_root / "datasets" / dataset_name 

966 

967 # download data if necessary 

968 if not (data_folder / "train.txt").is_file():

969 

970 # download the sentiment140 dataset if necessary and unzip

971 sentiment_url = "https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip" 

972 cached_path(sentiment_url, Path("datasets") / dataset_name / 'raw') 

973 senteval_folder = flair.cache_root / "datasets" / dataset_name / 'raw' 

974 unzip_file(senteval_folder / "trainingandtestdata.zip", senteval_folder) 

975 

976 # create dataset directory if necessary 

977 if not os.path.exists(data_folder): 

978 os.makedirs(data_folder) 

979 

980 # create train.txt file from CSV 

981 with open(data_folder / "train.txt", "w") as train_file: 

982 

983 with open(senteval_folder / "training.1600000.processed.noemoticon.csv", 

984 encoding='latin-1') as csv_train: 

985 csv_reader = csv.reader(csv_train) 

986 

987 for row in csv_reader: 

988 label = row[0] 

989 text = row[5] 

990 train_file.write(f"__label__{label} {text}\n") 

991 

992 # create test.txt file from CSV 

993 with open(data_folder / "test.txt", "w") as test_file:

994 

995 with open(senteval_folder / "testdata.manual.2009.06.14.csv", encoding='latin-1') as csv_train: 

996 csv_reader = csv.reader(csv_train) 

997 

998 for row in csv_reader: 

999 label = row[0] 

1000 text = row[5] 

1001 test_file.write(f"__label__{label} {text}\n")

1002 

1003 super(SENTIMENT_140, self).__init__( 

1004 data_folder, label_type='sentiment', tokenizer=tokenizer, 

1005 memory_mode=memory_mode, label_name_map=label_name_map, **corpusargs, 

1006 ) 

1007 
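
# Illustrative sketch: the numeric polarity scores from the CSV are remapped via
# label_name_map at parse time, so labels arrive as NEGATIVE / NEUTRAL / POSITIVE.
#
#   corpus = SENTIMENT_140(memory_mode="disk")   # large corpus; 'disk' keeps memory use low
#   print(corpus.train[0].get_labels("sentiment"))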

1008 

1009class SENTEVAL_CR(ClassificationCorpus): 

1010 """ 

1011 The customer reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into 

1012 NEGATIVE or POSITIVE sentiment. 

1013 """ 

1014 

1015 def __init__( 

1016 self, 

1017 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1018 memory_mode: str = 'full', 

1019 **corpusargs, 

1020 ): 

1021 """ 

1022 Instantiates SentEval customer reviews dataset. 

1023 :param corpusargs: Other args for ClassificationCorpus. 

1024 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer()) 

1025 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1026 """ 

1027 

1028 # this dataset name 

1029 dataset_name = self.__class__.__name__.lower() 

1030 

1031 # default dataset folder is the cache root 

1032 data_folder = flair.cache_root / "datasets" / dataset_name 

1033 

1034 # download data if necessary 

1035 if not (data_folder / "train.txt").is_file(): 

1036 

1037 # download senteval datasets if necessary and unzip

1038 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip" 

1039 cached_path(senteval_path, Path("datasets") / "senteval") 

1040 senteval_folder = flair.cache_root / "datasets" / "senteval" 

1041 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder) 

1042 

1043 # create dataset directory if necessary 

1044 if not os.path.exists(data_folder): 

1045 os.makedirs(data_folder) 

1046 

1047 # create train.txt file by iterating over pos and neg file 

1048 with open(data_folder / "train.txt", "a") as train_file: 

1049 

1050 with open(senteval_folder / "data" / "customerr" / "custrev.pos", encoding="latin1") as file: 

1051 for line in file: 

1052 train_file.write(f"__label__POSITIVE {line}") 

1053 

1054 with open(senteval_folder / "data" / "customerr" / "custrev.neg", encoding="latin1") as file: 

1055 for line in file: 

1056 train_file.write(f"__label__NEGATIVE {line}") 

1057 

1058 super(SENTEVAL_CR, self).__init__( 

1059 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1060 ) 

1061 
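
# Illustrative sketch: the SentEval corpora ship only a single train file, so dev and test
# splits are sampled from it by the Corpus base class (sample_missing_splits defaults to True).
#
#   corpus = SENTEVAL_CR()
#   print(corpus)   # reports the sizes of the sampled train/dev/test splits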

1062 

1063class SENTEVAL_MR(ClassificationCorpus): 

1064 """ 

1065 The movie reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into 

1066 NEGATIVE or POSITIVE sentiment. 

1067 """ 

1068 

1069 def __init__( 

1070 self, 

1071 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1072 memory_mode: str = 'full', 

1073 **corpusargs 

1074 ): 

1075 """ 

1076 Instantiates SentEval movie reviews dataset. 

1077 :param corpusargs: Other args for ClassificationCorpus. 

1078 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1079 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1080 """ 

1081 

1082 # this dataset name 

1083 dataset_name = self.__class__.__name__.lower() 

1084 

1085 # default dataset folder is the cache root 

1086 data_folder = flair.cache_root / "datasets" / dataset_name 

1087 

1088 # download data if necessary 

1089 if not (data_folder / "train.txt").is_file(): 

1090 

1091 # download senteval datasets if necessary and unzip

1092 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip" 

1093 cached_path(senteval_path, Path("datasets") / "senteval") 

1094 senteval_folder = flair.cache_root / "datasets" / "senteval" 

1095 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder) 

1096 

1097 # create dataset directory if necessary 

1098 if not os.path.exists(data_folder): 

1099 os.makedirs(data_folder) 

1100 

1101 # create train.txt file by iterating over pos and neg file 

1102 with open(data_folder / "train.txt", "a") as train_file: 

1103 

1104 with open(senteval_folder / "data" / "rt10662" / "rt-polarity.pos", encoding="latin1") as file: 

1105 for line in file: 

1106 train_file.write(f"__label__POSITIVE {line}") 

1107 

1108 with open(senteval_folder / "data" / "rt10662" / "rt-polarity.neg", encoding="latin1") as file: 

1109 for line in file: 

1110 train_file.write(f"__label__NEGATIVE {line}") 

1111 

1112 super(SENTEVAL_MR, self).__init__( 

1113 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs 

1114 ) 

1115 

1116 

1117class SENTEVAL_SUBJ(ClassificationCorpus): 

1118 """ 

1119 The subjectivity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into 

1120 SUBJECTIVE or OBJECTIVE sentiment. 

1121 """ 

1122 

1123 def __init__( 

1124 self, 

1125 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1126 memory_mode: str = 'full', 

1127 **corpusargs, 

1128 ): 

1129 """ 

1130 Instantiates SentEval subjectivity dataset. 

1131 :param corpusargs: Other args for ClassificationCorpus. 

1132 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1133 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1134 """ 

1135 

1136 # this dataset name 

1137 dataset_name = self.__class__.__name__.lower() 

1138 

1139 # default dataset folder is the cache root 

1140 data_folder = flair.cache_root / "datasets" / dataset_name 

1141 

1142 # download data if necessary 

1143 if not (data_folder / "train.txt").is_file(): 

1144 

1145 # download senteval datasets if necessary and unzip

1146 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip" 

1147 cached_path(senteval_path, Path("datasets") / "senteval") 

1148 senteval_folder = flair.cache_root / "datasets" / "senteval" 

1149 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder) 

1150 

1151 # create dataset directory if necessary 

1152 if not os.path.exists(data_folder): 

1153 os.makedirs(data_folder) 

1154 

1155 # create train.txt file by iterating over pos and neg file 

1156 with open(data_folder / "train.txt", "a") as train_file: 

1157 

1158 with open(senteval_folder / "data" / "subj" / "subj.subjective", encoding="latin1") as file: 

1159 for line in file: 

1160 train_file.write(f"__label__SUBJECTIVE {line}") 

1161 

1162 with open(senteval_folder / "data" / "subj" / "subj.objective", encoding="latin1") as file: 

1163 for line in file: 

1164 train_file.write(f"__label__OBJECTIVE {line}") 

1165 

1166 super(SENTEVAL_SUBJ, self).__init__( 

1167 data_folder, label_type='objectivity', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1168 ) 

1169 

1170 

1171class SENTEVAL_MPQA(ClassificationCorpus): 

1172 """ 

1173 The opinion-polarity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into 

1174 NEGATIVE or POSITIVE polarity. 

1175 """ 

1176 

1177 def __init__( 

1178 self, 

1179 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1180 memory_mode: str = 'full', 

1181 **corpusargs, 

1182 ): 

1183 """ 

1184 Instantiates SentEval opinion polarity dataset. 

1185 :param corpusargs: Other args for ClassificationCorpus. 

1186 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1187 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1188 """ 

1189 

1190 # this dataset name 

1191 dataset_name = self.__class__.__name__.lower() 

1192 

1193 # default dataset folder is the cache root 

1194 data_folder = flair.cache_root / "datasets" / dataset_name 

1195 

1196 # download data if necessary 

1197 if not (data_folder / "train.txt").is_file(): 

1198 

1199 # download senteval datasets if necessary and unzip

1200 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip" 

1201 cached_path(senteval_path, Path("datasets") / "senteval") 

1202 senteval_folder = flair.cache_root / "datasets" / "senteval" 

1203 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder) 

1204 

1205 # create dataset directory if necessary 

1206 if not os.path.exists(data_folder): 

1207 os.makedirs(data_folder) 

1208 

1209 # create train.txt file by iterating over pos and neg file 

1210 with open(data_folder / "train.txt", "a") as train_file: 

1211 

1212 with open(senteval_folder / "data" / "mpqa" / "mpqa.pos", encoding="latin1") as file: 

1213 for line in file: 

1214 train_file.write(f"__label__POSITIVE {line}") 

1215 

1216 with open(senteval_folder / "data" / "mpqa" / "mpqa.neg", encoding="latin1") as file: 

1217 for line in file: 

1218 train_file.write(f"__label__NEGATIVE {line}") 

1219 

1220 super(SENTEVAL_MPQA, self).__init__( 

1221 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1222 ) 

1223 

1224 

1225class SENTEVAL_SST_BINARY(ClassificationCorpus): 

1226 """ 

1227 The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified 

1228 into NEGATIVE or POSITIVE sentiment. 

1229 """ 

1230 

1231 def __init__( 

1232 self, 

1233 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1234 memory_mode: str = 'full', 

1235 **corpusargs, 

1236 ): 

1237 """ 

1238 Instantiates SentEval Stanford sentiment treebank dataset. 

1239 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1240 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1241 :param corpusargs: Other args for ClassificationCorpus. 

1242 """ 

1243 

1244 # this dataset name 

1245 dataset_name = self.__class__.__name__.lower() + '_v2' 

1246 

1247 # default dataset folder is the cache root 

1248 data_folder = flair.cache_root / "datasets" / dataset_name 

1249 

1250 # download data if necessary 

1251 if not (data_folder / "train.txt").is_file(): 

1252 

1253 # download senteval datasets if necessary and unzip

1254 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-train', 

1255 Path("datasets") / dataset_name / 'raw') 

1256 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-test', 

1257 Path("datasets") / dataset_name / 'raw') 

1258 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-dev', 

1259 Path("datasets") / dataset_name / 'raw') 

1260 

1261 original_filenames = ["sentiment-train", "sentiment-dev", "sentiment-test"] 

1262 new_filenames = ["train.txt", "dev.txt", "test.txt"] 

1263 

1264 # create train dev and test files in fasttext format 

1265 for new_filename, original_filename in zip(new_filenames, original_filenames): 

1266 with open(data_folder / new_filename, "a") as out_file, open( 

1267 data_folder / 'raw' / original_filename) as in_file: 

1268 for line in in_file: 

1269 fields = line.split('\t') 

1270 label = 'POSITIVE' if fields[1].rstrip() == '1' else 'NEGATIVE' 

1271 out_file.write(f"__label__{label} {fields[0]}\n") 

1272 

1273 super(SENTEVAL_SST_BINARY, self).__init__( 

1274 data_folder, 

1275 tokenizer=tokenizer, 

1276 memory_mode=memory_mode, 

1277 **corpusargs, 

1278 ) 

1279 

1280 

1281class SENTEVAL_SST_GRANULAR(ClassificationCorpus): 

1282 """ 

1283 The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified 

1284 into 5 sentiment classes. 

1285 """ 

1286 

1287 def __init__( 

1288 self, 

1289 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1290 memory_mode: str = 'full', 

1291 **corpusargs, 

1292 ): 

1293 """ 

1294 Instantiates SentEval Stanford sentiment treebank dataset. 

1295 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.

1296 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1297 :param corpusargs: Other args for ClassificationCorpus. 

1298 """ 

1299 

1300 # this dataset name 

1301 dataset_name = self.__class__.__name__.lower() 

1302 

1303 # default dataset folder is the cache root 

1304 data_folder = flair.cache_root / "datasets" / dataset_name 

1305 

1306 # download data if necessary 

1307 if not (data_folder / "train.txt").is_file(): 

1308 

1309 # download senteval datasets if necessary and unzip

1310 cached_path( 

1311 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.train', 

1312 Path("datasets") / dataset_name / 'raw') 

1313 cached_path( 

1314 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.test', 

1315 Path("datasets") / dataset_name / 'raw') 

1316 cached_path( 

1317 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.dev', 

1318 Path("datasets") / dataset_name / 'raw') 

1319 

1320 # convert to FastText format 

1321 for split in ['train', 'dev', 'test']: 

1322 with open(data_folder / f"{split}.txt", "w") as train_file: 

1323 

1324 with open(data_folder / 'raw' / f'stsa.fine.{split}', encoding="latin1") as file: 

1325 for line in file: 

1326 train_file.write(f"__label__{line[0]} {line[2:]}") 

1327 

1328 super(SENTEVAL_SST_GRANULAR, self).__init__( 

1329 data_folder, 

1330 tokenizer=tokenizer, 

1331 memory_mode=memory_mode, 

1332 **corpusargs, 

1333 ) 

1334 

1335 

1336class GLUE_COLA(ClassificationCorpus): 

1337 """ 

1338 Corpus of Linguistic Acceptability from GLUE benchmark (https://gluebenchmark.com/tasks). 

1339 The task is to predict whether an English sentence is grammatically correct. 

1340 Additionally to the Corpus, an eval_dataset attribute holds the unlabeled test data for GLUE evaluation.

1341 """ 

1342 

1343 def __init__(self, 

1344 label_type="acceptability", 

1345 base_path: Union[str, Path] = None, 

1346 tokenizer: Tokenizer = SegtokTokenizer(), 

1347 **corpusargs): 

1348 """ 

1349 Instantiates CoLA dataset 

1350 :param base_path: Provide this only if you store the COLA corpus in a specific folder. 

1351 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

1352 :param corpusargs: Other args for ClassificationCorpus. 

1353 """ 

1354 

1355 if type(base_path) == str: 

1356 base_path: Path = Path(base_path) 

1357 

1358 dataset_name = "glue" 

1359 

1360 # if no base_path provided take cache root 

1361 if not base_path: 

1362 base_path = flair.cache_root / "datasets" 

1363 data_folder = base_path / dataset_name 

1364 

1365 # download data if necessary 

1366 cola_path = "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip" 

1367 

1368 data_file = data_folder / "CoLA/train.txt" 

1369 

1370 # if data is not downloaded yet, download it 

1371 if not data_file.is_file(): 

1372 # get the zip file 

1373 zipped_data_path = cached_path(cola_path, Path("datasets") / dataset_name) 

1374 

1375 unpack_file(zipped_data_path, data_folder, mode="zip", keep=False) 

1376 

1377 # move original .tsv files to another folder 

1378 Path(data_folder / "CoLA/train.tsv").rename(data_folder / "CoLA/original/train.tsv") 

1379 Path(data_folder / "CoLA/dev.tsv").rename(data_folder / "CoLA/original/dev.tsv") 

1380 Path(data_folder / "CoLA/test.tsv").rename(data_folder / "CoLA/original/test.tsv") 

1381 

1382 label_map = {0: 'not_grammatical', 1: 'grammatical'} 

1383 

1384 # create train and dev splits in fasttext format 

1385 for split in ["train", "dev"]: 

1386 with open(data_folder / "CoLA" / (split + ".txt"), "a") as out_file, open( 

1387 data_folder / "CoLA" / "original" / (split + ".tsv")) as in_file: 

1388 for line in in_file: 

1389 fields = line.rstrip().split('\t') 

1390 label = int(fields[1]) 

1391 sentence = fields[3] 

1392 out_file.write(f"__label__{label_map[label]} {sentence}\n") 

1393 

1394 # create eval_dataset file with no labels 

1395 with open(data_folder / "CoLA" / "eval_dataset.txt", "a") as out_file, open( 

1396 data_folder / "CoLA" / "original" / "test.tsv",) as in_file: 

1397 for line in in_file: 

1398 fields = line.rstrip().split('\t') 

1399 sentence = fields[1] 

1400 out_file.write(f"{sentence}\n") 

1401 

1402 super(GLUE_COLA, self).__init__( 

1403 data_folder / "CoLA", 

1404 label_type=label_type, 

1405 tokenizer=tokenizer, 

1406 **corpusargs, 

1407 ) 

1408 

1409 self.eval_dataset = ClassificationDataset( 

1410 data_folder / "CoLA/eval_dataset.txt", 

1411 label_type=label_type, 

1412 allow_examples_without_labels=True, 

1413 tokenizer=tokenizer, 

1414 memory_mode="full", 

1415 ) 

1416 

1417 """ 

1418 This function creates a tsv file with predictions of the eval_dataset (after calling  

1419 classifier.predict(corpus.eval_dataset, label_name='acceptability')). The resulting file  

1420 is called CoLA.tsv and is in the format required for submission to the GLUE benchmark.

1421 """ 

1422 

1423 def tsv_from_eval_dataset(self, folder_path: Union[str, Path]): 

1424 

1425 if type(folder_path) == str: 

1426 folder_path = Path(folder_path) 

1427 folder_path = folder_path / 'CoLA.tsv' 

1428 

1429 with open(folder_path, mode='w') as tsv_file: 

1430 tsv_file.write("index\tprediction\n") 

1431 for index, datapoint in enumerate(self.eval_dataset): 

1432 reverse_label_map = {'grammatical': 1, 'not_grammatical': 0} 

1433 predicted_label = reverse_label_map[datapoint.get_labels('acceptability')[0].value] 

1434 tsv_file.write(str(index) + '\t' + str(predicted_label) + '\n')

1435 
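
# Illustrative workflow sketch for producing a GLUE submission file. The model path below is a
# placeholder; any flair TextClassifier trained to predict the 'acceptability' label works.
#
#   from flair.models import TextClassifier
#   corpus = GLUE_COLA()
#   classifier = TextClassifier.load("path/to/trained-cola-model.pt")    # hypothetical path
#   classifier.predict(corpus.eval_dataset, label_name="acceptability")
#   corpus.tsv_from_eval_dataset("glue_submission")                      # writes glue_submission/CoLA.tsv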

1436 

1437class GO_EMOTIONS(ClassificationCorpus): 

1438 """ 

1439 GoEmotions dataset containing 58k Reddit comments labeled with 27 emotion categories plus NEUTRAL, see https://github.com/google-research/google-research/tree/master/goemotions

1440 """ 

1441 def __init__( 

1442 self, 

1443 base_path: Union[str, Path] = None, 

1444 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SegtokTokenizer(), 

1445 memory_mode: str = 'partial', 

1446 **corpusargs, 

1447 ): 

1448 """ 

1449 Parameters 

1450 ---------- 

1451 base_path : Provide this only if you want to store the corpus in a specific folder, otherwise use default. 

1452 tokenizer : Default is SegtokTokenizer(). 

1453 memory_mode : Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'

1454 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if

1455 even this is too much for your memory, use 'disk'.

1456 **corpusargs : Other args for ClassificationCorpus. 

1457 

1458 """ 

1459 

1460 label_name_map = {'0': 'ADMIRATION', 

1461 '1': 'AMUSEMENT', 

1462 '2': 'ANGER', 

1463 '3': 'ANNOYANCE', 

1464 '4': 'APPROVAL', 

1465 '5': 'CARING', 

1466 '6': 'CONFUSION', 

1467 '7': 'CURIOSITY', 

1468 '8': 'DESIRE', 

1469 '9': 'DISAPPOINTMENT', 

1470 '10': 'DISAPPROVAL', 

1471 '11': 'DISGUST', 

1472 '12': 'EMBARRASSMENT', 

1473 '13': 'EXCITEMENT', 

1474 '14': 'FEAR', 

1475 '15': 'GRATITUDE', 

1476 '16': 'GRIEF', 

1477 '17': 'JOY', 

1478 '18': 'LOVE', 

1479 '19': 'NERVOUSNESS', 

1480 '20': 'OPTIMISM', 

1481 '21': 'PRIDE', 

1482 '22': 'REALIZATION', 

1483 '23': 'RELIEF', 

1484 '24': 'REMORSE', 

1485 '25': 'SADNESS', 

1486 '26': 'SURPRISE', 

1487 '27': 'NEUTRAL'} 

1488 

1489 if type(base_path) == str: 

1490 base_path: Path = Path(base_path) 

1491 

1492 # default dataset folder is the cache root 

1493 if not base_path: 

1494 base_path = flair.cache_root / "datasets" 

1495 

1496 # this dataset name 

1497 dataset_name = self.__class__.__name__.lower() 

1498 

1499 # default dataset folder is the cache root 

1500 data_folder = base_path / dataset_name 

1501 

1502 # download data if necessary 

1503 if not (data_folder / "train.txt").is_file(): 

1504 

1505 # download datasets if necessary 

1506 goemotions_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/" 

1507 for name in ["train.tsv", "test.tsv", "dev.tsv"]: 

1508 cached_path(goemotions_url + name, Path("datasets") / dataset_name / 'raw') 

1509 

1510 # create dataset directory if necessary 

1511 if not os.path.exists(data_folder): 

1512 os.makedirs(data_folder) 

1513 

1514 data_path = flair.cache_root / "datasets" / dataset_name / 'raw' 

1515 # create correctly formatted txt files 

1516 for name in ["train", "test", "dev"]: 

1517 with open(data_folder / (name + '.txt'), "w", encoding='utf-8') as txt_file: 

1518 with open(data_path / (name + ".tsv"), "r", encoding='utf-8') as tsv_file: 

1519 

1520 lines = tsv_file.readlines() 

1521 for line in lines: 

1522 row = line.split('\t') 

1523 text = row[0] 

1524 # multiple labels are possible 

1525 labels = row[1].split(',') 

1526 label_string = "" 

1527 for label in labels: 

1528 label_string += '__label__' 

1529 label_string += label 

1530 label_string += ' ' 

1531 txt_file.write(f"{label_string}{text}\n") 

1532 

1533 super(GO_EMOTIONS, self).__init__( 

1534 data_folder, label_type='emotion', tokenizer=tokenizer, 

1535 memory_mode=memory_mode, label_name_map=label_name_map, **corpusargs, 

1536 ) 
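
A minimal sketch of the multi-label conversion performed above, applied to one raw GoEmotions TSV line; the text and label ids are illustrative, and the assumed raw layout is: text, comma-separated label ids, comment id.

    line = "That was a great movie!\t0,13\tillustrative_comment_id"
    row = line.split('\t')
    text, labels = row[0], row[1].split(',')
    label_string = ''.join(f'__label__{label} ' for label in labels)
    print(f"{label_string}{text}")
    # -> __label__0 __label__13 That was a great movie!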

1537 

1538 

1539class TREC_50(ClassificationCorpus): 

1540 """ 

1541 The TREC Question Classification Corpus, classifying questions into 50 fine-grained answer types. 

1542 """ 

1543 

1544 def __init__(self, 

1545 base_path: Union[str, Path] = None, 

1546 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1547 memory_mode='full', 

1548 **corpusargs 

1549 ): 

1550 """ 

1551 Instantiates TREC Question Classification Corpus with 50 fine-grained classes. 

1552 :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default. 

1553 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1554 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'. 

1555 :param corpusargs: Other args for ClassificationCorpus. 

1556 """ 

1557 

1558 if type(base_path) == str: 

1559 base_path: Path = Path(base_path) 

1560 

1561 # this dataset name 

1562 dataset_name = self.__class__.__name__.lower() 

1563 

1564 # default dataset folder is the cache root 

1565 if not base_path: 

1566 base_path = flair.cache_root / "datasets" 

1567 data_folder = base_path / dataset_name 

1568 

1569 # download data if necessary 

1570 trec_path = "https://cogcomp.seas.upenn.edu/Data/QA/QC/" 

1571 

1572 original_filenames = ["train_5500.label", "TREC_10.label"] 

1573 new_filenames = ["train.txt", "test.txt"] 

1574 for original_filename in original_filenames: 

1575 cached_path( 

1576 f"{trec_path}{original_filename}", 

1577 Path("datasets") / dataset_name / "original", 

1578 ) 

1579 

1580 data_file = data_folder / new_filenames[0] 

1581 

1582 if not data_file.is_file(): 

1583 for original_filename, new_filename in zip( 

1584 original_filenames, new_filenames 

1585 ): 

1586 with open( 

1587 data_folder / "original" / original_filename, 

1588 "rt", 

1589 encoding="latin1", 

1590 ) as open_fp: 

1591 with open( 

1592 data_folder / new_filename, "wt", encoding="utf-8" 

1593 ) as write_fp: 

1594 for line in open_fp: 

1595 line = line.rstrip() 

1596 fields = line.split() 

1597 old_label = fields[0] 

1598 question = " ".join(fields[1:]) 

1599 

1600 # Create flair compatible labels 

1601 # TREC-6 : NUM:dist -> __label__NUM 

1602 # TREC-50: NUM:dist -> __label__NUM:dist 

1603 new_label = "__label__" 

1604 new_label += old_label 

1605 

1606 write_fp.write(f"{new_label} {question}\n") 

1607 

1608 super(TREC_50, self).__init__( 

1609 data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1610 ) 

1611 

1612 

1613class TREC_6(ClassificationCorpus): 

1614 """ 

1615 The TREC Question Classification Corpus, classifying questions into 6 coarse-grained answer types 

1616 (DESC, HUM, LOC, ENTY, NUM, ABBR). 

1617 """ 

1618 

1619 def __init__(self, 

1620 base_path: Union[str, Path] = None, 

1621 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1622 memory_mode='full', 

1623 **corpusargs 

1624 ): 

1625 """ 

1626 Instantiates TREC Question Classification Corpus with 6 classes. 

1627 :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default. 

1628 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1629 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'. 

1630 :param corpusargs: Other args for ClassificationCorpus. 

1631 """ 

1632 

1633 if type(base_path) == str: 

1634 base_path: Path = Path(base_path) 

1635 

1636 # this dataset name 

1637 dataset_name = self.__class__.__name__.lower() 

1638 

1639 # default dataset folder is the cache root 

1640 if not base_path: 

1641 base_path = flair.cache_root / "datasets" 

1642 data_folder = base_path / dataset_name 

1643 

1644 # download data if necessary 

1645 trec_path = "https://cogcomp.seas.upenn.edu/Data/QA/QC/" 

1646 

1647 original_filenames = ["train_5500.label", "TREC_10.label"] 

1648 new_filenames = ["train.txt", "test.txt"] 

1649 for original_filename in original_filenames: 

1650 cached_path( 

1651 f"{trec_path}{original_filename}", 

1652 Path("datasets") / dataset_name / "original", 

1653 ) 

1654 

1655 data_file = data_folder / new_filenames[0] 

1656 

1657 if not data_file.is_file(): 

1658 for original_filename, new_filename in zip( 

1659 original_filenames, new_filenames 

1660 ): 

1661 with open( 

1662 data_folder / "original" / original_filename, 

1663 "rt", 

1664 encoding="latin1", 

1665 ) as open_fp: 

1666 with open( 

1667 data_folder / new_filename, "wt", encoding="utf-8" 

1668 ) as write_fp: 

1669 for line in open_fp: 

1670 line = line.rstrip() 

1671 fields = line.split() 

1672 old_label = fields[0] 

1673 question = " ".join(fields[1:]) 

1674 

1675 # Create flair compatible labels 

1676 # TREC-6 : NUM:dist -> __label__NUM 

1677 # TREC-50: NUM:dist -> __label__NUM:dist 

1678 new_label = "__label__" 

1679 new_label += old_label.split(":")[0] 

1680 

1681 write_fp.write(f"{new_label} {question}\n") 

1682 

1683 super(TREC_6, self).__init__( 

1684 data_folder, label_type='question_class', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1685 ) 
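
A minimal sketch of the label rewriting performed by TREC_50 and TREC_6 above, applied to one raw line; the example question is illustrative of the original "LABEL:sublabel question" format.

    line = "NUM:dist How far is it from Denver to Aspen ?"
    fields = line.split()
    old_label, question = fields[0], " ".join(fields[1:])
    print(f"__label__{old_label} {question}")                 # TREC_50 keeps the fine-grained label
    print(f"__label__{old_label.split(':')[0]} {question}")   # TREC_6 keeps only the coarse part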

1686 

1687 

1688class YAHOO_ANSWERS(ClassificationCorpus): 

1689 """ 

1690 The Yahoo! Answers Question Classification Corpus, classifying questions into 10 coarse-grained topic categories. 

1691 """ 

1692 

1693 def __init__(self, 

1694 base_path: Union[str, Path] = None, 

1695 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1696 memory_mode='partial', 

1697 **corpusargs 

1698 ): 

1699 """ 

1700 Instantiates YAHOO Question Classification Corpus with 10 classes. 

1701 :param base_path: Provide this only if you store the YAHOO corpus in a specific folder, otherwise use default. 

1702 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1703 :param memory_mode: Set to 'partial' by default since this is a rather big corpus. Can also be 'full' or 'disk'. 

1704 :param corpusargs: Other args for ClassificationCorpus. 

1705 """ 

1706 

1707 if type(base_path) == str: 

1708 base_path: Path = Path(base_path) 

1709 

1710 # this dataset name 

1711 dataset_name = self.__class__.__name__.lower() 

1712 

1713 # default dataset folder is the cache root 

1714 if not base_path: 

1715 base_path = flair.cache_root / "datasets" 

1716 data_folder = base_path / dataset_name 

1717 

1718 # download data if necessary 

1719 url = "https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz" 

1720 

1721 label_map = {'1': 'Society_&_Culture', 

1722 '2': 'Science_&_Mathematics', 

1723 '3': 'Health', 

1724 '4': 'Education_&_Reference', 

1725 '5': 'Computers_&_Internet', 

1726 '6': 'Sports', 

1727 '7': 'Business_&_Finance', 

1728 '8': 'Entertainment_&_Music', 

1729 '9': 'Family_&_Relationships', 

1730 '10': 'Politics_&_Government'} 

1731 

1732 original = flair.cache_root / "datasets" / dataset_name / "original" 

1733 

1734 if not (data_folder / "train.txt").is_file(): 

1735 cached_path(url, original) 

1736 

1737 

1738 import tarfile 

1739 

1740 tar = tarfile.open(original / "yahoo_answers_csv.tgz", "r:gz") 

1741 members = [] 

1742 

1743 for member in tar.getmembers(): 

1744 if("test.csv" in member.name or "train.csv" in member.name): 

1745 members.append(member) 

1746 

1747 tar.extractall(original, members=members) 

1748 

1749 for name in ["train", "test"]: 

1750 file = open(original / "yahoo_answers_csv" / (name + ".csv"), "rt", encoding="utf-8") 

1751 reader = csv.reader(file) 

1752 writer = open(data_folder / (name+".txt"), "wt", encoding="utf-8") 

1753 for row in reader: 

1754 writer.write("__label__"+label_map.get(row[0])+" "+row[1]+"\n") 

1755 

1756 file.close() 

1757 writer.close() 

1758 

1759 super(YAHOO_ANSWERS, self).__init__( 

1760 data_folder, label_type='question_type', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1761 ) 
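
A minimal sketch of the CSV-to-FastText rewrite performed above, applied to one raw Yahoo! Answers row; the row content is illustrative and only part of the label map is repeated here.

    import csv, io

    label_map = {'5': 'Computers_&_Internet', '6': 'Sports'}
    row = next(csv.reader(io.StringIO('"5","What is a computer virus?","..."')))
    print("__label__" + label_map.get(row[0]) + " " + row[1])
    # -> __label__Computers_&_Internet What is a computer virus?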

1762 

1763 

1764class GERMEVAL_2018_OFFENSIVE_LANGUAGE(ClassificationCorpus): 

1765 """ 

1766 GermEval 2018 corpus for identification of offensive language. 

1767 Classifying German tweets into 2 coarse-grained categories OFFENSE and OTHER 

1768 or 4 fine-grained categories ABUSE, INSULT, PROFANITY and OTHER. 

1769 """ 

1770 

1771 def __init__(self, 

1772 base_path: Union[str, Path] = None, 

1773 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SegtokTokenizer(), 

1774 memory_mode: str = 'full', 

1775 fine_grained_classes: bool = False, 

1776 **corpusargs): 

1777 """ 

1778 Instantiates GermEval 2018 Offensive Language Classification Corpus. 

1779 :param base_path: Provide this only if you store the Offensive Language corpus in a specific folder, otherwise use default. 

1780 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

1781 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'. 

1782 :param fine_grained_classes: Set to True to load the dataset with 4 fine-grained classes 

1783 :param corpusargs: Other args for ClassificationCorpus. 

1784 """ 

1785 

1786 if type(base_path) == str: 

1787 base_path: Path = Path(base_path) 

1788 

1789 # this dataset name 

1790 dataset_name = self.__class__.__name__.lower() 

1791 

1792 # default dataset folder is the cache root 

1793 if not base_path: 

1794 base_path = flair.cache_root / "datasets" 

1795 data_folder = base_path / dataset_name 

1796 

1797 # download data if necessary 

1798 offlang_path = "https://raw.githubusercontent.com/uds-lsv/GermEval-2018-Data/master/" 

1799 

1800 original_filenames = ["germeval2018.training.txt", "germeval2018.test.txt"] 

1801 new_filenames = ["train.txt", "test.txt"] 

1802 for original_filename in original_filenames: 

1803 cached_path( 

1804 f"{offlang_path}{original_filename}", 

1805 Path("datasets") / dataset_name / "original", 

1806 ) 

1807 

1808 task_setting = "coarse_grained" 

1809 if fine_grained_classes: 

1810 task_setting = "fine_grained" 

1811 

1812 task_folder = data_folder / task_setting 

1813 data_file = task_folder / new_filenames[0] 

1814 

1815 # create a separate directory for different tasks 

1816 if not os.path.exists(task_folder): 

1817 os.makedirs(task_folder) 

1818 

1819 if not data_file.is_file(): 

1820 for original_filename, new_filename in zip( 

1821 original_filenames, new_filenames 

1822 ): 

1823 with open( 

1824 data_folder / "original" / original_filename, 

1825 "rt", 

1826 encoding="utf-8", 

1827 ) as open_fp: 

1828 with open( 

1829 data_folder / task_setting / new_filename, "wt", encoding="utf-8" 

1830 ) as write_fp: 

1831 for line in open_fp: 

1832 line = line.rstrip() 

1833 fields = line.split('\t') 

1834 tweet = fields[0] 

1835 if task_setting == "fine_grained": 

1836 old_label = fields[2] 

1837 else: 

1838 old_label = fields[1] 

1839 new_label = '__label__' + old_label 

1840 write_fp.write(f"{new_label} {tweet}\n") 

1841 

1842 super(GERMEVAL_2018_OFFENSIVE_LANGUAGE, self).__init__( 

1843 data_folder=task_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs, 

1844 ) 
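
A minimal usage sketch of the two task settings handled above; it assumes that flair.datasets re-exports this class.

    from flair.datasets import GERMEVAL_2018_OFFENSIVE_LANGUAGE

    coarse = GERMEVAL_2018_OFFENSIVE_LANGUAGE()                          # OFFENSE vs. OTHER
    fine = GERMEVAL_2018_OFFENSIVE_LANGUAGE(fine_grained_classes=True)   # ABUSE, INSULT, PROFANITY, OTHER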

1845 

1846 

1847class COMMUNICATIVE_FUNCTIONS(ClassificationCorpus): 

1848 """ 

1849 The Communicative Functions Classification Corpus. 

1850 Classifying sentences from scientific papers into 39 communicative functions. 

1851 """ 

1852 

1853 def __init__(self, 

1854 base_path: Union[str, Path] = None, 

1855 memory_mode: str = 'full', 

1856 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(), 

1857 **corpusargs): 

1858 """ 

1859 Instantiates Communicative Functions Classification Corpus with 39 classes. 

1860 :param base_path: Provide this only if you store the Communicative Functions data in a specific folder, otherwise use default. 

1861 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) 

1862 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'. 

1863 :param corpusargs: Other args for ClassificationCorpus. 

1864 """ 

1865 

1866 if type(base_path) == str: 

1867 base_path: Path = Path(base_path) 

1868 

1869 # this dataset name 

1870 dataset_name = self.__class__.__name__.lower() 

1871 

1872 # default dataset folder is the cache root 

1873 if not base_path: 

1874 base_path = flair.cache_root / "datasets" 

1875 data_folder = base_path / dataset_name 

1876 

1877 original_filenames = ["background.tsv", "discussion.tsv", "introduction.tsv", "method.tsv", "result.tsv"] 

1878 

1879 # download data if necessary 

1880 comm_path = "https://raw.githubusercontent.com/Alab-NII/FECFevalDataset/master/sentences/" 

1881 

1882 for original_filename in original_filenames: 

1883 cached_path(f"{comm_path}{original_filename}", Path("datasets") / dataset_name / "original") 

1884 

1885 data_file = data_folder / "train.txt" 

1886 

1887 if not data_file.is_file(): # check if new file already exists 

1888 with open(data_folder / "train.txt", 'a+', encoding="utf-8") as write_fp: 

1889 for original_filename in original_filenames[:4]: 

1890 with open(data_folder / "original" / original_filename, 'rt', encoding="utf-8") as open_fp: 

1891 for line in open_fp: 

1892 liste = line.split('\t') 

1893 write_fp.write('__label__' + liste[0].replace(' ', '_') + ' ' + liste[2] + '\n') 

1894 with open(data_folder / "original" / "result.tsv", 'rt', encoding="utf-8") as open_fp: 

1895 for line in open_fp: 

1896 liste = line.split('\t') 

1897 if liste[0].split(' ')[-1] == "(again)": 

1898 write_fp.write('__label__' + liste[0][:-8].replace(' ', '_') + ' ' + liste[2] + '\n') 

1899 else: 

1900 write_fp.write('__label__' + liste[0].replace(' ', '_') + ' ' + liste[2] + '\n') 

1901 

1902 super(COMMUNICATIVE_FUNCTIONS, self).__init__( 

1903 data_folder, label_type='communicative_function', tokenizer=tokenizer, memory_mode=memory_mode, 

1904 **corpusargs, 

1905 ) 
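
A minimal sketch of the label normalization performed above: a trailing " (again)" marker (8 characters including the leading space) is stripped before the label is written. The example line is illustrative of the assumed tab-separated layout (label, intermediate column, sentence).

    line = "showing results (again)\tX\tOur model outperforms the baseline."
    liste = line.split('\t')
    label = liste[0][:-8] if liste[0].split(' ')[-1] == "(again)" else liste[0]
    print('__label__' + label.replace(' ', '_') + ' ' + liste[2])
    # -> __label__showing_results Our model outperforms the baseline.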

1906 

1907 

1908def _download_wassa_if_not_there(emotion, data_folder, dataset_name): 

1909 for split in ["train", "dev", "test"]: 

1910 

1911 data_file = data_folder / f"{emotion}-{split}.txt" 

1912 

1913 if not data_file.is_file(): 

1914 

1915 if split == "train": 

1916 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt" 

1917 if split == "dev": 

1918 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt" 

1919 if split == "test": 

1920 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt" 

1921 

1922 path = cached_path(url, Path("datasets") / dataset_name) 

1923 

1924 with open(path, "r", encoding="UTF-8") as f: 

1925 with open(data_file, "w", encoding="UTF-8") as out: 

1926 next(f) 

1927 for line in f: 

1928 fields = line.split("\t") 

1929 out.write(f"__label__{fields[3].rstrip()} {fields[1]}\n") 

1930 

1931 os.remove(path) 
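
A minimal sketch of the rewrite performed by _download_wassa_if_not_there for one raw EmoInt line; the tweet and score are illustrative, and the assumed raw layout is tab-separated: id, tweet, emotion, intensity score.

    line = "10001\tJust lost my keys again...\tanger\t0.479\n"
    fields = line.split("\t")
    print(f"__label__{fields[3].rstrip()} {fields[1]}")
    # -> __label__0.479 Just lost my keys again...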

1932 

1933 

1934class WASSA_ANGER(ClassificationCorpus): 

1935 """ 

1936 WASSA-2017 anger emotion-intensity dataset downloaded from and documented at 

1937 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html 

1938 """ 

1939 

1940 def __init__(self, 

1941 base_path: Union[str, Path] = None, 

1942 tokenizer: Tokenizer = SegtokTokenizer(), 

1943 **corpusargs): 

1944 """ 

1945 Instantiates WASSA-2017 anger emotion-intensity dataset 

1946 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default. 

1947 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

1948 :param corpusargs: Other args for ClassificationCorpus. 

1949 """ 

1950 

1951 if type(base_path) == str: 

1952 base_path: Path = Path(base_path) 

1953 

1954 # this dataset name 

1955 dataset_name = self.__class__.__name__.lower() 

1956 

1957 # default dataset folder is the cache root 

1958 if not base_path: 

1959 base_path = flair.cache_root / "datasets" 

1960 data_folder = base_path / dataset_name 

1961 

1962 # download data if necessary 

1963 _download_wassa_if_not_there("anger", data_folder, dataset_name) 

1964 

1965 super(WASSA_ANGER, self).__init__( 

1966 data_folder, tokenizer=tokenizer, **corpusargs, 

1967 ) 
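
A minimal usage sketch; it assumes that flair.datasets re-exports this class.

    from flair.datasets import WASSA_ANGER

    corpus = WASSA_ANGER()   # downloads the anger split of the WASSA-2017 data if necessary
    print(corpus)            # prints a summary of the train/dev/test sizes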

1968 

1969 

1970class WASSA_FEAR(ClassificationCorpus): 

1971 """ 

1972 WASSA-2017 fear emotion-intensity dataset downloaded from and documented at 

1973 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html 

1974 """ 

1975 

1976 def __init__(self, 

1977 base_path: Union[str, Path] = None, 

1978 tokenizer: Tokenizer = SegtokTokenizer(), 

1979 **corpusargs): 

1980 """ 

1981 Instantiates WASSA-2017 fear emotion-intensity dataset 

1982 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default. 

1983 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

1984 :param corpusargs: Other args for ClassificationCorpus. 

1985 """ 

1986 

1987 if type(base_path) == str: 

1988 base_path: Path = Path(base_path) 

1989 

1990 # this dataset name 

1991 dataset_name = self.__class__.__name__.lower() 

1992 

1993 # default dataset folder is the cache root 

1994 if not base_path: 

1995 base_path = flair.cache_root / "datasets" 

1996 data_folder = base_path / dataset_name 

1997 

1998 # download data if necessary 

1999 _download_wassa_if_not_there("fear", data_folder, dataset_name) 

2000 

2001 super(WASSA_FEAR, self).__init__( 

2002 data_folder, tokenizer=tokenizer, **corpusargs 

2003 ) 

2004 

2005 

2006class WASSA_JOY(ClassificationCorpus): 

2007 """ 

2008 WASSA-2017 joy emotion-intensity dataset downloaded from and documented at 

2009 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html 

2010 """ 

2011 

2012 def __init__(self, 

2013 base_path: Union[str, Path] = None, 

2014 tokenizer: Tokenizer = SegtokTokenizer(), 

2015 **corpusargs): 

2016 """ 

2017 Instantiates WASSA-2017 joy emotion-intensity dataset 

2018 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default. 

2019 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

2020 :param corpusargs: Other args for ClassificationCorpus. 

2021 """ 

2022 

2023 if type(base_path) == str: 

2024 base_path: Path = Path(base_path) 

2025 

2026 # this dataset name 

2027 dataset_name = self.__class__.__name__.lower() 

2028 

2029 # default dataset folder is the cache root 

2030 if not base_path: 

2031 base_path = flair.cache_root / "datasets" 

2032 data_folder = base_path / dataset_name 

2033 

2034 # download data if necessary 

2035 _download_wassa_if_not_there("joy", data_folder, dataset_name) 

2036 

2037 super(WASSA_JOY, self).__init__( 

2038 data_folder, tokenizer=tokenizer, **corpusargs, 

2039 ) 

2040 

2041 

2042class WASSA_SADNESS(ClassificationCorpus): 

2043 """ 

2044 WASSA-2017 sadness emotion-intensity dataset downloaded from and documented at 

2045 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html 

2046 """ 

2047 

2048 def __init__(self, 

2049 base_path: Union[str, Path] = None, 

2050 tokenizer: Tokenizer = SegtokTokenizer(), 

2051 **corpusargs): 

2052 """ 

2053 Instantiates WASSA-2017 sadness emotion-intensity dataset 

2054 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default. 

2055 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) 

2056 :param corpusargs: Other args for ClassificationCorpus. 

2057 """ 

2058 

2059 if type(base_path) == str: 

2060 base_path: Path = Path(base_path) 

2061 

2062 # this dataset name 

2063 dataset_name = self.__class__.__name__.lower() 

2064 

2065 # default dataset folder is the cache root 

2066 if not base_path: 

2067 base_path = flair.cache_root / "datasets" 

2068 data_folder = base_path / dataset_name 

2069 

2070 # download data if necessary 

2071 _download_wassa_if_not_there("sadness", data_folder, dataset_name) 

2072 

2073 super(WASSA_SADNESS, self).__init__( 

2074 data_folder, tokenizer=tokenizer, **corpusargs, 

2075 )