Coverage for flair/flair/data_fetcher.py: 0%


596 statements  

1import logging 

2import os 

3import re 

4 

5from deprecated import deprecated 

6from enum import Enum 

7from pathlib import Path 

8from typing import List, Dict, Union 

9 

10 

11import flair 

12from flair.data import ( 

13 Sentence, 

14 Corpus, 

15 Token, 

16 Tokenizer, 

17 MultiCorpus 

18) 

19from flair.tokenization import SegtokTokenizer, SpaceTokenizer 

20from flair.file_utils import cached_path 

21 

22log = logging.getLogger("flair") 

23 

24 

25class NLPTask(Enum): 

26 # conll 2000 column format 

27 CONLL_2000 = "conll_2000" 

28 

29 # conll 03 NER column format 

30 CONLL_03 = "conll_03" 

31 CONLL_03_GERMAN = "conll_03_german" 

32 CONLL_03_DUTCH = "conll_03_dutch" 

33 CONLL_03_SPANISH = "conll_03_spanish" 

34 

35 # WNUT-17 

36 WNUT_17 = "wnut_17" 

37 

38 # -- WikiNER datasets 

39 WIKINER_ENGLISH = "wikiner_english" 

40 WIKINER_GERMAN = "wikiner_german" 

41 WIKINER_FRENCH = "wikiner_french" 

42 WIKINER_SPANISH = "wikiner_spanish" 

43 WIKINER_ITALIAN = "wikiner_italian" 

44 WIKINER_DUTCH = "wikiner_dutch" 

45 WIKINER_POLISH = "wikiner_polish" 

46 WIKINER_PORTUGUESE = "wikiner_portuguese" 

47 WIKINER_RUSSIAN = "wikiner_russian" 

48 

49 # -- Universal Dependencies 

50 # Germanic 

51 UD_ENGLISH = "ud_english" 

52 UD_GERMAN = "ud_german" 

53 UD_DUTCH = "ud_dutch" 

54 # Romance 

55 UD_FRENCH = "ud_french" 

56 UD_ITALIAN = "ud_italian" 

57 UD_SPANISH = "ud_spanish" 

58 UD_PORTUGUESE = "ud_portuguese" 

59 UD_ROMANIAN = "ud_romanian" 

60 UD_CATALAN = "ud_catalan" 

61 # West-Slavic 

62 UD_POLISH = "ud_polish" 

63 UD_CZECH = "ud_czech" 

64 UD_SLOVAK = "ud_slovak" 

65 # South-Slavic 

66 UD_SLOVENIAN = "ud_slovenian" 

67 UD_CROATIAN = "ud_croatian" 

68 UD_SERBIAN = "ud_serbian" 

69 UD_BULGARIAN = "ud_bulgarian" 

70 # East-Slavic 

71 UD_RUSSIAN = "ud_russian" 

72 # Scandinavian 

73 UD_SWEDISH = "ud_swedish" 

74 UD_DANISH = "ud_danish" 

75 UD_NORWEGIAN = "ud_norwegian" 

76 UD_FINNISH = "ud_finnish" 

77 # Asian 

78 UD_ARABIC = "ud_arabic" 

79 UD_HEBREW = "ud_hebrew" 

80 UD_TURKISH = "ud_turkish" 

81 UD_PERSIAN = "ud_persian" 

82 UD_HINDI = "ud_hindi" 

83 UD_INDONESIAN = "ud_indonesian" 

84 UD_JAPANESE = "ud_japanese" 

85 UD_CHINESE = "ud_chinese" 

86 UD_KOREAN = "ud_korean" 

87 

88 # Language isolates 

89 UD_BASQUE = "ud_basque" 

90 

91 # recent Universal Dependencies 

92 UD_GERMAN_HDT = "ud_german_hdt" 

93 

94 # other datasets 

95 ONTONER = "ontoner" 

96 FASHION = "fashion" 

97 GERMEVAL = "germeval" 

98 SRL = "srl" 

99 WSD = "wsd" 

100 CONLL_12 = "conll_12" 

101 PENN = "penn" 

102 ONTONOTES = "ontonotes" 

103 NER_BASQUE = "eiec" 

104 

105 # text classification format 

106 IMDB = "imdb" 

107 AG_NEWS = "ag_news" 

108 TREC_6 = "trec-6" 

109 TREC_50 = "trec-50" 

110 

111 # text regression format 

112 REGRESSION = "regression" 

113 WASSA_ANGER = "wassa-anger" 

114 WASSA_FEAR = "wassa-fear" 

115 WASSA_JOY = "wassa-joy" 

116 WASSA_SADNESS = "wassa-sadness" 

117 

118 

119class NLPTaskDataFetcher: 

120 @staticmethod 

121 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

122 def load_corpora( 

123 tasks: List[Union[NLPTask, str]], base_path: Union[str, Path] = None 

124 ) -> MultiCorpus: 

125 return MultiCorpus( 

126 [NLPTaskDataFetcher.load_corpus(task, Path(base_path)) for task in tasks] 

127 ) 

128 

129 @staticmethod 

130 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

131 def load_corpus(task: Union[NLPTask, str], base_path: Union[str, Path] = None) -> Corpus: 

132 """ 

133 Helper function to fetch a Corpus for a specific NLPTask. For this to work, you first need to download

134 the corresponding NLP task data and put it into the appropriate folder structure. The tutorials on

135 https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this 

136 code to create your own data fetchers. 

137 :param task: specification of the NLPTask you wish to get 

138 :param base_path: path to data folder containing tasks sub folders 

139 :return: a Corpus consisting of train, dev and test data 

140 """ 

141 

142 # first, try to fetch dataset online 

143 if type(task) is NLPTask: 

144 NLPTaskDataFetcher.download_dataset(task) 

145 

146 # default dataset folder is the cache root 

147 if not base_path: 

148 base_path = flair.cache_root / "datasets" 

149 

150 if type(base_path) == str: 

151 base_path: Path = Path(base_path) 

152 

153 # get string value if enum is passed 

154 task = task.value if type(task) is NLPTask else task 

155 

156 data_folder = base_path / task.lower() 

157 

158 # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk) 

159 if task == NLPTask.CONLL_2000.value: 

160 columns = {0: "text", 1: "pos", 2: "np"} 

161 

162 return NLPTaskDataFetcher.load_column_corpus( 

163 data_folder, columns, tag_to_biloes="np" 

164 ) 

165 

166 # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag

167 if ( 

168 task == NLPTask.CONLL_03.value 

169 or task == NLPTask.ONTONER.value 

170 or task == NLPTask.FASHION.value 

171 ): 

172 columns = {0: "text", 1: "pos", 2: "np", 3: "ner"} 

173 

174 return NLPTaskDataFetcher.load_column_corpus( 

175 data_folder, columns, tag_to_biloes="ner" 

176 ) 

177 

178 # the CoNLL 03 task for German has an additional lemma column 

179 if task == NLPTask.CONLL_03_GERMAN.value: 

180 columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"} 

181 

182 return NLPTaskDataFetcher.load_column_corpus( 

183 data_folder, columns, tag_to_biloes="ner" 

184 ) 

185 

186 # the CoNLL 03 task for Dutch has no NP column 

187 if task == NLPTask.CONLL_03_DUTCH.value or task.startswith("wikiner"): 

188 columns = {0: "text", 1: "pos", 2: "ner"} 

189 

190 return NLPTaskDataFetcher.load_column_corpus( 

191 data_folder, columns, tag_to_biloes="ner" 

192 ) 

193 

194 # the CoNLL 03 task for Spanish only has two columns 

195 if task == NLPTask.CONLL_03_SPANISH.value or task == NLPTask.WNUT_17.value: 

196 columns = {0: "text", 1: "ner"} 

197 

198 return NLPTaskDataFetcher.load_column_corpus( 

199 data_folder, columns, tag_to_biloes="ner" 

200 ) 

201 

202 # the GERMEVAL task only has two columns: text and ner 

203 if task == NLPTask.GERMEVAL.value: 

204 columns = {1: "text", 2: "ner"} 

205 

206 return NLPTaskDataFetcher.load_column_corpus( 

207 data_folder, columns, tag_to_biloes="ner" 

208 ) 

209 

210 # WSD tasks may be put into this column format 

211 if task == NLPTask.WSD.value: 

212 columns = {0: "text", 1: "lemma", 2: "pos", 3: "sense"} 

213 return NLPTaskDataFetcher.load_column_corpus( 

214 data_folder, 

215 columns, 

216 train_file="semcor.tsv", 

217 test_file="semeval2015.tsv", 

218 ) 

219 

220 # the UD corpora follow the CoNLL-U format, for which we have a special reader 

221 if task.startswith("ud_") or task in [ 

222 NLPTask.ONTONOTES.value, 

223 NLPTask.CONLL_12.value, 

224 NLPTask.PENN.value, 

225 ]: 

226 return NLPTaskDataFetcher.load_ud_corpus(data_folder) 

227 

228 # for text classifiers, we use our own special format 

229 if task in [ 

230 NLPTask.IMDB.value, 

231 NLPTask.AG_NEWS.value, 

232 NLPTask.TREC_6.value, 

233 NLPTask.TREC_50.value, 

234 NLPTask.REGRESSION.value, 

235 ]: 

236 tokenizer: Tokenizer = SpaceTokenizer() if task in [ 

237 NLPTask.TREC_6.value, 

238 NLPTask.TREC_50.value, 

239 ] else SegtokTokenizer() 

240 

241 return NLPTaskDataFetcher.load_classification_corpus( 

242 data_folder, tokenizer=tokenizer 

243 ) 

244 

245 # NER corpus for Basque 

246 if task == NLPTask.NER_BASQUE.value: 

247 columns = {0: "text", 1: "ner"} 

248 return NLPTaskDataFetcher.load_column_corpus( 

249 data_folder, columns, tag_to_biloes="ner" 

250 ) 

251 

252 if task.startswith("wassa"): 

253 return NLPTaskDataFetcher.load_classification_corpus( 

254 data_folder, tokenizer=SegtokTokenizer() 

255 ) 

256 
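# Usage sketch (illustration only, not part of data_fetcher.py): fetch a corpus for a
# task that download_dataset can obtain automatically. Assumes network access and the
# default cache location; the whole class is deprecated in favour of flair.datasets.
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher

corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
print(corpus)  # Corpus with train, dev and test splits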

257 @staticmethod 

258 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

259 def load_column_corpus( 

260 data_folder: Union[str, Path], 

261 column_format: Dict[int, str], 

262 train_file=None, 

263 test_file=None, 

264 dev_file=None, 

265 tag_to_biloes=None, 

266 ) -> Corpus: 

267 """ 

268 Helper function to get a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 

269 

270 :param data_folder: base folder with the task data 

271 :param column_format: a map specifying the column format 

272 :param train_file: the name of the train file 

273 :param test_file: the name of the test file 

274 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

275 :param tag_to_biloes: whether to convert to BILOES tagging scheme 

276 :return: a Corpus with annotated train, dev and test data 

277 """ 

278 

279 if type(data_folder) == str: 

280 data_folder: Path = Path(data_folder) 

281 

282 if train_file is not None: 

283 train_file = data_folder / train_file 

284 if test_file is not None: 

285 test_file = data_folder / test_file 

286 if dev_file is not None: 

287 dev_file = data_folder / dev_file 

288 

289 # automatically identify train / test / dev files 

290 if train_file is None: 

291 for file in data_folder.iterdir(): 

292 file_name = file.name 

293 if file_name.endswith(".gz"): 

294 continue 

295 if "train" in file_name and "54019" not in file_name:

296 train_file = file 

297 if "dev" in file_name: 

298 dev_file = file 

299 if "testa" in file_name: 

300 dev_file = file 

301 if "testb" in file_name: 

302 test_file = file 

303 

304 # if no test file is found, take any file with 'test' in name 

305 if test_file is None: 

306 for file in data_folder.iterdir(): 

307 file_name = file.name 

308 if file_name.endswith(".gz"): 

309 continue 

310 if "test" in file_name: 

311 test_file = file 

312 

313 log.info("Reading data from {}".format(data_folder)) 

314 log.info("Train: {}".format(train_file)) 

315 log.info("Dev: {}".format(dev_file)) 

316 log.info("Test: {}".format(test_file)) 

317 

318 # get train and test data 

319 sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data( 

320 train_file, column_format 

321 ) 

322 

323 # read in test file if exists, otherwise sample 10% of train data as test dataset 

324 if test_file is not None: 

325 sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data( 

326 test_file, column_format 

327 ) 

328 else: 

329 sentences_test: List[Sentence] = [ 

330 sentences_train[i] 

331 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1) 

332 ] 

333 sentences_train = [x for x in sentences_train if x not in sentences_test] 

334 

335 # read in dev file if exists, otherwise sample 10% of train data as dev dataset 

336 if dev_file is not None: 

337 sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data( 

338 dev_file, column_format 

339 ) 

340 else: 

341 sentences_dev: List[Sentence] = [ 

342 sentences_train[i] 

343 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1) 

344 ] 

345 sentences_train = [x for x in sentences_train if x not in sentences_dev] 

346 

347 if tag_to_biloes is not None: 

348 # convert tag scheme to iobes 

349 for sentence in sentences_train + sentences_test + sentences_dev: 

350 sentence.convert_tag_scheme( 

351 tag_type=tag_to_biloes, target_scheme="iobes" 

352 ) 

353 

354 return Corpus( 

355 sentences_train, sentences_dev, sentences_test, name=data_folder.name 

356 ) 

357 
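# Usage sketch (illustration only): read a CoNLL-style folder by describing its columns.
# The folder path below is an assumption; the column map mirrors the CoNLL-03 handling
# in load_corpus above.
from flair.data_fetcher import NLPTaskDataFetcher

columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
column_corpus = NLPTaskDataFetcher.load_column_corpus(
    "resources/tasks/my_ner",  # hypothetical data folder with train/dev/test files
    columns,
    tag_to_biloes="ner",       # convert NER tags to the BIOES scheme
)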

358 @staticmethod 

359 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

360 def load_ud_corpus( 

361 data_folder: Union[str, Path], train_file=None, test_file=None, dev_file=None 

362 ) -> Corpus: 

363 """ 

364 Helper function to get a Corpus from CoNLL-U column-formatted task data such as the UD corpora 

365 

366 :param data_folder: base folder with the task data 

367 :param train_file: the name of the train file 

368 :param test_file: the name of the test file 

369 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

370 :return: a Corpus with annotated train, dev and test data 

371 """ 

372 # automatically identify train / test / dev files 

373 if train_file is None: 

374 for file in data_folder.iterdir(): 

375 file_name = file.name 

376 if "train" in file_name: 

377 train_file = file 

378 if "test" in file_name: 

379 test_file = file 

380 if "dev" in file_name: 

381 dev_file = file 

382 if "testa" in file_name: 

383 dev_file = file 

384 if "testb" in file_name: 

385 test_file = file 

386 

387 log.info("Reading data from {}".format(data_folder)) 

388 log.info("Train: {}".format(train_file)) 

389 log.info("Dev: {}".format(dev_file)) 

390 log.info("Test: {}".format(test_file)) 

391 

392 sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file) 

393 sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file) 

394 sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file) 

395 

396 return Corpus( 

397 sentences_train, sentences_dev, sentences_test, name=data_folder.name 

398 ) 

399 
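# Usage sketch (illustration only): point load_ud_corpus at a folder of CoNLL-U files
# whose names contain "train", "dev"/"testa" and "test"/"testb"; the path is an assumption.
from pathlib import Path
from flair.data_fetcher import NLPTaskDataFetcher

ud_corpus = NLPTaskDataFetcher.load_ud_corpus(Path("resources/tasks/ud_english"))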

400 @staticmethod 

401 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

402 def load_classification_corpus( 

403 data_folder: Union[str, Path], 

404 train_file=None, 

405 test_file=None, 

406 dev_file=None, 

407 tokenizer: Tokenizer = SegtokTokenizer(), 

408 max_tokens_per_doc=-1, 

409 ) -> Corpus: 

410 """ 

411 Helper function to get a Corpus from text classification-formatted task data 

412 

413 :param data_folder: base folder with the task data 

414 :param train_file: the name of the train file 

415 :param test_file: the name of the test file 

416 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

417 :param tokenizer: Custom tokenizer to use (default SegtokTokenizer) 

418 :return: a Corpus with annotated train, dev and test data 

419 """ 

420 

421 if type(data_folder) == str: 

422 data_folder: Path = Path(data_folder) 

423 

424 if train_file is not None: 

425 train_file = data_folder / train_file 

426 if test_file is not None: 

427 test_file = data_folder / test_file 

428 if dev_file is not None: 

429 dev_file = data_folder / dev_file 

430 

431 # automatically identify train / test / dev files 

432 if train_file is None: 

433 for file in data_folder.iterdir(): 

434 file_name = file.name 

435 if "train" in file_name: 

436 train_file = file 

437 if "test" in file_name: 

438 test_file = file 

439 if "dev" in file_name: 

440 dev_file = file 

441 if "testa" in file_name: 

442 dev_file = file 

443 if "testb" in file_name: 

444 test_file = file 

445 

446 log.info("Reading data from {}".format(data_folder)) 

447 log.info("Train: {}".format(train_file)) 

448 log.info("Dev: {}".format(dev_file)) 

449 log.info("Test: {}".format(test_file)) 

450 

451 sentences_train: List[ 

452 Sentence 

453 ] = NLPTaskDataFetcher.read_text_classification_file( 

454 train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc 

455 ) 

456 sentences_test: List[ 

457 Sentence 

458 ] = NLPTaskDataFetcher.read_text_classification_file( 

459 test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc 

460 ) 

461 

462 if dev_file is not None: 

463 sentences_dev: List[ 

464 Sentence 

465 ] = NLPTaskDataFetcher.read_text_classification_file( 

466 dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc 

467 ) 

468 else: 

469 sentences_dev: List[Sentence] = [ 

470 sentences_train[i] 

471 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1) 

472 ] 

473 sentences_train = [x for x in sentences_train if x not in sentences_dev] 

474 

475 return Corpus(sentences_train, sentences_dev, sentences_test) 

476 
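# Usage sketch (illustration only): load a FastText-style classification folder. The
# folder path and parameter values are assumptions; files named train/dev/test are
# picked up automatically.
from flair.data_fetcher import NLPTaskDataFetcher
from flair.tokenization import SegtokTokenizer

clf_corpus = NLPTaskDataFetcher.load_classification_corpus(
    "resources/tasks/my_classification",  # hypothetical data folder
    tokenizer=SegtokTokenizer(),
    max_tokens_per_doc=512,               # truncate long documents
)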

477 @staticmethod 

478 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

479 def read_text_classification_file( 

480 path_to_file: Union[str, Path], 

481 max_tokens_per_doc=-1, 

482 tokenizer: Tokenizer = SegtokTokenizer(), 

483 ) -> List[Sentence]: 

484 """ 

485 Reads a data file for text classification. The file should contain one document/text per line. 

486 The line should have the following format: 

487 __label__<class_name> <text> 

488 If you have a multi-class task, you can have as many labels as you want at the beginning of the line, e.g.,

489 __label__<class_name_1> __label__<class_name_2> <text> 

490 :param path_to_file: the path to the data file 

491 :param max_tokens_per_doc: Takes at most this number of tokens per document. If set to -1, all documents are taken as is.

492 :param tokenizer: Custom tokenizer to use to prepare the data set (default SegtokTokenizer) 

493 :return: list of sentences 

494 """ 

495 label_prefix = "__label__" 

496 sentences = [] 

497 

498 with open(str(path_to_file), encoding="utf-8") as f: 

499 for line in f: 

500 words = line.split() 

501 

502 labels = [] 

503 l_len = 0 

504 

505 for i in range(len(words)): 

506 if words[i].startswith(label_prefix): 

507 l_len += len(words[i]) + 1 

508 label = words[i].replace(label_prefix, "") 

509 labels.append(label) 

510 else: 

511 break 

512 

513 text = line[l_len:].strip() 

514 

515 if text and labels: 

516 sentence = Sentence(text, labels=labels, use_tokenizer=tokenizer) 

517 if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0: 

518 sentence.tokens = sentence.tokens[:max_tokens_per_doc] 

519 if len(sentence.tokens) > 0: 

520 sentences.append(sentence) 

521 

522 return sentences 

523 
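# Usage sketch (illustration only): a file in the __label__ format described in the
# docstring, e.g. one document per line such as
#   __label__POSITIVE great value for the money
# can be read into labelled Sentences like this (the file path is an assumption):
from flair.data_fetcher import NLPTaskDataFetcher

labelled_sentences = NLPTaskDataFetcher.read_text_classification_file(
    "resources/tasks/my_classification/train.txt"
)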

524 @staticmethod 

525 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

526 def read_column_data( 

527 path_to_column_file: Union[str, Path], 

528 column_name_map: Dict[int, str], 

529 infer_whitespace_after: bool = True, 

530 ): 

531 """ 

532 Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the

533 column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you 

534 specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third 

535 the chunk and the fourth the NER tag.

536 :param path_to_column_file: the path to the column file 

537 :param column_name_map: a map of column number to token annotation name 

538 :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token 

539 :return: list of sentences 

540 """ 

541 sentences: List[Sentence] = [] 

542 

543 try: 

544 lines: List[str] = open( 

545 str(path_to_column_file), encoding="utf-8" 

546 ).read().strip().split("\n") 

547 except: 

548 log.info( 

549 'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format( 

550 path_to_column_file 

551 ) 

552 ) 

553 lines: List[str] = open( 

554 str(path_to_column_file), encoding="latin1" 

555 ).read().strip().split("\n") 

556 

557 # most data sets have the token text in the first column, if not, pass 'text' as column 

558 text_column: int = 0 

559 for column in column_name_map: 

560 if column_name_map[column] == "text": 

561 text_column = column 

562 

563 sentence: Sentence = Sentence() 

564 for line in lines: 

565 

566 if line.startswith("#"): 

567 continue 

568 

569 if line.strip().replace("\ufeff", "") == "":

570 if len(sentence) > 0: 

571 sentence.infer_space_after() 

572 sentences.append(sentence) 

573 sentence: Sentence = Sentence() 

574 

575 else: 

576 fields: List[str] = re.split(r"\s+", line) 

577 token = Token(fields[text_column]) 

578 for column in column_name_map: 

579 if len(fields) > column: 

580 if column != text_column: 

581 token.add_tag(column_name_map[column], fields[column]) 

582 

583 sentence.add_token(token) 

584 

585 if len(sentence.tokens) > 0: 

586 sentence.infer_space_after() 

587 sentences.append(sentence) 

588 

589 return sentences 

590 
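# Usage sketch (illustration only): read a single column-formatted file with the same
# column map used for CoNLL-03 above (the file path is an assumption).
from flair.data_fetcher import NLPTaskDataFetcher

train_sentences = NLPTaskDataFetcher.read_column_data(
    "resources/tasks/conll_03/eng.train",
    {0: "text", 1: "pos", 2: "np", 3: "ner"},
)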

591 @staticmethod 

592 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") 

593 def read_conll_ud(path_to_conll_file: Union[str, Path]) -> List[Sentence]: 

594 """ 

595 Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation 

596 :param path_to_conll_file: the path to the conll-u file 

597 :return: list of sentences 

598 """ 

599 sentences: List[Sentence] = [] 

600 

601 lines: List[str] = open( 

602 path_to_conll_file, encoding="utf-8" 

603 ).read().strip().split("\n") 

604 

605 sentence: Sentence = Sentence() 

606 for line in lines: 

607 

608 fields: List[str] = re.split("\t+", line) 

609 if line == "": 

610 if len(sentence) > 0: 

611 sentences.append(sentence) 

612 sentence: Sentence = Sentence() 

613 

614 elif line.startswith("#"): 

615 continue 

616 elif "." in fields[0]: 

617 continue 

618 elif "-" in fields[0]: 

619 continue 

620 else: 

621 token = Token(fields[1], head_id=int(fields[6])) 

622 token.add_tag("lemma", str(fields[2])) 

623 token.add_tag("upos", str(fields[3])) 

624 token.add_tag("pos", str(fields[4])) 

625 token.add_tag("dependency", str(fields[7])) 

626 

627 for morph in str(fields[5]).split("|"): 

628 if "=" not in morph:

629 continue 

630 token.add_tag(morph.split("=")[0].lower(), morph.split("=")[1]) 

631 

632 if len(fields) > 10 and str(fields[10]) == "Y": 

633 token.add_tag("frame", str(fields[11])) 

634 

635 sentence.add_token(token) 

636 

637 if len(sentence.tokens) > 0: 

638 sentences.append(sentence) 

639 

640 return sentences 

641 
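# Usage sketch (illustration only): read one CoNLL-U file into Sentences with lemma,
# upos, pos, morphology and dependency tags (the file path is an assumption).
from flair.data_fetcher import NLPTaskDataFetcher

ud_sentences = NLPTaskDataFetcher.read_conll_ud(
    "resources/tasks/ud_english/en_ewt-ud-dev.conllu"
)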

642 @staticmethod 

643 def __sample(total_number_of_sentences: int, percentage: float = 0.1) -> List[int]: 

644 import random 

645 

646 sample_size: int = round(total_number_of_sentences * percentage) 

647 sample = random.sample(range(1, total_number_of_sentences), sample_size) 

648 return sample 

649 

650 @staticmethod 

651 def download_dataset(task: NLPTask): 

652 

653 # conll 2000 chunking task 

654 if task == NLPTask.CONLL_2000: 

655 conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" 

656 data_file = flair.cache_root / "datasets" / task.value / "train.txt" 

657 if not data_file.is_file(): 

658 cached_path( 

659 f"{conll_2000_path}train.txt.gz", Path("datasets") / task.value 

660 ) 

661 cached_path( 

662 f"{conll_2000_path}test.txt.gz", Path("datasets") / task.value 

663 ) 

664 import gzip, shutil 

665 

666 with gzip.open( 

667 flair.cache_root / "datasets" / task.value / "train.txt.gz", 

668 "rb", 

669 ) as f_in: 

670 with open( 

671 flair.cache_root / "datasets" / task.value / "train.txt", 

672 "wb", 

673 ) as f_out: 

674 shutil.copyfileobj(f_in, f_out) 

675 with gzip.open( 

676 flair.cache_root / "datasets" / task.value / "test.txt.gz", 

677 "rb", 

678 ) as f_in: 

679 with open( 

680 flair.cache_root / "datasets" / task.value / "test.txt", 

681 "wb", 

682 ) as f_out: 

683 shutil.copyfileobj(f_in, f_out) 

684 

685 if task == NLPTask.NER_BASQUE: 

686 ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" 

687 data_path = flair.cache_root / "datasets" / task.value 

688 data_file = data_path / "named_ent_eu.train" 

689 if not data_file.is_file(): 

690 cached_path( 

691 f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / task.value 

692 ) 

693 import tarfile, shutil 

694 

695 with tarfile.open( 

696 flair.cache_root / "datasets" / task.value / "eiec_v1.0.tgz", 

697 "r:gz", 

698 ) as f_in: 

699 corpus_files = ( 

700 "eiec_v1.0/named_ent_eu.train", 

701 "eiec_v1.0/named_ent_eu.test", 

702 ) 

703 for corpus_file in corpus_files: 

704 f_in.extract(corpus_file, data_path) 

705 shutil.move(f"{data_path}/{corpus_file}", data_path) 

706 

707 if task == NLPTask.IMDB: 

708 imdb_acl_path = ( 

709 "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" 

710 ) 

711 data_path = flair.cache_root / "datasets" / task.value 

712 data_file = data_path / "train.txt" 

713 if not data_file.is_file(): 

714 cached_path(imdb_acl_path, Path("datasets") / task.value) 

715 import tarfile 

716 

717 with tarfile.open( 

718 flair.cache_root 

719 / "datasets" 

720 / task.value 

721 / "aclImdb_v1.tar.gz", 

722 "r:gz", 

723 ) as f_in: 

724 datasets = ["train", "test"] 

725 labels = ["pos", "neg"] 

726 

727 for label in labels: 

728 for dataset in datasets: 

729 f_in.extractall( 

730 data_path, 

731 members=[ 

732 m 

733 for m in f_in.getmembers() 

734 if f"{dataset}/{label}" in m.name 

735 ], 

736 ) 

737 with open(f"{data_path}/{dataset}.txt", "at") as f_p: 

738 current_path = data_path / "aclImdb" / dataset / label 

739 for file_name in current_path.iterdir(): 

740 if file_name.is_file() and file_name.name.endswith( 

741 ".txt" 

742 ): 

743 f_p.write( 

744 f"__label__{label} " 

745 + file_name.open( 

746 "rt", encoding="utf-8" 

747 ).read() 

748 + "\n" 

749 ) 

750 

751 # Support both TREC-6 and TREC-50 

752 if task.value.startswith("trec"): 

753 trec_path = "http://cogcomp.org/Data/QA/QC/" 

754 

755 original_filenames = ["train_5500.label", "TREC_10.label"] 

756 new_filenames = ["train.txt", "test.txt"] 

757 for original_filename in original_filenames: 

758 cached_path( 

759 f"{trec_path}{original_filename}", 

760 Path("datasets") / task.value / "original", 

761 ) 

762 

763 data_path = flair.cache_root / "datasets" / task.value 

764 data_file = data_path / new_filenames[0] 

765 

766 if not data_file.is_file(): 

767 for original_filename, new_filename in zip( 

768 original_filenames, new_filenames 

769 ): 

770 with open( 

771 data_path / "original" / original_filename, 

772 "rt", 

773 encoding="latin1", 

774 ) as open_fp: 

775 with open( 

776 data_path / new_filename, "wt", encoding="utf-8" 

777 ) as write_fp: 

778 for line in open_fp: 

779 line = line.rstrip() 

780 fields = line.split() 

781 old_label = fields[0] 

782 question = " ".join(fields[1:]) 

783 

784 # Create flair compatible labels 

785 # TREC-6 : NUM:dist -> __label__NUM 

786 # TREC-50: NUM:dist -> __label__NUM:dist 

787 new_label = "__label__" 

788 new_label += ( 

789 old_label.split(":")[0] 

790 if task.value == "trec-6" 

791 else old_label 

792 ) 

793 

794 write_fp.write(f"{new_label} {question}\n") 

795 

796 if task == NLPTask.WNUT_17: 

797 wnut_path = "https://noisy-text.github.io/2017/files/" 

798 cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / task.value) 

799 cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / task.value) 

800 cached_path( 

801 f"{wnut_path}emerging.test.annotated", Path("datasets") / task.value 

802 ) 

803 

804 # Wikiner NER task 

805 wikiner_path = ( 

806 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" 

807 ) 

808 if task.value.startswith("wikiner"): 

809 lc = "" 

810 if task == NLPTask.WIKINER_ENGLISH: 

811 lc = "en" 

812 if task == NLPTask.WIKINER_GERMAN: 

813 lc = "de" 

814 if task == NLPTask.WIKINER_DUTCH: 

815 lc = "nl" 

816 if task == NLPTask.WIKINER_FRENCH: 

817 lc = "fr" 

818 if task == NLPTask.WIKINER_ITALIAN: 

819 lc = "it" 

820 if task == NLPTask.WIKINER_SPANISH: 

821 lc = "es" 

822 if task == NLPTask.WIKINER_PORTUGUESE: 

823 lc = "pt" 

824 if task == NLPTask.WIKINER_POLISH: 

825 lc = "pl" 

826 if task == NLPTask.WIKINER_RUSSIAN: 

827 lc = "ru" 

828 

829 data_file = ( 

830 flair.cache_root 

831 / "datasets" 

832 / task.value 

833 / f"aij-wikiner-{lc}-wp3.train" 

834 ) 

835 if not data_file.is_file(): 

836 

837 cached_path( 

838 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", 

839 Path("datasets") / task.value, 

840 ) 

841 import bz2, shutil 

842 

843 # unpack and write out in CoNLL column-like format 

844 bz_file = bz2.BZ2File( 

845 flair.cache_root 

846 / "datasets" 

847 / task.value 

848 / f"aij-wikiner-{lc}-wp3.bz2", 

849 "rb", 

850 ) 

851 with bz_file as f, open( 

852 flair.cache_root 

853 / "datasets" 

854 / task.value 

855 / f"aij-wikiner-{lc}-wp3.train", 

856 "w", 

857 ) as out: 

858 for line in f: 

859 line = line.decode("utf-8") 

860 words = line.split(" ") 

861 for word in words: 

862 out.write("\t".join(word.split("|")) + "\n") 

863 

864 # CoNLL 02/03 NER 

865 conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" 

866 if task == NLPTask.CONLL_03_DUTCH: 

867 cached_path(f"{conll_02_path}ned.testa", Path("datasets") / task.value) 

868 cached_path(f"{conll_02_path}ned.testb", Path("datasets") / task.value) 

869 cached_path(f"{conll_02_path}ned.train", Path("datasets") / task.value) 

870 if task == NLPTask.CONLL_03_SPANISH: 

871 cached_path(f"{conll_02_path}esp.testa", Path("datasets") / task.value) 

872 cached_path(f"{conll_02_path}esp.testb", Path("datasets") / task.value) 

873 cached_path(f"{conll_02_path}esp.train", Path("datasets") / task.value) 

874 

875 # universal dependencies 

876 ud_path = "https://raw.githubusercontent.com/UniversalDependencies/" 

877 # --- UD Germanic 

878 if task == NLPTask.UD_ENGLISH: 

879 cached_path( 

880 f"{ud_path}UD_English-EWT/master/en_ewt-ud-dev.conllu", 

881 Path("datasets") / task.value, 

882 ) 

883 cached_path( 

884 f"{ud_path}UD_English-EWT/master/en_ewt-ud-test.conllu", 

885 Path("datasets") / task.value, 

886 ) 

887 cached_path( 

888 f"{ud_path}UD_English-EWT/master/en_ewt-ud-train.conllu", 

889 Path("datasets") / task.value, 

890 ) 

891 

892 if task == NLPTask.UD_GERMAN: 

893 cached_path( 

894 f"{ud_path}UD_German-GSD/master/de_gsd-ud-dev.conllu", 

895 Path("datasets") / task.value, 

896 ) 

897 cached_path( 

898 f"{ud_path}UD_German-GSD/master/de_gsd-ud-test.conllu", 

899 Path("datasets") / task.value, 

900 ) 

901 cached_path( 

902 f"{ud_path}UD_German-GSD/master/de_gsd-ud-train.conllu", 

903 Path("datasets") / task.value, 

904 ) 

905 

906 if task == NLPTask.UD_DUTCH: 

907 cached_path( 

908 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-dev.conllu", 

909 Path("datasets") / task.value, 

910 ) 

911 cached_path( 

912 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-test.conllu", 

913 Path("datasets") / task.value, 

914 ) 

915 cached_path( 

916 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-train.conllu", 

917 Path("datasets") / task.value, 

918 ) 

919 

920 # --- UD Romance 

921 if task == NLPTask.UD_FRENCH: 

922 cached_path( 

923 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-dev.conllu", 

924 Path("datasets") / task.value, 

925 ) 

926 cached_path( 

927 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-test.conllu", 

928 Path("datasets") / task.value, 

929 ) 

930 cached_path( 

931 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-train.conllu", 

932 Path("datasets") / task.value, 

933 ) 

934 

935 if task == NLPTask.UD_ITALIAN: 

936 cached_path( 

937 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-dev.conllu", 

938 Path("datasets") / task.value, 

939 ) 

940 cached_path( 

941 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-test.conllu", 

942 Path("datasets") / task.value, 

943 ) 

944 cached_path( 

945 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-train.conllu", 

946 Path("datasets") / task.value, 

947 ) 

948 

949 if task == NLPTask.UD_SPANISH: 

950 cached_path( 

951 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-dev.conllu", 

952 Path("datasets") / task.value, 

953 ) 

954 cached_path( 

955 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-test.conllu", 

956 Path("datasets") / task.value, 

957 ) 

958 cached_path( 

959 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-train.conllu", 

960 Path("datasets") / task.value, 

961 ) 

962 

963 if task == NLPTask.UD_PORTUGUESE: 

964 cached_path( 

965 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-dev.conllu", 

966 Path("datasets") / task.value, 

967 ) 

968 cached_path( 

969 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-test.conllu", 

970 Path("datasets") / task.value, 

971 ) 

972 cached_path( 

973 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-train.conllu", 

974 Path("datasets") / task.value, 

975 ) 

976 

977 if task == NLPTask.UD_ROMANIAN: 

978 cached_path( 

979 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-dev.conllu", 

980 Path("datasets") / task.value, 

981 ) 

982 cached_path( 

983 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-test.conllu", 

984 Path("datasets") / task.value, 

985 ) 

986 cached_path( 

987 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-train.conllu", 

988 Path("datasets") / task.value, 

989 ) 

990 

991 if task == NLPTask.UD_CATALAN: 

992 cached_path( 

993 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-dev.conllu", 

994 Path("datasets") / task.value, 

995 ) 

996 cached_path( 

997 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-test.conllu", 

998 Path("datasets") / task.value, 

999 ) 

1000 cached_path( 

1001 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-train.conllu", 

1002 Path("datasets") / task.value, 

1003 ) 

1004 

1005 # --- UD West-Slavic 

1006 if task == NLPTask.UD_POLISH: 

1007 cached_path( 

1008 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-dev.conllu", 

1009 Path("datasets") / task.value, 

1010 ) 

1011 cached_path( 

1012 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-test.conllu", 

1013 Path("datasets") / task.value, 

1014 ) 

1015 cached_path( 

1016 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-train.conllu", 

1017 Path("datasets") / task.value, 

1018 ) 

1019 

1020 if task == NLPTask.UD_CZECH: 

1021 cached_path( 

1022 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-dev.conllu", 

1023 Path("datasets") / task.value, 

1024 ) 

1025 cached_path( 

1026 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-test.conllu", 

1027 Path("datasets") / task.value, 

1028 ) 

1029 cached_path( 

1030 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-train-l.conllu", 

1031 Path("datasets") / task.value, 

1032 ) 

1033 

1034 if task == NLPTask.UD_SLOVAK: 

1035 cached_path( 

1036 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-dev.conllu", 

1037 Path("datasets") / task.value, 

1038 ) 

1039 cached_path( 

1040 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-test.conllu", 

1041 Path("datasets") / task.value, 

1042 ) 

1043 cached_path( 

1044 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-train.conllu", 

1045 Path("datasets") / task.value, 

1046 ) 

1047 

1048 # --- UD Scandinavian 

1049 if task == NLPTask.UD_SWEDISH: 

1050 cached_path( 

1051 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-dev.conllu", 

1052 Path("datasets") / task.value, 

1053 ) 

1054 cached_path( 

1055 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-test.conllu", 

1056 Path("datasets") / task.value, 

1057 ) 

1058 cached_path( 

1059 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-train.conllu", 

1060 Path("datasets") / task.value, 

1061 ) 

1062 

1063 if task == NLPTask.UD_DANISH: 

1064 cached_path( 

1065 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-dev.conllu", 

1066 Path("datasets") / task.value, 

1067 ) 

1068 cached_path( 

1069 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-test.conllu", 

1070 Path("datasets") / task.value, 

1071 ) 

1072 cached_path( 

1073 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-train.conllu", 

1074 Path("datasets") / task.value, 

1075 ) 

1076 

1077 if task == NLPTask.UD_NORWEGIAN: 

1078 cached_path( 

1079 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-dev.conllu", 

1080 Path("datasets") / task.value, 

1081 ) 

1082 cached_path( 

1083 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-test.conllu", 

1084 Path("datasets") / task.value, 

1085 ) 

1086 cached_path( 

1087 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-train.conllu", 

1088 Path("datasets") / task.value, 

1089 ) 

1090 

1091 if task == NLPTask.UD_FINNISH: 

1092 cached_path( 

1093 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-dev.conllu", 

1094 Path("datasets") / task.value, 

1095 ) 

1096 cached_path( 

1097 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-test.conllu", 

1098 Path("datasets") / task.value, 

1099 ) 

1100 cached_path( 

1101 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-train.conllu", 

1102 Path("datasets") / task.value, 

1103 ) 

1104 

1105 # --- UD South-Slavic 

1106 if task == NLPTask.UD_SLOVENIAN: 

1107 cached_path( 

1108 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-dev.conllu", 

1109 Path("datasets") / task.value, 

1110 ) 

1111 cached_path( 

1112 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-test.conllu", 

1113 Path("datasets") / task.value, 

1114 ) 

1115 cached_path( 

1116 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-train.conllu", 

1117 Path("datasets") / task.value, 

1118 ) 

1119 

1120 if task == NLPTask.UD_CROATIAN: 

1121 cached_path( 

1122 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-dev.conllu", 

1123 Path("datasets") / task.value, 

1124 ) 

1125 cached_path( 

1126 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-test.conllu", 

1127 Path("datasets") / task.value, 

1128 ) 

1129 cached_path( 

1130 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-train.conllu", 

1131 Path("datasets") / task.value, 

1132 ) 

1133 

1134 if task == NLPTask.UD_SERBIAN: 

1135 cached_path( 

1136 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-dev.conllu", 

1137 Path("datasets") / task.value, 

1138 ) 

1139 cached_path( 

1140 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-test.conllu", 

1141 Path("datasets") / task.value, 

1142 ) 

1143 cached_path( 

1144 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-train.conllu", 

1145 Path("datasets") / task.value, 

1146 ) 

1147 

1148 if task == NLPTask.UD_BULGARIAN: 

1149 cached_path( 

1150 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-dev.conllu", 

1151 Path("datasets") / task.value, 

1152 ) 

1153 cached_path( 

1154 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-test.conllu", 

1155 Path("datasets") / task.value, 

1156 ) 

1157 cached_path( 

1158 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-train.conllu", 

1159 Path("datasets") / task.value, 

1160 ) 

1161 

1162 # --- UD Asian 

1163 if task == NLPTask.UD_ARABIC: 

1164 cached_path( 

1165 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-dev.conllu", 

1166 Path("datasets") / task.value, 

1167 ) 

1168 cached_path( 

1169 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-test.conllu", 

1170 Path("datasets") / task.value, 

1171 ) 

1172 cached_path( 

1173 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-train.conllu", 

1174 Path("datasets") / task.value, 

1175 ) 

1176 

1177 if task == NLPTask.UD_HEBREW: 

1178 cached_path( 

1179 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-dev.conllu", 

1180 Path("datasets") / task.value, 

1181 ) 

1182 cached_path( 

1183 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-test.conllu", 

1184 Path("datasets") / task.value, 

1185 ) 

1186 cached_path( 

1187 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-train.conllu", 

1188 Path("datasets") / task.value, 

1189 ) 

1190 

1191 if task == NLPTask.UD_TURKISH: 

1192 cached_path( 

1193 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-dev.conllu", 

1194 Path("datasets") / task.value, 

1195 ) 

1196 cached_path( 

1197 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-test.conllu", 

1198 Path("datasets") / task.value, 

1199 ) 

1200 cached_path( 

1201 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-train.conllu", 

1202 Path("datasets") / task.value, 

1203 ) 

1204 

1205 if task == NLPTask.UD_PERSIAN: 

1206 cached_path( 

1207 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-dev.conllu", 

1208 Path("datasets") / task.value, 

1209 ) 

1210 cached_path( 

1211 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-test.conllu", 

1212 Path("datasets") / task.value, 

1213 ) 

1214 cached_path( 

1215 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-train.conllu", 

1216 Path("datasets") / task.value, 

1217 ) 

1218 

1219 if task == NLPTask.UD_RUSSIAN: 

1220 cached_path( 

1221 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu", 

1222 Path("datasets") / task.value, 

1223 ) 

1224 cached_path( 

1225 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-test.conllu", 

1226 Path("datasets") / task.value, 

1227 ) 

1228 cached_path( 

1229 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-train.conllu", 

1230 Path("datasets") / task.value, 

1231 ) 

1232 

1233 if task == NLPTask.UD_HINDI: 

1234 cached_path( 

1235 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-dev.conllu", 

1236 Path("datasets") / task.value, 

1237 ) 

1238 cached_path( 

1239 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-test.conllu", 

1240 Path("datasets") / task.value, 

1241 ) 

1242 cached_path( 

1243 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-train.conllu", 

1244 Path("datasets") / task.value, 

1245 ) 

1246 

1247 if task == NLPTask.UD_INDONESIAN: 

1248 cached_path( 

1249 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu", 

1250 Path("datasets") / task.value, 

1251 ) 

1252 cached_path( 

1253 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-test.conllu", 

1254 Path("datasets") / task.value, 

1255 ) 

1256 cached_path( 

1257 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-train.conllu", 

1258 Path("datasets") / task.value, 

1259 ) 

1260 

1261 if task == NLPTask.UD_JAPANESE: 

1262 cached_path( 

1263 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-dev.conllu", 

1264 Path("datasets") / task.value, 

1265 ) 

1266 cached_path( 

1267 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-test.conllu", 

1268 Path("datasets") / task.value, 

1269 ) 

1270 cached_path( 

1271 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-train.conllu", 

1272 Path("datasets") / task.value, 

1273 ) 

1274 

1275 if task == NLPTask.UD_CHINESE: 

1276 cached_path( 

1277 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-dev.conllu", 

1278 Path("datasets") / task.value, 

1279 ) 

1280 cached_path( 

1281 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-test.conllu", 

1282 Path("datasets") / task.value, 

1283 ) 

1284 cached_path( 

1285 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-train.conllu", 

1286 Path("datasets") / task.value, 

1287 ) 

1288 

1289 if task == NLPTask.UD_KOREAN: 

1290 cached_path( 

1291 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-dev.conllu", 

1292 Path("datasets") / task.value, 

1293 ) 

1294 cached_path( 

1295 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-test.conllu", 

1296 Path("datasets") / task.value, 

1297 ) 

1298 cached_path( 

1299 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-train.conllu", 

1300 Path("datasets") / task.value, 

1301 ) 

1302 

1303 if task == NLPTask.UD_BASQUE: 

1304 cached_path( 

1305 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-dev.conllu", 

1306 Path("datasets") / task.value, 

1307 ) 

1308 cached_path( 

1309 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-test.conllu", 

1310 Path("datasets") / task.value, 

1311 ) 

1312 cached_path( 

1313 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-train.conllu", 

1314 Path("datasets") / task.value, 

1315 ) 

1316 

1317 if task.value.startswith("wassa"): 

1318 

1319 emotion = task.value[6:] 

1320 

1321 for split in ["train", "dev", "test"]: 

1322 

1323 data_file = ( 

1324 flair.cache_root 

1325 / "datasets" 

1326 / task.value 

1327 / f"{emotion}-{split}.txt" 

1328 ) 

1329 

1330 if not data_file.is_file(): 

1331 

1332 if split == "train": 

1333 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt" 

1334 if split == "dev": 

1335 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt" 

1336 if split == "test": 

1337 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt" 

1338 

1339 path = cached_path(url, Path("datasets") / task.value) 

1340 

1341 with open(path, "r") as f: 

1342 with open(data_file, "w") as out: 

1343 next(f) 

1344 for line in f: 

1345 fields = line.split("\t") 

1346 out.write( 

1347 f"__label__{fields[3].rstrip()} {fields[1]}\n" 

1348 ) 

1349 

1350 os.remove(path) 

1351 

1352 if task == NLPTask.UD_GERMAN_HDT: 

1353 cached_path( 

1354 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-dev.conllu", 

1355 Path("datasets") / task.value, 

1356 ) 

1357 cached_path( 

1358 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-test.conllu", 

1359 Path("datasets") / task.value, 

1360 ) 

1361 cached_path( 

1362 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-train-a.conllu", 

1363 Path("datasets") / task.value / "original", 

1364 ) 

1365 cached_path( 

1366 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-train-b.conllu", 

1367 Path("datasets") / task.value / "original", 

1368 ) 

1369 data_path = flair.cache_root / "datasets" / task.value 

1370 

1371 train_filenames = ["de_hdt-ud-train-a.conllu", "de_hdt-ud-train-b.conllu"] 

1372 

1373 new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu" 

1374 

1375 if not new_train_file.is_file(): 

1376 with open(new_train_file, "wt") as f_out: 

1377 for train_filename in train_filenames: 

1378 with open( 

1379 data_path / "original" / train_filename, "rt" 

1380 ) as f_in: 

1381 f_out.write(f_in.read())