Coverage for flair/flair/data_fetcher.py: 0%
1import logging
2import os
3import re
5from deprecated import deprecated
6from enum import Enum
7from pathlib import Path
8from typing import List, Dict, Union
11import flair
12from flair.data import (
13 Sentence,
14 Corpus,
15 Token,
16 Tokenizer,
17 MultiCorpus
18)
19from flair.tokenization import SegtokTokenizer, SpaceTokenizer
20from flair.file_utils import cached_path
22log = logging.getLogger("flair")
25class NLPTask(Enum):
26 # conll 2000 column format
27 CONLL_2000 = "conll_2000"
29 # conll 03 NER column format
30 CONLL_03 = "conll_03"
31 CONLL_03_GERMAN = "conll_03_german"
32 CONLL_03_DUTCH = "conll_03_dutch"
33 CONLL_03_SPANISH = "conll_03_spanish"
35 # WNUT-17
36 WNUT_17 = "wnut_17"
38 # -- WikiNER datasets
39 WIKINER_ENGLISH = "wikiner_english"
40 WIKINER_GERMAN = "wikiner_german"
41 WIKINER_FRENCH = "wikiner_french"
42 WIKINER_SPANISH = "wikiner_spanish"
43 WIKINER_ITALIAN = "wikiner_italian"
44 WIKINER_DUTCH = "wikiner_dutch"
45 WIKINER_POLISH = "wikiner_polish"
46 WIKINER_PORTUGUESE = "wikiner_portuguese"
47 WIKINER_RUSSIAN = "wikiner_russian"
49 # -- Universal Dependencies
50 # Germanic
51 UD_ENGLISH = "ud_english"
52 UD_GERMAN = "ud_german"
53 UD_DUTCH = "ud_dutch"
54 # Romance
55 UD_FRENCH = "ud_french"
56 UD_ITALIAN = "ud_italian"
57 UD_SPANISH = "ud_spanish"
58 UD_PORTUGUESE = "ud_portuguese"
59 UD_ROMANIAN = "ud_romanian"
60 UD_CATALAN = "ud_catalan"
61 # West-Slavic
62 UD_POLISH = "ud_polish"
63 UD_CZECH = "ud_czech"
64 UD_SLOVAK = "ud_slovak"
65 # South-Slavic
66 UD_SLOVENIAN = "ud_slovenian"
67 UD_CROATIAN = "ud_croatian"
68 UD_SERBIAN = "ud_serbian"
69 UD_BULGARIAN = "ud_bulgarian"
70 # East-Slavic
71 UD_RUSSIAN = "ud_russian"
72 # Scandinavian
73 UD_SWEDISH = "ud_swedish"
74 UD_DANISH = "ud_danish"
75 UD_NORWEGIAN = "ud_norwegian"
76 UD_FINNISH = "ud_finnish"
77 # Asian
78 UD_ARABIC = "ud_arabic"
79 UD_HEBREW = "ud_hebrew"
80 UD_TURKISH = "ud_turkish"
81 UD_PERSIAN = "ud_persian"
82 UD_HINDI = "ud_hindi"
83 UD_INDONESIAN = "ud_indonesian"
84 UD_JAPANESE = "ud_japanese"
85 UD_CHINESE = "ud_chinese"
86 UD_KOREAN = "ud_korean"
88 # Language isolates
89 UD_BASQUE = "ud_basque"
91 # recent Universal Dependencies
92 UD_GERMAN_HDT = "ud_german_hdt"
94 # other datasets
95 ONTONER = "ontoner"
96 FASHION = "fashion"
97 GERMEVAL = "germeval"
98 SRL = "srl"
99 WSD = "wsd"
100 CONLL_12 = "conll_12"
101 PENN = "penn"
102 ONTONOTES = "ontonotes"
103 NER_BASQUE = "eiec"
105 # text classification format
106 IMDB = "imdb"
107 AG_NEWS = "ag_news"
108 TREC_6 = "trec-6"
109 TREC_50 = "trec-50"
111 # text regression format
112 REGRESSION = "regression"
113 WASSA_ANGER = "wassa-anger"
114 WASSA_FEAR = "wassa-fear"
115 WASSA_JOY = "wassa-joy"
116 WASSA_SADNESS = "wassa-sadness"
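# Note (a sketch of how the enum is used, not part of the original file): each
# NLPTask value doubles as the dataset's folder name under the cache, e.g.
#
#   NLPTask.CONLL_2000.value                                  # -> "conll_2000"
#   flair.cache_root / "datasets" / NLPTask.CONLL_2000.value  # default data folder
#
# which is how load_corpus() and download_dataset() below locate the data.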
119class NLPTaskDataFetcher:
120 @staticmethod
121 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
122 def load_corpora(
123 tasks: List[Union[NLPTask, str]], base_path: Union[str, Path] = None
124 ) -> MultiCorpus:
125 return MultiCorpus(
126 [NLPTaskDataFetcher.load_corpus(task, Path(base_path)) for task in tasks]
127 )
129 @staticmethod
130 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
131 def load_corpus(task: Union[NLPTask, str], base_path: Union[str, Path] = None) -> Corpus:
132 """
133 Helper function to fetch a Corpus for a specific NLPTask. For this to work, you first need to download
134 the corresponding NLP task data and place it in the appropriate folder structure. The tutorials at
135 https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this
136 code as a template to create your own data fetchers.
137 :param task: specification of the NLPTask you wish to get
138 :param base_path: path to the data folder containing the task subfolders
139 :return: a Corpus consisting of train, dev and test data
140 """
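# Hedged usage sketch (the task choice is illustrative, not prescribed here):
# download the CoNLL-2000 chunking data into the cache and load it as a Corpus.
#
#   from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
#   corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
#   print(corpus)           # train/dev/test sizes
#   print(corpus.train[0])  # first training Sentence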
142 # first, try to fetch dataset online
143 if type(task) is NLPTask:
144 NLPTaskDataFetcher.download_dataset(task)
146 # default dataset folder is the cache root
147 if not base_path:
148 base_path = flair.cache_root / "datasets"
150 if type(base_path) == str:
151 base_path: Path = Path(base_path)
153 # get string value if enum is passed
154 task = task.value if type(task) is NLPTask else task
156 data_folder = base_path / task.lower()
158 # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
159 if task == NLPTask.CONLL_2000.value:
160 columns = {0: "text", 1: "pos", 2: "np"}
162 return NLPTaskDataFetcher.load_column_corpus(
163 data_folder, columns, tag_to_biloes="np"
164 )
166 # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and NER tag
167 if (
168 task == NLPTask.CONLL_03.value
169 or task == NLPTask.ONTONER.value
170 or task == NLPTask.FASHION.value
171 ):
172 columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
174 return NLPTaskDataFetcher.load_column_corpus(
175 data_folder, columns, tag_to_biloes="ner"
176 )
178 # the CoNLL 03 task for German has an additional lemma column
179 if task == NLPTask.CONLL_03_GERMAN.value:
180 columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}
182 return NLPTaskDataFetcher.load_column_corpus(
183 data_folder, columns, tag_to_biloes="ner"
184 )
186 # the CoNLL 03 task for Dutch has no NP column
187 if task == NLPTask.CONLL_03_DUTCH.value or task.startswith("wikiner"):
188 columns = {0: "text", 1: "pos", 2: "ner"}
190 return NLPTaskDataFetcher.load_column_corpus(
191 data_folder, columns, tag_to_biloes="ner"
192 )
194 # the CoNLL 03 task for Spanish only has two columns
195 if task == NLPTask.CONLL_03_SPANISH.value or task == NLPTask.WNUT_17.value:
196 columns = {0: "text", 1: "ner"}
198 return NLPTaskDataFetcher.load_column_corpus(
199 data_folder, columns, tag_to_biloes="ner"
200 )
202 # the GERMEVAL task only has two columns: text and ner
203 if task == NLPTask.GERMEVAL.value:
204 columns = {1: "text", 2: "ner"}
206 return NLPTaskDataFetcher.load_column_corpus(
207 data_folder, columns, tag_to_biloes="ner"
208 )
210 # WSD tasks may be put into this column format
211 if task == NLPTask.WSD.value:
212 columns = {0: "text", 1: "lemma", 2: "pos", 3: "sense"}
213 return NLPTaskDataFetcher.load_column_corpus(
214 data_folder,
215 columns,
216 train_file="semcor.tsv",
217 test_file="semeval2015.tsv",
218 )
220 # the UD corpora follow the CoNLL-U format, for which we have a special reader
221 if task.startswith("ud_") or task in [
222 NLPTask.ONTONOTES.value,
223 NLPTask.CONLL_12.value,
224 NLPTask.PENN.value,
225 ]:
226 return NLPTaskDataFetcher.load_ud_corpus(data_folder)
228 # for text classifiers, we use our own special format
229 if task in [
230 NLPTask.IMDB.value,
231 NLPTask.AG_NEWS.value,
232 NLPTask.TREC_6.value,
233 NLPTask.TREC_50.value,
234 NLPTask.REGRESSION.value,
235 ]:
236 tokenizer: Tokenizer = SpaceTokenizer() if task in [
237 NLPTask.TREC_6.value,
238 NLPTask.TREC_50.value,
239 ] else SegtokTokenizer()
241 return NLPTaskDataFetcher.load_classification_corpus(
242 data_folder, tokenizer=tokenizer
243 )
245 # NER corpus for Basque
246 if task == NLPTask.NER_BASQUE.value:
247 columns = {0: "text", 1: "ner"}
248 return NLPTaskDataFetcher.load_column_corpus(
249 data_folder, columns, tag_to_biloes="ner"
250 )
252 if task.startswith("wassa"):
253 return NLPTaskDataFetcher.load_classification_corpus(
254 data_folder, tokenizer=SegtokTokenizer()
255 )
257 @staticmethod
258 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
259 def load_column_corpus(
260 data_folder: Union[str, Path],
261 column_format: Dict[int, str],
262 train_file=None,
263 test_file=None,
264 dev_file=None,
265 tag_to_biloes=None,
266 ) -> Corpus:
267 """
268 Helper function to get a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
270 :param data_folder: base folder with the task data
271 :param column_format: a map specifying the column format
272 :param train_file: the name of the train file
273 :param test_file: the name of the test file
274 :param dev_file: the name of the dev file, if None, dev data is sampled from train
275 :param tag_to_biloes: tag type (e.g. 'ner') to convert to the IOBES tagging scheme; None leaves tags unchanged
276 :return: a Corpus with annotated train, dev and test data
277 """
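# Minimal sketch (folder path and file layout are assumptions): load a
# CoNLL-03-style folder whose files have four whitespace-separated columns,
# converting the NER tags to the IOBES scheme.
#
#   columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
#   corpus = NLPTaskDataFetcher.load_column_corpus(
#       "resources/tasks/conll_03", columns, tag_to_biloes="ner"
#   )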
279 if type(data_folder) == str:
280 data_folder: Path = Path(data_folder)
282 if train_file is not None:
283 train_file = data_folder / train_file
284 if test_file is not None:
285 test_file = data_folder / test_file
286 if dev_file is not None:
287 dev_file = data_folder / dev_file
289 # automatically identify train / test / dev files
290 if train_file is None:
291 for file in data_folder.iterdir():
292 file_name = file.name
293 if file_name.endswith(".gz"):
294 continue
295 if "train" in file_name and "54019" not in file_name:
296 train_file = file
297 if "dev" in file_name:
298 dev_file = file
299 if "testa" in file_name:
300 dev_file = file
301 if "testb" in file_name:
302 test_file = file
304 # if no test file is found, take any file with 'test' in name
305 if test_file is None:
306 for file in data_folder.iterdir():
307 file_name = file.name
308 if file_name.endswith(".gz"):
309 continue
310 if "test" in file_name:
311 test_file = file
313 log.info("Reading data from {}".format(data_folder))
314 log.info("Train: {}".format(train_file))
315 log.info("Dev: {}".format(dev_file))
316 log.info("Test: {}".format(test_file))
318 # get train and test data
319 sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
320 train_file, column_format
321 )
323 # read in the test file if it exists, otherwise sample 10% of the train data as the test set
324 if test_file is not None:
325 sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
326 test_file, column_format
327 )
328 else:
329 sentences_test: List[Sentence] = [
330 sentences_train[i]
331 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
332 ]
333 sentences_train = [x for x in sentences_train if x not in sentences_test]
335 # read in the dev file if it exists, otherwise sample 10% of the train data as the dev set
336 if dev_file is not None:
337 sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
338 dev_file, column_format
339 )
340 else:
341 sentences_dev: List[Sentence] = [
342 sentences_train[i]
343 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
344 ]
345 sentences_train = [x for x in sentences_train if x not in sentences_dev]
347 if tag_to_biloes is not None:
348 # convert tag scheme to iobes
349 for sentence in sentences_train + sentences_test + sentences_dev:
350 sentence.convert_tag_scheme(
351 tag_type=tag_to_biloes, target_scheme="iobes"
352 )
354 return Corpus(
355 sentences_train, sentences_dev, sentences_test, name=data_folder.name
356 )
358 @staticmethod
359 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
360 def load_ud_corpus(
361 data_folder: Union[str, Path], train_file=None, test_file=None, dev_file=None
362 ) -> Corpus:
363 """
364 Helper function to get a Corpus from CoNLL-U column-formatted task data such as the UD corpora
366 :param data_folder: base folder with the task data
367 :param train_file: the name of the train file
368 :param test_file: the name of the test file
369 :param dev_file: the name of the dev file; if None, it is detected automatically from the folder
370 :return: a Corpus with annotated train, dev and test data
371 """
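# Sketch (folder path is an assumption): the .conllu files in the folder are
# picked up by name ("train", "dev"/"testa", "test"/"testb").
#
#   corpus = NLPTaskDataFetcher.load_ud_corpus(
#       flair.cache_root / "datasets" / "ud_english"
#   )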
372 # automatically identify train / test / dev files
373 if train_file is None:
374 for file in data_folder.iterdir():
375 file_name = file.name
376 if "train" in file_name:
377 train_file = file
378 if "test" in file_name:
379 test_file = file
380 if "dev" in file_name:
381 dev_file = file
382 if "testa" in file_name:
383 dev_file = file
384 if "testb" in file_name:
385 test_file = file
387 log.info("Reading data from {}".format(data_folder))
388 log.info("Train: {}".format(train_file))
389 log.info("Dev: {}".format(dev_file))
390 log.info("Test: {}".format(test_file))
392 sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
393 sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
394 sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)
396 return Corpus(
397 sentences_train, sentences_dev, sentences_test, name=data_folder.name
398 )
400 @staticmethod
401 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
402 def load_classification_corpus(
403 data_folder: Union[str, Path],
404 train_file=None,
405 test_file=None,
406 dev_file=None,
407 tokenizer: Tokenizer = SegtokTokenizer(),
408 max_tokens_per_doc=-1,
409 ) -> Corpus:
410 """
411 Helper function to get a Corpus from text classification-formatted task data
413 :param data_folder: base folder with the task data
414 :param train_file: the name of the train file
415 :param test_file: the name of the test file
416 :param dev_file: the name of the dev file, if None, dev data is sampled from train
417 :param tokenizer: Custom tokenizer to use (default SegtokTokenizer)
418 :return: a Corpus with annotated train, dev and test data
419 """
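# Sketch (folder and tokenizer choice are assumptions): load FastText-style
# classification files, splitting documents on whitespace only.
#
#   from flair.tokenization import SpaceTokenizer
#   corpus = NLPTaskDataFetcher.load_classification_corpus(
#       flair.cache_root / "datasets" / "trec-6", tokenizer=SpaceTokenizer()
#   )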
421 if type(data_folder) == str:
422 data_folder: Path = Path(data_folder)
424 if train_file is not None:
425 train_file = data_folder / train_file
426 if test_file is not None:
427 test_file = data_folder / test_file
428 if dev_file is not None:
429 dev_file = data_folder / dev_file
431 # automatically identify train / test / dev files
432 if train_file is None:
433 for file in data_folder.iterdir():
434 file_name = file.name
435 if "train" in file_name:
436 train_file = file
437 if "test" in file_name:
438 test_file = file
439 if "dev" in file_name:
440 dev_file = file
441 if "testa" in file_name:
442 dev_file = file
443 if "testb" in file_name:
444 test_file = file
446 log.info("Reading data from {}".format(data_folder))
447 log.info("Train: {}".format(train_file))
448 log.info("Dev: {}".format(dev_file))
449 log.info("Test: {}".format(test_file))
451 sentences_train: List[
452 Sentence
453 ] = NLPTaskDataFetcher.read_text_classification_file(
454 train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
455 )
456 sentences_test: List[
457 Sentence
458 ] = NLPTaskDataFetcher.read_text_classification_file(
459 test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
460 )
462 if dev_file is not None:
463 sentences_dev: List[
464 Sentence
465 ] = NLPTaskDataFetcher.read_text_classification_file(
466 dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
467 )
468 else:
469 sentences_dev: List[Sentence] = [
470 sentences_train[i]
471 for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
472 ]
473 sentences_train = [x for x in sentences_train if x not in sentences_dev]
475 return Corpus(sentences_train, sentences_dev, sentences_test)
477 @staticmethod
478 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
479 def read_text_classification_file(
480 path_to_file: Union[str, Path],
481 max_tokens_per_doc=-1,
482 tokenizer: Tokenizer = SegtokTokenizer(),
483 ) -> List[Sentence]:
484 """
485 Reads a data file for text classification. The file should contain one document/text per line.
486 The line should have the following format:
487 __label__<class_name> <text>
488 If you have a multi-class task, you can have as many labels as you want at the beginning of the line, e.g.,
489 __label__<class_name_1> __label__<class_name_2> <text>
490 :param path_to_file: the path to the data file
491 :param max_tokens_per_doc: Takes at most this many tokens per document. If set to -1, all documents are taken as is.
492 :param tokenizer: Custom tokenizer to use to prepare the data set (default SegtokTokenizer)
493 :return: list of sentences
494 """
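# Illustrative file contents (the texts and labels are made up):
#
#   __label__POSITIVE great cast and a tight plot
#   __label__NEGATIVE __label__LONG drags on far too long
#
# Each line becomes one Sentence whose labels are the __label__ prefixes and
# whose text is everything after them.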
495 label_prefix = "__label__"
496 sentences = []
498 with open(str(path_to_file), encoding="utf-8") as f:
499 for line in f:
500 words = line.split()
502 labels = []
503 l_len = 0
505 for i in range(len(words)):
506 if words[i].startswith(label_prefix):
507 l_len += len(words[i]) + 1
508 label = words[i].replace(label_prefix, "")
509 labels.append(label)
510 else:
511 break
513 text = line[l_len:].strip()
515 if text and labels:
516 sentence = Sentence(text, labels=labels, use_tokenizer=tokenizer)
517 if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0:
518 sentence.tokens = sentence.tokens[:max_tokens_per_doc]
519 if len(sentence.tokens) > 0:
520 sentences.append(sentence)
522 return sentences
524 @staticmethod
525 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
526 def read_column_data(
527 path_to_column_file: Union[str, Path],
528 column_name_map: Dict[int, str],
529 infer_whitespace_after: bool = True,
530 ):
531 """
532 Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the
533 column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
534 specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
535 the chunk and the fourth the NER tag.
536 :param path_to_column_file: the path to the column file
537 :param column_name_map: a map of column number to token annotation name
538 :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
539 :return: list of sentences
540 """
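# Illustrative column file for column_name_map {0: "text", 1: "pos", 2: "np",
# 3: "ner"} (values are made up); blank lines separate sentences:
#
#   Nadine  NNP  I-NP   B-PER
#   lives   VBZ  I-VP   O
#   here    RB   I-ADVP O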
541 sentences: List[Sentence] = []
543 try:
544 lines: List[str] = open(
545 str(path_to_column_file), encoding="utf-8"
546 ).read().strip().split("\n")
547 except UnicodeDecodeError:
548 log.info(
549 'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
550 path_to_column_file
551 )
552 )
553 lines: List[str] = open(
554 str(path_to_column_file), encoding="latin1"
555 ).read().strip().split("\n")
557 # most data sets have the token text in the first column; if not, map the correct column to 'text' in column_name_map
558 text_column: int = 0
559 for column in column_name_map:
560 if column_name_map[column] == "text":
561 text_column = column
563 sentence: Sentence = Sentence()
564 for line in lines:
566 if line.startswith("#"):
567 continue
569 if line.strip().replace("\ufeff", "") == "":  # strip a stray BOM (U+FEFF) before testing for a blank line
570 if len(sentence) > 0:
571 sentence.infer_space_after()
572 sentences.append(sentence)
573 sentence: Sentence = Sentence()
575 else:
576 fields: List[str] = re.split(r"\s+", line)
577 token = Token(fields[text_column])
578 for column in column_name_map:
579 if len(fields) > column:
580 if column != text_column:
581 token.add_tag(column_name_map[column], fields[column])
583 sentence.add_token(token)
585 if len(sentence.tokens) > 0:
586 sentence.infer_space_after()
587 sentences.append(sentence)
589 return sentences
591 @staticmethod
592 @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
593 def read_conll_ud(path_to_conll_file: Union[str, Path]) -> List[Sentence]:
594 """
595 Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation
596 :param path_to_conll_file: the path to the conll-u file
597 :return: list of sentences
598 """
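# Illustrative CoNLL-U token line (10 tab-separated fields, values made up):
#
#   1  Der  der  DET  ART  Case=Nom|Gender=Masc  2  det  _  _
#
# The reader below uses field 1 as the token text, 2 as the lemma, 3 as upos,
# 4 as pos, 5 as morphological features, 6 as the head id and 7 as the dependency.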
599 sentences: List[Sentence] = []
601 lines: List[str] = open(
602 path_to_conll_file, encoding="utf-8"
603 ).read().strip().split("\n")
605 sentence: Sentence = Sentence()
606 for line in lines:
608 fields: List[str] = re.split("\t+", line)
609 if line == "":
610 if len(sentence) > 0:
611 sentences.append(sentence)
612 sentence: Sentence = Sentence()
614 elif line.startswith("#"):
615 continue
616 elif "." in fields[0]:  # skip CoNLL-U empty nodes (ids like "1.1")
617 continue
618 elif "-" in fields[0]:  # skip multi-word token ranges (ids like "1-2")
619 continue
620 else:
621 token = Token(fields[1], head_id=int(fields[6]))
622 token.add_tag("lemma", str(fields[2]))
623 token.add_tag("upos", str(fields[3]))
624 token.add_tag("pos", str(fields[4]))
625 token.add_tag("dependency", str(fields[7]))
627 for morph in str(fields[5]).split("|"):
628 if "=" not in morph:
629 continue
630 token.add_tag(morph.split("=")[0].lower(), morph.split("=")[1])
632 if len(fields) > 10 and str(fields[10]) == "Y":
633 token.add_tag("frame", str(fields[11]))
635 sentence.add_token(token)
637 if len(sentence.tokens) > 0:
638 sentences.append(sentence)
640 return sentences
642 @staticmethod
643 def __sample(total_number_of_sentences: int, percentage: float = 0.1) -> List[int]:
644 import random
646 sample_size: int = round(total_number_of_sentences * percentage)
647 sample = random.sample(range(total_number_of_sentences), sample_size)  # sample over all indices, including 0
648 return sample
650 @staticmethod
651 def download_dataset(task: NLPTask):
653 # conll 2000 chunking task
654 if task == NLPTask.CONLL_2000:
655 conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/"
656 data_file = flair.cache_root / "datasets" / task.value / "train.txt"
657 if not data_file.is_file():
658 cached_path(
659 f"{conll_2000_path}train.txt.gz", Path("datasets") / task.value
660 )
661 cached_path(
662 f"{conll_2000_path}test.txt.gz", Path("datasets") / task.value
663 )
664 import gzip, shutil
666 with gzip.open(
667 flair.cache_root / "datasets" / task.value / "train.txt.gz",
668 "rb",
669 ) as f_in:
670 with open(
671 flair.cache_root / "datasets" / task.value / "train.txt",
672 "wb",
673 ) as f_out:
674 shutil.copyfileobj(f_in, f_out)
675 with gzip.open(
676 flair.cache_root / "datasets" / task.value / "test.txt.gz",
677 "rb",
678 ) as f_in:
679 with open(
680 flair.cache_root / "datasets" / task.value / "test.txt",
681 "wb",
682 ) as f_out:
683 shutil.copyfileobj(f_in, f_out)
685 if task == NLPTask.NER_BASQUE:
686 ner_basque_path = "http://ixa2.si.ehu.eus/eiec/"
687 data_path = flair.cache_root / "datasets" / task.value
688 data_file = data_path / "named_ent_eu.train"
689 if not data_file.is_file():
690 cached_path(
691 f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / task.value
692 )
693 import tarfile, shutil
695 with tarfile.open(
696 flair.cache_root / "datasets" / task.value / "eiec_v1.0.tgz",
697 "r:gz",
698 ) as f_in:
699 corpus_files = (
700 "eiec_v1.0/named_ent_eu.train",
701 "eiec_v1.0/named_ent_eu.test",
702 )
703 for corpus_file in corpus_files:
704 f_in.extract(corpus_file, data_path)
705 shutil.move(f"{data_path}/{corpus_file}", data_path)
707 if task == NLPTask.IMDB:
708 imdb_acl_path = (
709 "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
710 )
711 data_path = flair.cache_root / "datasets" / task.value
712 data_file = data_path / "train.txt"
713 if not data_file.is_file():
714 cached_path(imdb_acl_path, Path("datasets") / task.value)
715 import tarfile
717 with tarfile.open(
718 flair.cache_root
719 / "datasets"
720 / task.value
721 / "aclImdb_v1.tar.gz",
722 "r:gz",
723 ) as f_in:
724 datasets = ["train", "test"]
725 labels = ["pos", "neg"]
727 for label in labels:
728 for dataset in datasets:
729 f_in.extractall(
730 data_path,
731 members=[
732 m
733 for m in f_in.getmembers()
734 if f"{dataset}/{label}" in m.name
735 ],
736 )
737 with open(f"{data_path}/{dataset}.txt", "at") as f_p:
738 current_path = data_path / "aclImdb" / dataset / label
739 for file_name in current_path.iterdir():
740 if file_name.is_file() and file_name.name.endswith(
741 ".txt"
742 ):
743 f_p.write(
744 f"__label__{label} "
745 + file_name.open(
746 "rt", encoding="utf-8"
747 ).read()
748 + "\n"
749 )
751 # Support both TREC-6 and TREC-50
752 if task.value.startswith("trec"):
753 trec_path = "http://cogcomp.org/Data/QA/QC/"
755 original_filenames = ["train_5500.label", "TREC_10.label"]
756 new_filenames = ["train.txt", "test.txt"]
757 for original_filename in original_filenames:
758 cached_path(
759 f"{trec_path}{original_filename}",
760 Path("datasets") / task.value / "original",
761 )
763 data_path = flair.cache_root / "datasets" / task.value
764 data_file = data_path / new_filenames[0]
766 if not data_file.is_file():
767 for original_filename, new_filename in zip(
768 original_filenames, new_filenames
769 ):
770 with open(
771 data_path / "original" / original_filename,
772 "rt",
773 encoding="latin1",
774 ) as open_fp:
775 with open(
776 data_path / new_filename, "wt", encoding="utf-8"
777 ) as write_fp:
778 for line in open_fp:
779 line = line.rstrip()
780 fields = line.split()
781 old_label = fields[0]
782 question = " ".join(fields[1:])
784 # Create flair compatible labels
785 # TREC-6 : NUM:dist -> __label__NUM
786 # TREC-50: NUM:dist -> __label__NUM:dist
787 new_label = "__label__"
788 new_label += (
789 old_label.split(":")[0]
790 if task.value == "trec-6"
791 else old_label
792 )
794 write_fp.write(f"{new_label} {question}\n")
796 if task == NLPTask.WNUT_17:
797 wnut_path = "https://noisy-text.github.io/2017/files/"
798 cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / task.value)
799 cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / task.value)
800 cached_path(
801 f"{wnut_path}emerging.test.annotated", Path("datasets") / task.value
802 )
804 # Wikiner NER task
805 wikiner_path = (
806 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/"
807 )
808 if task.value.startswith("wikiner"):
809 lc = ""
810 if task == NLPTask.WIKINER_ENGLISH:
811 lc = "en"
812 if task == NLPTask.WIKINER_GERMAN:
813 lc = "de"
814 if task == NLPTask.WIKINER_DUTCH:
815 lc = "nl"
816 if task == NLPTask.WIKINER_FRENCH:
817 lc = "fr"
818 if task == NLPTask.WIKINER_ITALIAN:
819 lc = "it"
820 if task == NLPTask.WIKINER_SPANISH:
821 lc = "es"
822 if task == NLPTask.WIKINER_PORTUGUESE:
823 lc = "pt"
824 if task == NLPTask.WIKINER_POLISH:
825 lc = "pl"
826 if task == NLPTask.WIKINER_RUSSIAN:
827 lc = "ru"
829 data_file = (
830 flair.cache_root
831 / "datasets"
832 / task.value
833 / f"aij-wikiner-{lc}-wp3.train"
834 )
835 if not data_file.is_file():
837 cached_path(
838 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2",
839 Path("datasets") / task.value,
840 )
841 import bz2, shutil
843 # unpack and write out in CoNLL column-like format
844 bz_file = bz2.BZ2File(
845 flair.cache_root
846 / "datasets"
847 / task.value
848 / f"aij-wikiner-{lc}-wp3.bz2",
849 "rb",
850 )
851 with bz_file as f, open(
852 flair.cache_root
853 / "datasets"
854 / task.value
855 / f"aij-wikiner-{lc}-wp3.train",
856 "w",
857 ) as out:
858 for line in f:
859 line = line.decode("utf-8")
860 words = line.split(" ")
861 for word in words:
862 out.write("\t".join(word.split("|")) + "\n")
864 # CoNLL 02/03 NER
865 conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
866 if task == NLPTask.CONLL_03_DUTCH:
867 cached_path(f"{conll_02_path}ned.testa", Path("datasets") / task.value)
868 cached_path(f"{conll_02_path}ned.testb", Path("datasets") / task.value)
869 cached_path(f"{conll_02_path}ned.train", Path("datasets") / task.value)
870 if task == NLPTask.CONLL_03_SPANISH:
871 cached_path(f"{conll_02_path}esp.testa", Path("datasets") / task.value)
872 cached_path(f"{conll_02_path}esp.testb", Path("datasets") / task.value)
873 cached_path(f"{conll_02_path}esp.train", Path("datasets") / task.value)
875 # universal dependencies
876 ud_path = "https://raw.githubusercontent.com/UniversalDependencies/"
877 # --- UD Germanic
878 if task == NLPTask.UD_ENGLISH:
879 cached_path(
880 f"{ud_path}UD_English-EWT/master/en_ewt-ud-dev.conllu",
881 Path("datasets") / task.value,
882 )
883 cached_path(
884 f"{ud_path}UD_English-EWT/master/en_ewt-ud-test.conllu",
885 Path("datasets") / task.value,
886 )
887 cached_path(
888 f"{ud_path}UD_English-EWT/master/en_ewt-ud-train.conllu",
889 Path("datasets") / task.value,
890 )
892 if task == NLPTask.UD_GERMAN:
893 cached_path(
894 f"{ud_path}UD_German-GSD/master/de_gsd-ud-dev.conllu",
895 Path("datasets") / task.value,
896 )
897 cached_path(
898 f"{ud_path}UD_German-GSD/master/de_gsd-ud-test.conllu",
899 Path("datasets") / task.value,
900 )
901 cached_path(
902 f"{ud_path}UD_German-GSD/master/de_gsd-ud-train.conllu",
903 Path("datasets") / task.value,
904 )
906 if task == NLPTask.UD_DUTCH:
907 cached_path(
908 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-dev.conllu",
909 Path("datasets") / task.value,
910 )
911 cached_path(
912 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-test.conllu",
913 Path("datasets") / task.value,
914 )
915 cached_path(
916 f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-train.conllu",
917 Path("datasets") / task.value,
918 )
920 # --- UD Romance
921 if task == NLPTask.UD_FRENCH:
922 cached_path(
923 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-dev.conllu",
924 Path("datasets") / task.value,
925 )
926 cached_path(
927 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-test.conllu",
928 Path("datasets") / task.value,
929 )
930 cached_path(
931 f"{ud_path}UD_French-GSD/master/fr_gsd-ud-train.conllu",
932 Path("datasets") / task.value,
933 )
935 if task == NLPTask.UD_ITALIAN:
936 cached_path(
937 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-dev.conllu",
938 Path("datasets") / task.value,
939 )
940 cached_path(
941 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-test.conllu",
942 Path("datasets") / task.value,
943 )
944 cached_path(
945 f"{ud_path}UD_Italian-ISDT/master/it_isdt-ud-train.conllu",
946 Path("datasets") / task.value,
947 )
949 if task == NLPTask.UD_SPANISH:
950 cached_path(
951 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-dev.conllu",
952 Path("datasets") / task.value,
953 )
954 cached_path(
955 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-test.conllu",
956 Path("datasets") / task.value,
957 )
958 cached_path(
959 f"{ud_path}UD_Spanish-GSD/master/es_gsd-ud-train.conllu",
960 Path("datasets") / task.value,
961 )
963 if task == NLPTask.UD_PORTUGUESE:
964 cached_path(
965 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-dev.conllu",
966 Path("datasets") / task.value,
967 )
968 cached_path(
969 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-test.conllu",
970 Path("datasets") / task.value,
971 )
972 cached_path(
973 f"{ud_path}UD_Portuguese-Bosque/blob/master/pt_bosque-ud-train.conllu",
974 Path("datasets") / task.value,
975 )
977 if task == NLPTask.UD_ROMANIAN:
978 cached_path(
979 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-dev.conllu",
980 Path("datasets") / task.value,
981 )
982 cached_path(
983 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-test.conllu",
984 Path("datasets") / task.value,
985 )
986 cached_path(
987 f"{ud_path}UD_Romanian-RRT/master/ro_rrt-ud-train.conllu",
988 Path("datasets") / task.value,
989 )
991 if task == NLPTask.UD_CATALAN:
992 cached_path(
993 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-dev.conllu",
994 Path("datasets") / task.value,
995 )
996 cached_path(
997 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-test.conllu",
998 Path("datasets") / task.value,
999 )
1000 cached_path(
1001 f"{ud_path}UD_Catalan-AnCora/master/ca_ancora-ud-train.conllu",
1002 Path("datasets") / task.value,
1003 )
1005 # --- UD West-Slavic
1006 if task == NLPTask.UD_POLISH:
1007 cached_path(
1008 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-dev.conllu",
1009 Path("datasets") / task.value,
1010 )
1011 cached_path(
1012 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-test.conllu",
1013 Path("datasets") / task.value,
1014 )
1015 cached_path(
1016 f"{ud_path}UD_Polish-LFG/master/pl_lfg-ud-train.conllu",
1017 Path("datasets") / task.value,
1018 )
1020 if task == NLPTask.UD_CZECH:
1021 cached_path(
1022 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-dev.conllu",
1023 Path("datasets") / task.value,
1024 )
1025 cached_path(
1026 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-test.conllu",
1027 Path("datasets") / task.value,
1028 )
1029 cached_path(
1030 f"{ud_path}UD_Czech-PDT/master/cs_pdt-ud-train-l.conllu",
1031 Path("datasets") / task.value,
1032 )
1034 if task == NLPTask.UD_SLOVAK:
1035 cached_path(
1036 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-dev.conllu",
1037 Path("datasets") / task.value,
1038 )
1039 cached_path(
1040 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-test.conllu",
1041 Path("datasets") / task.value,
1042 )
1043 cached_path(
1044 f"{ud_path}UD_Slovak-SNK/master/sk_snk-ud-train.conllu",
1045 Path("datasets") / task.value,
1046 )
1048 # --- UD Scandinavian
1049 if task == NLPTask.UD_SWEDISH:
1050 cached_path(
1051 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-dev.conllu",
1052 Path("datasets") / task.value,
1053 )
1054 cached_path(
1055 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-test.conllu",
1056 Path("datasets") / task.value,
1057 )
1058 cached_path(
1059 f"{ud_path}UD_Swedish-Talbanken/master/sv_talbanken-ud-train.conllu",
1060 Path("datasets") / task.value,
1061 )
1063 if task == NLPTask.UD_DANISH:
1064 cached_path(
1065 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-dev.conllu",
1066 Path("datasets") / task.value,
1067 )
1068 cached_path(
1069 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-test.conllu",
1070 Path("datasets") / task.value,
1071 )
1072 cached_path(
1073 f"{ud_path}UD_Danish-DDT/master/da_ddt-ud-train.conllu",
1074 Path("datasets") / task.value,
1075 )
1077 if task == NLPTask.UD_NORWEGIAN:
1078 cached_path(
1079 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-dev.conllu",
1080 Path("datasets") / task.value,
1081 )
1082 cached_path(
1083 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-test.conllu",
1084 Path("datasets") / task.value,
1085 )
1086 cached_path(
1087 f"{ud_path}UD_Norwegian-Bokmaal/master/no_bokmaal-ud-train.conllu",
1088 Path("datasets") / task.value,
1089 )
1091 if task == NLPTask.UD_FINNISH:
1092 cached_path(
1093 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-dev.conllu",
1094 Path("datasets") / task.value,
1095 )
1096 cached_path(
1097 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-test.conllu",
1098 Path("datasets") / task.value,
1099 )
1100 cached_path(
1101 f"{ud_path}UD_Finnish-TDT/master/fi_tdt-ud-train.conllu",
1102 Path("datasets") / task.value,
1103 )
1105 # --- UD South-Slavic
1106 if task == NLPTask.UD_SLOVENIAN:
1107 cached_path(
1108 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-dev.conllu",
1109 Path("datasets") / task.value,
1110 )
1111 cached_path(
1112 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-test.conllu",
1113 Path("datasets") / task.value,
1114 )
1115 cached_path(
1116 f"{ud_path}UD_Slovenian-SSJ/master/sl_ssj-ud-train.conllu",
1117 Path("datasets") / task.value,
1118 )
1120 if task == NLPTask.UD_CROATIAN:
1121 cached_path(
1122 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-dev.conllu",
1123 Path("datasets") / task.value,
1124 )
1125 cached_path(
1126 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-test.conllu",
1127 Path("datasets") / task.value,
1128 )
1129 cached_path(
1130 f"{ud_path}UD_Croatian-SET/master/hr_set-ud-train.conllu",
1131 Path("datasets") / task.value,
1132 )
1134 if task == NLPTask.UD_SERBIAN:
1135 cached_path(
1136 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-dev.conllu",
1137 Path("datasets") / task.value,
1138 )
1139 cached_path(
1140 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-test.conllu",
1141 Path("datasets") / task.value,
1142 )
1143 cached_path(
1144 f"{ud_path}UD_Serbian-SET/master/sr_set-ud-train.conllu",
1145 Path("datasets") / task.value,
1146 )
1148 if task == NLPTask.UD_BULGARIAN:
1149 cached_path(
1150 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-dev.conllu",
1151 Path("datasets") / task.value,
1152 )
1153 cached_path(
1154 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-test.conllu",
1155 Path("datasets") / task.value,
1156 )
1157 cached_path(
1158 f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-train.conllu",
1159 Path("datasets") / task.value,
1160 )
1162 # --- UD Asian
1163 if task == NLPTask.UD_ARABIC:
1164 cached_path(
1165 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-dev.conllu",
1166 Path("datasets") / task.value,
1167 )
1168 cached_path(
1169 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-test.conllu",
1170 Path("datasets") / task.value,
1171 )
1172 cached_path(
1173 f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-train.conllu",
1174 Path("datasets") / task.value,
1175 )
1177 if task == NLPTask.UD_HEBREW:
1178 cached_path(
1179 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-dev.conllu",
1180 Path("datasets") / task.value,
1181 )
1182 cached_path(
1183 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-test.conllu",
1184 Path("datasets") / task.value,
1185 )
1186 cached_path(
1187 f"{ud_path}UD_Hebrew-HTB/master/he_htb-ud-train.conllu",
1188 Path("datasets") / task.value,
1189 )
1191 if task == NLPTask.UD_TURKISH:
1192 cached_path(
1193 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-dev.conllu",
1194 Path("datasets") / task.value,
1195 )
1196 cached_path(
1197 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-test.conllu",
1198 Path("datasets") / task.value,
1199 )
1200 cached_path(
1201 f"{ud_path}UD_Turkish-IMST/master/tr_imst-ud-train.conllu",
1202 Path("datasets") / task.value,
1203 )
1205 if task == NLPTask.UD_PERSIAN:
1206 cached_path(
1207 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-dev.conllu",
1208 Path("datasets") / task.value,
1209 )
1210 cached_path(
1211 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-test.conllu",
1212 Path("datasets") / task.value,
1213 )
1214 cached_path(
1215 f"{ud_path}UD_Persian-Seraji/master/fa_seraji-ud-train.conllu",
1216 Path("datasets") / task.value,
1217 )
1219 if task == NLPTask.UD_RUSSIAN:
1220 cached_path(
1221 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu",
1222 Path("datasets") / task.value,
1223 )
1224 cached_path(
1225 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-test.conllu",
1226 Path("datasets") / task.value,
1227 )
1228 cached_path(
1229 f"{ud_path}UD_Russian-SynTagRus/master/ru_syntagrus-ud-train.conllu",
1230 Path("datasets") / task.value,
1231 )
1233 if task == NLPTask.UD_HINDI:
1234 cached_path(
1235 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-dev.conllu",
1236 Path("datasets") / task.value,
1237 )
1238 cached_path(
1239 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-test.conllu",
1240 Path("datasets") / task.value,
1241 )
1242 cached_path(
1243 f"{ud_path}UD_Hindi-HDTB/master/hi_hdtb-ud-train.conllu",
1244 Path("datasets") / task.value,
1245 )
1247 if task == NLPTask.UD_INDONESIAN:
1248 cached_path(
1249 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu",
1250 Path("datasets") / task.value,
1251 )
1252 cached_path(
1253 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-test.conllu",
1254 Path("datasets") / task.value,
1255 )
1256 cached_path(
1257 f"{ud_path}UD_Indonesian-GSD/master/id_gsd-ud-train.conllu",
1258 Path("datasets") / task.value,
1259 )
1261 if task == NLPTask.UD_JAPANESE:
1262 cached_path(
1263 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-dev.conllu",
1264 Path("datasets") / task.value,
1265 )
1266 cached_path(
1267 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-test.conllu",
1268 Path("datasets") / task.value,
1269 )
1270 cached_path(
1271 f"{ud_path}UD_Japanese-GSD/master/ja_gsd-ud-train.conllu",
1272 Path("datasets") / task.value,
1273 )
1275 if task == NLPTask.UD_CHINESE:
1276 cached_path(
1277 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-dev.conllu",
1278 Path("datasets") / task.value,
1279 )
1280 cached_path(
1281 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-test.conllu",
1282 Path("datasets") / task.value,
1283 )
1284 cached_path(
1285 f"{ud_path}UD_Chinese-GSD/master/zh_gsd-ud-train.conllu",
1286 Path("datasets") / task.value,
1287 )
1289 if task == NLPTask.UD_KOREAN:
1290 cached_path(
1291 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-dev.conllu",
1292 Path("datasets") / task.value,
1293 )
1294 cached_path(
1295 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-test.conllu",
1296 Path("datasets") / task.value,
1297 )
1298 cached_path(
1299 f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-train.conllu",
1300 Path("datasets") / task.value,
1301 )
1303 if task == NLPTask.UD_BASQUE:
1304 cached_path(
1305 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-dev.conllu",
1306 Path("datasets") / task.value,
1307 )
1308 cached_path(
1309 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-test.conllu",
1310 Path("datasets") / task.value,
1311 )
1312 cached_path(
1313 f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-train.conllu",
1314 Path("datasets") / task.value,
1315 )
1317 if task.value.startswith("wassa"):
1319 emotion = task.value[6:]
1321 for split in ["train", "dev", "test"]:
1323 data_file = (
1324 flair.cache_root
1325 / "datasets"
1326 / task.value
1327 / f"{emotion}-{split}.txt"
1328 )
1330 if not data_file.is_file():
1332 if split == "train":
1333 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt"
1334 if split == "dev":
1335 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt"
1336 if split == "test":
1337 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt"
1339 path = cached_path(url, Path("datasets") / task.value)
1341 with open(path, "r") as f:
1342 with open(data_file, "w") as out:
1343 next(f)
1344 for line in f:
1345 fields = line.split("\t")
1346 out.write(
1347 f"__label__{fields[3].rstrip()} {fields[1]}\n"
1348 )
1350 os.remove(path)
1352 if task == NLPTask.UD_GERMAN_HDT:
1353 cached_path(
1354 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-dev.conllu",
1355 Path("datasets") / task.value,
1356 )
1357 cached_path(
1358 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-test.conllu",
1359 Path("datasets") / task.value,
1360 )
1361 cached_path(
1362 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-train-a.conllu",
1363 Path("datasets") / task.value / "original",
1364 )
1365 cached_path(
1366 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-train-b.conllu",
1367 Path("datasets") / task.value / "original",
1368 )
1369 data_path = flair.cache_root / "datasets" / task.value
1371 train_filenames = ["de_hdt-ud-train-a.conllu", "de_hdt-ud-train-b.conllu"]
1373 new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu"
1375 if not new_train_file.is_file():
1376 with open(new_train_file, "wt") as f_out:
1377 for train_filename in train_filenames:
1378 with open(
1379 data_path / "original" / train_filename, "rt"
1380 ) as f_in:
1381 f_out.write(f_in.read())