Coverage for flair/flair/datasets/treebanks.py: 15%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from pathlib import Path
4from typing import List, Union
6import flair
7from flair.data import (
8 Sentence,
9 Corpus,
10 Token,
11 FlairDataset,
12)
13from flair.datasets.base import find_train_dev_test_files
14from flair.file_utils import cached_path, unzip_file
16log = logging.getLogger("flair")
class UniversalDependenciesCorpus(Corpus):
    """Corpus whose train, dev and test splits are read from CoNLL-U formatted files."""

    def __init__(
        self,
        data_folder: Union[str, Path],
        train_file=None,
        test_file=None,
        dev_file=None,
        in_memory: bool = True,
        split_multiwords: bool = True,
    ):
        """
        Builds a Corpus out of CoNLL-U column-formatted task data such as the UD corpora.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        :return: a Corpus with annotated train, dev and test data
        """

        # resolve whichever of the three split files were not given explicitly
        dev_file, test_file, train_file = find_train_dev_test_files(
            data_folder, dev_file, test_file, train_file
        )

        def load_split(conll_file):
            # every split is read with the same memory / multiword settings
            return UniversalDependenciesDataset(
                conll_file, in_memory=in_memory, split_multiwords=split_multiwords
            )

        # the train split is always loaded; test and dev only when a file is available
        train = load_split(train_file)
        test = None if test_file is None else load_split(test_file)
        dev = None if dev_file is None else load_split(dev_file)

        super().__init__(train, dev, test, name=str(data_folder))
class UniversalDependenciesDataset(FlairDataset):
    """Dataset of sentences read from a single CoNLL-U formatted file."""

    def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True, split_multiwords: bool = True):
        """
        Instantiates a column dataset in CoNLL-U format.

        :param path_to_conll_file: Path to the CoNLL-U formatted file
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        if isinstance(path_to_conll_file, str):
            path_to_conll_file = Path(path_to_conll_file)
        assert path_to_conll_file.exists()

        self.in_memory: bool = in_memory
        self.split_multiwords: bool = split_multiwords

        self.path_to_conll_file = path_to_conll_file
        self.total_sentence_count: int = 0

        with open(str(self.path_to_conll_file), encoding="utf-8") as file:

            if not self.in_memory:
                # option 1: read only sentence boundaries as offset positions
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                while line:
                    line = line.strip()
                    if line == "":
                        # a blank line closes a sentence: store where that sentence started and
                        # remember the offset right after the blank line as the next start
                        self.indices.append(position)
                        position = file.tell()
                    line = file.readline()

                self.total_sentence_count = len(self.indices)

            else:
                # option 2: keep everything in memory
                self.sentences: List[Sentence] = []

                while True:
                    sentence = self._read_next_sentence(file)
                    # a falsy sentence is taken as "nothing left to read"
                    # (presumably an empty Sentence is falsy - verify against Sentence)
                    if not sentence:
                        break
                    self.sentences.append(sentence)

                self.total_sentence_count = len(self.sentences)

    def is_in_memory(self) -> bool:
        """Returns True if all sentences were parsed and stored at construction time."""
        return self.in_memory

    def __len__(self):
        """Returns the number of sentences counted at construction time."""
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:
        """
        Returns the sentence at the given index.

        :param index: position of the sentence in the file
        :return: the stored sentence if in memory, otherwise a sentence freshly parsed from disk
        """

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else skip to position in file where sentence begins
        else:
            with open(str(self.path_to_conll_file), encoding="utf-8") as file:
                file.seek(self.indices[index])
                sentence = self._read_next_sentence(file)

        return sentence

    def _read_next_sentence(self, file):
        """
        Reads lines from the current position of the file until one sentence is complete.

        :param file: open text file positioned at (or before) the start of a sentence
        :return: the parsed Sentence; it holds no tokens if the end of the file was reached first
        """
        line = file.readline()
        sentence: Sentence = Sentence()

        # current token ID
        token_idx = 0

        # handling for the awful UD multiword format
        current_multiword_text = ''
        current_multiword_sequence = ''
        current_multiword_first_token = 0
        current_multiword_last_token = 0

        while line:
            line = line.strip()
            fields: List[str] = re.split("\t+", line)

            if line == "":
                # a blank line ends the sentence, unless no token was read yet (leading blank lines)
                if len(sentence) > 0:
                    break

            # comments
            elif line.startswith("#"):
                line = file.readline()
                continue

            # ellipsis (IDs containing a '.') is skipped
            elif "." in fields[0]:
                line = file.readline()
                continue

            # if token is a multi-word (ID is a range such as "3-4")
            elif "-" in fields[0]:
                # advance first: in split mode we continue right away; in non-split mode the
                # read at the bottom of the loop then also consumes the first component line
                line = file.readline()

                id_range = fields[0].split('-')
                current_multiword_first_token = int(id_range[0])
                current_multiword_last_token = int(id_range[1])
                current_multiword_text = fields[1]
                current_multiword_sequence = ''

                if self.split_multiwords:
                    continue

                # keep the multiword as one token; only text, lemma and spacing are set for it
                token = Token(fields[1])
                token.add_label("lemma", fields[2])
                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False
                sentence.add_token(token)
                token_idx += 1

            # normal single-word tokens
            else:

                # if we don't split multiwords, skip over component words
                if not self.split_multiwords and token_idx < current_multiword_last_token:
                    token_idx += 1
                    line = file.readline()
                    continue

                # add token
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", fields[2])
                token.add_label("upos", fields[3])
                token.add_label("pos", fields[4])
                token.add_label("dependency", fields[7])

                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False

                # add morphological tags ("Key=Value" pairs separated by '|')
                for morph in fields[5].split("|"):
                    if "=" not in morph:
                        continue
                    morph_parts = morph.split("=")
                    token.add_label(morph_parts[0].lower(), morph_parts[1])

                # the frame label lives in column 11, so at least 12 columns are required
                # (checking only for more than 10 columns raised IndexError on 11-column rows)
                if len(fields) > 11 and fields[10] == "Y":
                    token.add_label("frame", fields[11])

                token_idx += 1

                # derive whitespace logic for multiwords
                if token_idx <= current_multiword_last_token:
                    current_multiword_sequence += token.text

                # if multi-word equals component tokens, there should be no whitespace
                if token_idx == current_multiword_last_token and current_multiword_sequence == current_multiword_text:
                    # the current token is not part of the sentence yet, so counting from the
                    # end reaches exactly the earlier components of this multiword
                    for i in range(current_multiword_last_token - current_multiword_first_token):
                        sentence[-(i + 1)].whitespace_after = False

                sentence.add_token(token)

            line = file.readline()
        return sentence
class UD_ENGLISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_English-EWT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_English-EWT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/en_ewt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_GALICIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Galician-TreeGal treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Galician-TreeGal test and train files and loads them as a Corpus.

        Only a test and a train file are fetched; no dev file is requested here.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/gl_treegal-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ANCIENT_GREEK(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Ancient_Greek-PROIEL treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Ancient_Greek-PROIEL dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/grc_proiel-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_KAZAKH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Kazakh-KTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Kazakh-KTB test and train files and loads them as a Corpus.

        Only a test and a train file are fetched; no dev file is requested here.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/kk_ktb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_OLD_CHURCH_SLAVONIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Old_Church_Slavonic-PROIEL treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Old_Church_Slavonic-PROIEL dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/cu_proiel-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ARMENIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Armenian-ArmTDP treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Armenian-ArmTDP dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # no trailing slash on the base URL: the file name is appended with its own "/",
        # so a trailing slash would put "//" into every download URL
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/hy_armtdp-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ESTONIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Estonian-EDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Estonian-EDT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/et_edt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_GERMAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_German-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_German-GSD dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/de_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_GERMAN_HDT(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_German-HDT treebank (dev branch)."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False, split_multiwords: bool = True):
        """
        Fetches the UD_German-HDT files, merges the four train parts into one file and loads the Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads (default)
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev"
        for split in ("dev", "test"):
            cached_path(f"{ud_path}/de_hdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        # the train data is published in four parts, kept apart in an "original" subfolder
        train_filenames = [
            "de_hdt-ud-train-a-1.conllu",
            "de_hdt-ud-train-a-2.conllu",
            "de_hdt-ud-train-b-1.conllu",
            "de_hdt-ud-train-b-2.conllu",
        ]
        for train_file in train_filenames:
            cached_path(f"{ud_path}/{train_file}", Path("datasets") / dataset_name / "original")

        data_path = flair.cache_root / "datasets" / dataset_name

        new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu"

        # concatenate the parts once; an already existing merged file is reused
        if not new_train_file.is_file():
            # explicit utf-8 on both sides: the dataset reader opens the files as utf-8,
            # while open() without an encoding falls back to the locale's default
            with open(new_train_file, "wt", encoding="utf-8") as f_out:
                for train_filename in train_filenames:
                    with open(data_path / "original" / train_filename, "rt", encoding="utf-8") as f_in:
                        f_out.write(f_in.read())

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_DUTCH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Dutch-Alpino treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Dutch-Alpino dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/nl_alpino-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_FAROESE(UniversalDependenciesCorpus):
    """ This treebank includes the Faroese treebank dataset from the following link:
    https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master

    Faroese is a small Western Scandinavian language with 60.000-100.000 speakers, related to Icelandic and Old Norse"""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Faroese-FarPaHC dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/fo_farpahc-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_FRENCH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_French-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_French-GSD dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/fr_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ITALIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Italian-ISDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Italian-ISDT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/it_isdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_LATIN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Latin-LLCT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Latin-LLCT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # no trailing slash on the base URL: the file name is appended with its own "/",
        # so a trailing slash would put "//" into every download URL
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/la_llct-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_SPANISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Spanish-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Spanish-GSD dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/es_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_PORTUGUESE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Portuguese-Bosque treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Portuguese-Bosque dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/pt_bosque-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ROMANIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Romanian-RRT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Romanian-RRT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ro_rrt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_CATALAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Catalan-AnCora treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Catalan-AnCora dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ca_ancora-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_POLISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Polish-LFG treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Polish-LFG dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/pl_lfg-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_CZECH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Czech-PDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False, split_multiwords: bool = True):
        """
        Fetches the UD_Czech-PDT files, merges the four train parts into one file and loads the Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads (default)
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/master"
        for split in ("dev", "test"):
            cached_path(f"{ud_path}/cs_pdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        # the train data is published in four parts, kept apart in an "original" subfolder
        train_filenames = [
            "cs_pdt-ud-train-c.conllu",
            "cs_pdt-ud-train-l.conllu",
            "cs_pdt-ud-train-m.conllu",
            "cs_pdt-ud-train-v.conllu",
        ]
        for train_filename in train_filenames:
            cached_path(f"{ud_path}/{train_filename}", Path("datasets") / dataset_name / "original")

        data_path = flair.cache_root / "datasets" / dataset_name

        new_train_file: Path = data_path / "cs_pdt-ud-train-all.conllu"

        # concatenate the parts once; an already existing merged file is reused
        if not new_train_file.is_file():
            # explicit utf-8 on both sides: the dataset reader opens the files as utf-8,
            # while open() without an encoding falls back to the locale's default
            with open(new_train_file, "wt", encoding="utf-8") as f_out:
                for train_filename in train_filenames:
                    with open(data_path / "original" / train_filename, "rt", encoding="utf-8") as f_in:
                        f_out.write(f_in.read())

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_SLOVAK(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Slovak-SNK treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Slovak-SNK dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/sk_snk-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_SWEDISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Swedish-Talbanken treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Swedish-Talbanken dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/sv_talbanken-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_DANISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Danish-DDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Danish-DDT dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/da_ddt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_NORWEGIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Norwegian-Bokmaal treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the UD_Norwegian-Bokmaal dev, test and train files and loads them as a Corpus.

        :param base_path: folder that contains the dataset folder; defaults to the "datasets" folder in the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance also accepts str subclasses, unlike an exact type comparison
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache folder passed here does not depend on base_path - confirm for custom base paths
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/no_bokmaal-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_FINNISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Finnish-TDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Finnish-TDT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/fi_tdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_SLOVENIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Slovenian-SSJ treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Slovenian-SSJ via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/sl_ssj-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_CROATIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Croatian-SET treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Croatian-SET via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/hr_set-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_SERBIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Serbian-SET treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Serbian-SET via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/sr_set-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_BULGARIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Bulgarian-BTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Bulgarian-BTB via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/bg_btb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_ARABIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Arabic-PADT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Arabic-PADT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ar_padt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_HEBREW(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Hebrew-HTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Hebrew-HTB via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/he_htb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_TURKISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Turkish-IMST treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Turkish-IMST via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/tr_imst-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_PERSIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Persian-Seraji treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Persian-Seraji via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/fa_seraji-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_RUSSIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Russian-SynTagRus treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Russian-SynTagRus via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ru_syntagrus-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_HINDI(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Hindi-HDTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Hindi-HDTB via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/hi_hdtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_INDONESIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Indonesian-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Indonesian-GSD via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/id_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_JAPANESE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Japanese-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Japanese-GSD via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ja_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_CHINESE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Chinese-GSD treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Chinese-GSD via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/zh_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_KOREAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Korean-Kaist treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Korean-Kaist via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/ko_kaist-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_BASQUE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Basque-BDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Basque-BDT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{ud_path}/eu_bdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_CHINESE_KYOTO(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Classical_Chinese-Kyoto treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Classical_Chinese-Kyoto via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lzh_kyoto-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_GREEK(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Greek-GDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Greek-GDT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/el_gdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_NAIJA(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Naija-NSC treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Naija-NSC via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary; web_path has no trailing slash, so exactly one
        # "/" separates it from the file name (the URLs previously contained "//")
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/pcm_nsc-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_LIVVI(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Livvi-KKPP treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the test and train CoNLL-U files of UD_Livvi-KKPP via cached_path and builds the corpus.
        No dev file is downloaded for this treebank.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/olo_kkpp-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_BURYAT(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Buryat-BDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the test and train CoNLL-U files of UD_Buryat-BDT via cached_path and builds the corpus.
        No dev file is downloaded for this treebank.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/bxr_bdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_NORTH_SAMI(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_North_Sami-Giella treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the test and train CoNLL-U files of UD_North_Sami-Giella via cached_path and builds the corpus.
        No dev file is downloaded for this treebank.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/sme_giella-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_MARATHI(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Marathi-UFAL treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Marathi-UFAL via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/mr_ufal-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_MALTESE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Maltese-MUDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Maltese-MUDT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/mt_mudt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_AFRIKAANS(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Afrikaans-AfriBooms treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Afrikaans-AfriBooms via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/af_afribooms-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_GOTHIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Gothic-PROIEL treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Gothic-PROIEL via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/got_proiel-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_OLD_FRENCH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Old_French-SRCMF treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Old_French-SRCMF via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/fro_srcmf-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_WOLOF(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Wolof-WTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Wolof-WTB via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/wo_wtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_BELARUSIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Belarusian-HSE treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Belarusian-HSE via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/be_hse-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_COPTIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Coptic-Scriptorium treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Coptic-Scriptorium via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/cop_scriptorium-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_IRISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Irish-IDT treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Irish-IDT via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; Path(...) makes the join work
        # whether flair.cache_root is a Path or a str
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ga_idt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_LATVIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Latvian-LVTB treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Latvian-LVTB via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lv_lvtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
class UD_LITHUANIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Lithuanian-ALKSNIS treebank."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Lithuanian-ALKSNIS via cached_path and builds the corpus.

        :param base_path: folder that contains the dataset folder; defaults to "datasets" under the flair cache root
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also converts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lt_alksnis-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)