import logging
import os
import re
import shutil
from pathlib import Path
from typing import Union, Dict, List, Optional

from torch.utils.data import ConcatDataset

import flair
from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token
from flair.datasets.base import find_train_dev_test_files
from flair.file_utils import cached_path, unpack_file

log = logging.getLogger("flair")


class MultiFileColumnCorpus(Corpus):
    def __init__(
        self,
        column_format: Dict[int, str],
        train_files=None,
        test_files=None,
        dev_files=None,
        tag_to_bioes=None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        encoding: str = "utf-8",
        document_separator_token: str = None,
        skip_first_line: bool = False,
        in_memory: bool = True,
        label_name_map: Dict[str, str] = None,
        banned_sentences: List[str] = None,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
        :param column_format: a map specifying the column format
        :param train_files: the names of the train files
        :param test_files: the names of the test files
        :param dev_files: the names of the dev files; if empty, dev data is sampled from train
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite for instance with "\t"
        to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param encoding: the encoding of the data files (default "utf-8")
        :param document_separator_token: If provided, sentences that function as document boundaries are so marked
        :param skip_first_line: set to True if your dataset has a header line
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param label_name_map: Optionally map tag names to a different schema.
        :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is True
        :return: a Corpus with annotated train, dev and test data
        """
        # get train data
        train = ConcatDataset([
            ColumnDataset(
                train_file,
                column_format,
                tag_to_bioes,
                encoding=encoding,
                comment_symbol=comment_symbol,
                column_delimiter=column_delimiter,
                banned_sentences=banned_sentences,
                in_memory=in_memory,
                document_separator_token=document_separator_token,
                skip_first_line=skip_first_line,
                label_name_map=label_name_map,
            ) for train_file in train_files
        ]) if train_files and train_files[0] else None

        # read in test file if it exists
        test = ConcatDataset([
            ColumnDataset(
                test_file,
                column_format,
                tag_to_bioes,
                encoding=encoding,
                comment_symbol=comment_symbol,
                column_delimiter=column_delimiter,
                banned_sentences=banned_sentences,
                in_memory=in_memory,
                document_separator_token=document_separator_token,
                skip_first_line=skip_first_line,
                label_name_map=label_name_map,
            ) for test_file in test_files
        ]) if test_files and test_files[0] else None

        # read in dev file if it exists
        dev = ConcatDataset([
            ColumnDataset(
                dev_file,
                column_format,
                tag_to_bioes,
                encoding=encoding,
                comment_symbol=comment_symbol,
                column_delimiter=column_delimiter,
                banned_sentences=banned_sentences,
                in_memory=in_memory,
                document_separator_token=document_separator_token,
                skip_first_line=skip_first_line,
                label_name_map=label_name_map,
            ) for dev_file in dev_files
        ]) if dev_files and dev_files[0] else None

        super(MultiFileColumnCorpus, self).__init__(train, dev, test, **corpusargs)
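
# A minimal usage sketch (comment only, not part of the library): building a
# MultiFileColumnCorpus from several CoNLL-style files at once. The file paths
# and the column layout below are hypothetical.
#
#   corpus = MultiFileColumnCorpus(
#       column_format={0: "text", 1: "ner"},
#       train_files=["data/part1.conll", "data/part2.conll"],
#       dev_files=["data/dev.conll"],
#       test_files=["data/test.conll"],
#   )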


class ColumnCorpus(MultiFileColumnCorpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        autofind_splits: bool = True,
        name: Optional[str] = None,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; if None, dev data is sampled from train
        :param autofind_splits: whether train, test and dev files should be determined automatically
        :param name: name of the resulting corpus; defaults to the path of the data folder
        Further keyword arguments (tag_to_bioes, column_delimiter, comment_symbol,
        document_separator_token, skip_first_line, in_memory, label_name_map,
        banned_sentences) are forwarded to MultiFileColumnCorpus.
        :return: a Corpus with annotated train, dev and test data
        """
        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits)
        super(ColumnCorpus, self).__init__(
            column_format,
            dev_files=[dev_file] if dev_file else [],
            train_files=[train_file] if train_file else [],
            test_files=[test_file] if test_file else [],
            name=name if data_folder is None else str(data_folder),
            **corpusargs,
        )
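
# A minimal usage sketch (comment only; folder and columns are hypothetical):
# ColumnCorpus reads one folder and, with autofind_splits=True, picks up files
# whose names contain "train", "dev" and "test" automatically.
#
#   corpus = ColumnCorpus(
#       Path("resources/my_ner_data"),
#       column_format={0: "text", 1: "pos", 2: "ner"},
#   )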


class ColumnDataset(FlairDataset):
    # special key for space after
    SPACE_AFTER_KEY = "space-after"

    def __init__(
        self,
        path_to_column_file: Union[str, Path],
        column_name_map: Dict[int, str],
        tag_to_bioes: str = None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        banned_sentences: List[str] = None,
        in_memory: bool = True,
        document_separator_token: str = None,
        encoding: str = "utf-8",
        skip_first_line: bool = False,
        label_name_map: Dict[str, str] = None,
    ):
        """
        Instantiates a column dataset (typically used for sequence labeling or word-level prediction).
        :param path_to_column_file: path to the file with the column-formatted data
        :param column_name_map: a map specifying the column format
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite for instance with "\t"
        to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param document_separator_token: If provided, sentences that function as document boundaries are so marked
        :param skip_first_line: set to True if your dataset has a header line
        :param label_name_map: Optionally map tag names to a different schema.
        :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is True
        :return: a dataset with annotated data
        """
        if type(path_to_column_file) is str:
            path_to_column_file = Path(path_to_column_file)
        assert path_to_column_file.exists()
        self.path_to_column_file = path_to_column_file
        self.tag_to_bioes = tag_to_bioes
        self.column_name_map = column_name_map
        self.column_delimiter = column_delimiter
        self.comment_symbol = comment_symbol
        self.document_separator_token = document_separator_token
        self.label_name_map = label_name_map
        self.banned_sentences = banned_sentences

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory

        self.total_sentence_count: int = 0

        # most datasets have the token text in the first column; if not, pass 'text' as column
        self.text_column: int = 0
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column

        # determine encoding of text file
        self.encoding = encoding

        with open(str(self.path_to_column_file), encoding=self.encoding) as file:

            # skip the first line if so configured
            if skip_first_line:
                file.readline()

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                sentence_started = False
                while line:
                    if sentence_started and self.__line_completes_sentence(line):
                        self.indices.append(position)
                        position = file.tell()
                        sentence_started = False

                    elif not line.isspace():
                        sentence_started = True
                    line = file.readline()

                if sentence_started:
                    self.indices.append(position)

                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            if self.in_memory:
                self.sentences: List[Sentence] = []

                # pointer to previous sentence
                previous_sentence = None
                while True:
                    sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))
                    if not sentence:
                        break
                    if self.banned_sentences is not None and any(
                            [d in sentence.to_plain_string() for d in self.banned_sentences]):
                        continue
                    sentence._previous_sentence = previous_sentence
                    sentence._next_sentence = None

                    if previous_sentence:
                        previous_sentence._next_sentence = sentence

                    self.sentences.append(sentence)
                    previous_sentence = sentence

                self.total_sentence_count = len(self.sentences)

    def _read_next_sentence(self, file):
        lines = []
        line = file.readline()
        while line:
            if not line.isspace():
                lines.append(line)

            # if sentence ends, break
            if len(lines) > 0 and self.__line_completes_sentence(line):
                break

            line = file.readline()
        return lines

    def _convert_lines_to_sentence(self, lines):

        sentence: Sentence = Sentence()
        for line in lines:
            # skip comments
            if self.comment_symbol is not None and line.startswith(self.comment_symbol):
                continue

            # if sentence ends, convert and return
            if self.__line_completes_sentence(line):
                if len(sentence) > 0:
                    if self.tag_to_bioes is not None:
                        sentence.convert_tag_scheme(
                            tag_type=self.tag_to_bioes, target_scheme="iobes"
                        )
                    # check if this sentence is a document boundary
                    if sentence.to_original_text() == self.document_separator_token:
                        sentence.is_document_boundary = True
                    return sentence

            # otherwise, this line is a token. parse and add to sentence
            else:
                token = self._parse_token(line)
                sentence.add_token(token)

        # check if this sentence is a document boundary
        if sentence.to_original_text() == self.document_separator_token:
            sentence.is_document_boundary = True

        if self.tag_to_bioes is not None:
            sentence.convert_tag_scheme(
                tag_type=self.tag_to_bioes, target_scheme="iobes"
            )

        if len(sentence) > 0:
            return sentence

    def _parse_token(self, line: str) -> Token:
        fields: List[str] = re.split(self.column_delimiter, line.rstrip())
        token = Token(fields[self.text_column])
        for column in self.column_name_map:
            if len(fields) > column:
                if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY:
                    task = self.column_name_map[column]  # for example 'pos'
                    tag = fields[column]
                    if tag.count("-") >= 1:  # tag with prefix, for example tag='B-OBJ'
                        split_at_first_hyphen = tag.split("-", 1)
                        tagging_format_prefix = split_at_first_hyphen[0]
                        tag_without_tagging_format = split_at_first_hyphen[1]
                        if self.label_name_map and tag_without_tagging_format in self.label_name_map.keys():
                            tag = tagging_format_prefix + "-" + self.label_name_map[tag_without_tagging_format]
                            # for example, transforming 'B-OBJ' to 'B-part-of-speech-object'
                            if self.label_name_map[tag_without_tagging_format] == 'O':
                                tag = 'O'
                    else:  # tag without prefix, for example tag='PPER'
                        if self.label_name_map and tag in self.label_name_map.keys():
                            tag = self.label_name_map[tag]  # for example, transforming 'PPER' to 'person'

                    token.add_label(task, tag)
                if self.column_name_map[column] == self.SPACE_AFTER_KEY and fields[column] == '-':
                    token.whitespace_after = False
        return token
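
    # Worked example (comment only, hypothetical inputs): with
    # column_name_map={0: "text", 1: "ner"} and label_name_map={"OBJ": "object"},
    # the line "Haus B-OBJ" becomes a Token "Haus" labeled "B-object": the "B-"
    # prefix is kept and only the part after the first hyphen is mapped. A
    # prefix-less tag such as "PPER" is mapped directly, and mapping a tag to
    # "O" drops the prefix entirely.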

    def __line_completes_sentence(self, line: str) -> bool:
        sentence_completed = line.isspace() or line == ''
        return sentence_completed

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else skip to position in file where sentence begins
        else:
            with open(str(self.path_to_column_file), encoding=self.encoding) as file:
                file.seek(self.indices[index])
                sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))

        # set sentence context using partials
        sentence._position_in_dataset = (self, index)

        return sentence
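
# A minimal usage sketch (comment only; path and columns are hypothetical):
# a ColumnDataset can also be used standalone, e.g. with in_memory=False to
# read sentences lazily from disk via the stored file offsets.
#
#   dataset = ColumnDataset(
#       Path("data/train.conll"),
#       column_name_map={0: "text", 1: "ner"},
#       in_memory=False,
#   )
#   first_sentence = dataset[0]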


class MultiCoNer(MultiFileColumnCorpus):
    def __init__(
        self,
        task: str = "multi",
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        use_dev_as_test: bool = True,
        **corpusargs,
    ):
        """
        Initialize the MultiCoNer corpus. This is only possible if you've applied for and downloaded it to your machine.
        Apply for the corpus from here https://multiconer.github.io/dataset and unpack the .zip file's content into
        a folder called 'multiconer'. Then set the base_path parameter in the constructor to the path to the
        parent directory where the multiconer folder resides. You can also create the multiconer folder in
        {FLAIR_CACHE_ROOT}/datasets, in which case the base_path can be left empty.
        :param base_path: Path to the parent directory of the 'multiconer' folder on your machine
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param use_dev_as_test: If True, the dev set is used as test set and random training data is sampled for a dev split.
        :param task: either 'multi', 'code-switch', or the language code for one of the mono tasks.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        folders = {
            "bn": "BN-Bangla",
            "de": "DE-German",
            "en": "EN-English",
            "es": "ES-Espanish",
            "fa": "FA-Farsi",
            "hi": "HI-Hindi",
            "ko": "KO-Korean",
            "nl": "NL-Dutch",
            "ru": "RU-Russian",
            "tr": "TR-Turkish",
            "zh": "ZH-Chinese",
        }

        possible_tasks = ["multi", "code-switch"] + list(folders.keys())
        task = task.lower()

        if task not in possible_tasks:
            raise ValueError(f"task has to be one of {possible_tasks}, but is '{task}'")

        # column format
        columns = {0: "text", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: MultiCoNer dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://multiconer.github.io/dataset'
            )
            log.warning("-" * 100)

        if task in ["multi", "code-switch"]:
            # code-switch uses the same training data as multi but provides a different test set.
            # as the test set is not published, those two tasks are the same.
            train_files = list(data_folder.rglob("*_train.conll"))
            dev_files = list(data_folder.rglob("*_dev.conll"))
        else:
            train_files = [data_folder / folders[task] / f"{task}_train.conll"]
            dev_files = [data_folder / folders[task] / f"{task}_dev.conll"]

        if use_dev_as_test:
            test_files = dev_files
            dev_files = []
        else:
            test_files = []

        super(MultiCoNer, self).__init__(
            train_files=train_files,
            dev_files=dev_files,
            test_files=test_files,
            column_format=columns,
            tag_to_bioes=tag_to_bioes,
            comment_symbol="# id ",
            in_memory=in_memory,
            **corpusargs,
        )
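
# A minimal usage sketch (comment only): loading the monolingual German track
# of MultiCoNer, assuming the corpus was applied for and unpacked into
# {FLAIR_CACHE_ROOT}/datasets/multiconer as described in the docstring.
#
#   corpus = MultiCoNer(task="de")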


class CONLL_03(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, .testb, .train
        files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the path to the
        parent directory where the conll_03 folder resides.
        If using entity linking, the conll03 dataset is reduced by about 20 documents, which are not part of the yago dataset.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )
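
# A minimal usage sketch (comment only; base_path is hypothetical): loading
# CoNLL-03 after manually placing eng.train, eng.testa and eng.testb into a
# 'conll_03' folder.
#
#   corpus = CONLL_03(base_path="resources/tasks")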


class CONLL_03_GERMAN(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the respective files in a folder called
        'conll_03_german'. Then set the base_path parameter in the constructor to the path to the parent directory where
        the conll_03_german folder resides.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03_german' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict
        word lemmas, POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03_GERMAN, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )


class CONLL_03_DUTCH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"

        # download files if not present locally
        cached_path(f"{conll_02_path}ned.testa", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.testb", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.train", data_folder / 'raw')

        # we need to slightly modify the original files by adding new lines after document separators
        train_data_file = data_folder / 'train.txt'
        if not train_data_file.is_file():
            self.__offset_docstarts(data_folder / 'raw' / "ned.train", data_folder / 'train.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testa", data_folder / 'dev.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testb", data_folder / 'test.txt')

        super(CONLL_03_DUTCH, self).__init__(
            data_folder,
            columns,
            train_file='train.txt',
            dev_file='dev.txt',
            test_file='test.txt',
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )

    @staticmethod
    def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]):
        with open(file_in, 'r', encoding="latin-1") as f:
            lines = f.readlines()
        with open(file_out, 'w', encoding="latin-1") as f:
            for line in lines:
                f.write(line)
                if line.startswith('-DOCSTART-'):
                    f.write("\n")
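
# Note on CONLL_03_DUTCH.__offset_docstarts above (descriptive comment): it
# rewrites a CoNLL-02 file so that a blank line follows every '-DOCSTART-'
# line. The document marker thereby becomes its own single-token sentence,
# which the reader then flags as a document boundary because
# document_separator_token="-DOCSTART-" is passed to the corpus.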


class CONLL_03_SPANISH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, should not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
        cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)

        super(CONLL_03_SPANISH, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )


class CONLL_2000(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "np",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-2000 corpus for English chunking.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/"
        data_file = flair.cache_root / "datasets" / dataset_name / "train.txt"
        if not data_file.is_file():
            cached_path(
                f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name
            )
            cached_path(
                f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name
            )
            import gzip  # shutil is already imported at module level

            with gzip.open(
                flair.cache_root / "datasets" / dataset_name / "train.txt.gz",
                "rb",
            ) as f_in:
                with open(
                    flair.cache_root / "datasets" / dataset_name / "train.txt",
                    "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)
            with gzip.open(
                flair.cache_root / "datasets" / dataset_name / "test.txt.gz", "rb"
            ) as f_in:
                with open(
                    flair.cache_root / "datasets" / dataset_name / "test.txt",
                    "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)

        super(CONLL_2000, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )
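
# A minimal usage sketch (comment only): CONLL_2000 downloads automatically on
# first use; the default tag_to_bioes='np' targets the chunking task.
#
#   corpus = CONLL_2000()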


class WNUT_17(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        wnut_path = "https://noisy-text.github.io/2017/files/"
        cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name)
        cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name)
        cached_path(
            f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name
        )

        super(WNUT_17, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )


class BIOSCOPE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "tag"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/"
        cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name)

        super(BIOSCOPE, self).__init__(
            data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs,
        )


class NER_ARABIC_ANER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available
        from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar.
        http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp
        The column order is swapped compared to the original corpus.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/"
        # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_ANER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_ARABIC_AQMAR(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed and modified version of the American and Qatari Modeling of Arabic (AQMAR) dataset available
        from http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
        via http://www.cs.cmu.edu/~ark/AQMAR/

        - Modifications from the original dataset: the miscellaneous tags (MIS0, MIS1, MIS2, MIS3) are merged into one tag "MISC", as these categories deviate across the original dataset
        - The 28 original Wikipedia articles are merged into a single file containing the articles in alphabetical order

        The first time you call this constructor it will automatically download the dataset.

        This dataset is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
        Please cite: "Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah A. Smith (2012),
        Recall-Oriented Learning of Named Entities in Arabic Wikipedia. Proceedings of EACL."

        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        aqmar_path = "https://megantosh.s3.eu-central-1.amazonaws.com/AQMAR/"
        # cached_path(f"{aqmar_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{aqmar_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_AQMAR, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_BASQUE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ner_basque_path = "http://ixa2.si.ehu.eus/eiec/"
        data_path = flair.cache_root / "datasets" / dataset_name
        data_file = data_path / "named_ent_eu.train"
        if not data_file.is_file():
            cached_path(
                f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name
            )
            import tarfile  # shutil is already imported at module level

            with tarfile.open(
                flair.cache_root / "datasets" / dataset_name / "eiec_v1.0.tgz",
                "r:gz",
            ) as f_in:
                corpus_files = (
                    "eiec_v1.0/named_ent_eu.train",
                    "eiec_v1.0/named_ent_eu.test",
                )
                for corpus_file in corpus_files:
                    f_in.extract(corpus_file, data_path)
                    shutil.move(f"{data_path}/{corpus_file}", data_path)

        super(NER_BASQUE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )


class NER_CHINESE_WEIBO(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WEIBO_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/"
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name)

        super(NER_CHINESE_WEIBO, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file="weiboNER_2nd_conll_format.train",
            test_file="weiboNER_2nd_conll_format.test",
            dev_file="weiboNER_2nd_conll_format.dev",
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_DANISH_DANE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {1: 'text', 3: 'pos', 9: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        data_path = flair.cache_root / "datasets" / dataset_name
        train_data_file = data_path / "ddt.train.conllu"
        if not train_data_file.is_file():
            temp_file = cached_path(
                'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip',
                Path("datasets") / dataset_name
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, 'r') as zip_file:
                zip_file.extractall(path=data_path)

            # Remove CoNLL-U meta information in the last column
            for part in ['train', 'dev', 'test']:
                lines = []
                data_file = "ddt.{}.conllu".format(part)
                with open(data_path / data_file, 'r') as file:
                    for line in file:
                        # keep comment and blank lines unchanged (without the
                        # continue, they would be appended twice)
                        if line.startswith("#") or line == "\n":
                            lines.append(line)
                            continue
                        lines.append(line.replace("name=", "").replace("|SpaceAfter=No", ""))

                with open(data_path / data_file, 'w') as file:
                    file.writelines(lines)

                print(data_path / data_file)

        super(NER_DANISH_DANE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes,
            in_memory=in_memory, comment_symbol="#",
            **corpusargs,
        )


class NER_ENGLISH_MOVIE_SIMPLE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus)
        in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "engtrain.bio"
        test_file = "engtest.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_SIMPLE, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_MOVIE_COMPLEX(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus)
        in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "trivia10k13train.bio"
        test_file = "trivia10k13test.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_COMPLEX, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus):
    """
    Initialize corpus of SEC filings annotated with English NER tags. See paper "Domain Adaption of Named Entity
    Recognition to Support Credit Risk Assessment" by Alvarado et al., 2015: https://aclanthology.org/U15-1010/
    :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
    to point to a different folder but typically this should not be necessary.
    :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict POS tags
    :param in_memory: If True, keeps dataset in memory giving speedups in training.
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        sec_fillings_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/"
        cached_path(f"{sec_fillings_path}test/FIN3.txt", Path("datasets") / dataset_name)
        cached_path(f"{sec_fillings_path}train/FIN5.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_SEC_FILLINGS, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file='FIN5.txt',
            test_file="FIN3.txt",
            skip_first_line=True,
            **corpusargs,
        )


class NER_ENGLISH_RESTAURANT(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/"
        cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_RESTAURANT, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_STACKOVERFLOW(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the STACKOVERFLOW_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        """
        The datasets are represented in the CoNLL format.
        In this format each line of the dataset has the following layout:
        <word>+"\t"+<NE>+"\t"+<word>+"\t"+<markdown>
        The end of a sentence is marked with an empty line.
        In each line, <NE> represents the human-annotated named entity
        and <markdown> represents the code tags provided by the users who wrote the posts.
        """
        # column format
        columns = {0: "word", 1: "ner", 3: "markdown"}

        # entity mapping
        entity_mapping = {
            "Library_Function": "Function",
            "Function_Name": "Function",
            "Class_Name": "Class",
            "Library_Class": "Class",
            "Organization": "Website",
            "Library_Variable": "Variable",
            "Variable_Name": "Variable",
            "Error_Name": "O",
            "Keyboard_IP": "O",
            "Value": "O",
            "Output_Block": "O",
        }

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/"

        # data validation
        banned_sentences = [
            "code omitted for annotation",
            "omitted for annotation",
            "CODE_BLOCK :",
            "OP_BLOCK :",
            "Question_URL :",
            "Question_ID :",
        ]

        files = ["train", "test", "dev"]

        for file in files:
            questions = 0
            answers = 0

            cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name)
            for line in open(data_folder / (file + ".txt"), mode="r", encoding="utf-8"):
                if line.startswith("Question_ID"):
                    questions += 1

                if line.startswith("Answer_to_Question_ID"):
                    answers += 1
            log.info(f"File {file} has {questions} questions and {answers} answers.")

        super(NER_ENGLISH_STACKOVERFLOW, self).__init__(
            data_folder,
            columns,
            train_file="train.txt",
            test_file="test.txt",
            dev_file="dev.txt",
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            label_name_map=entity_mapping,
            **corpusargs
        )


class NER_ENGLISH_TWITTER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize a dataset called twitter_ner which can be found on the following page:
        https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt.

        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/"
        cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_TWITTER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            train_file="ner.txt",
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_PERSON(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
    ):
        """
        Initialize the PERSON_NER corpus for person names. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_path = "https://raw.githubusercontent.com/das-sudeshna/genid/master/"

        # download files if not present locally
        cached_path(f"{conll_path}conll-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}ieer-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}textbook-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}wiki-g.conll", data_folder / 'raw')

        self.__concatAllFiles(data_folder)

        super(NER_ENGLISH_PERSON, self).__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            train_file='bigFile.conll'
        )

    @staticmethod
    def __concatAllFiles(data_folder):
        arr = os.listdir(data_folder / 'raw')

        with open(data_folder / 'bigFile.conll', 'w') as outfile:
            for fname in arr:
                with open(data_folder / 'raw' / fname) as infile:
                    outfile.write(infile.read())


class NER_ENGLISH_WEBPAGES(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the WEBPAGES_NER corpus introduced in the paper "Design Challenges and Misconceptions in Named Entity
        Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "ner", 5: "text"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name
        import tarfile
        if not os.path.isfile(data_folder / 'webpages_ner.txt'):
            # download the archive
            tar_file = "https://cogcomp.seas.upenn.edu/Data/NERWebpagesColumns.tgz"
            webpages_ner_path = cached_path(tar_file, Path("datasets") / dataset_name)
            tf = tarfile.open(webpages_ner_path)
            tf.extractall(data_folder)
            tf.close()
            outputfile = os.path.abspath(data_folder)

            # merge the files into one, as the archive contains multiple files
            with open(outputfile / data_folder / "webpages_ner.txt", "w+") as outfile:
                for files in os.walk(outputfile):
                    f = files[1]
                    ff = os.listdir(outputfile / data_folder / f[-1])
                    for i, file in enumerate(ff):
                        if file.endswith('.gold'):
                            with open(outputfile / data_folder / f[-1] / file, 'r+', errors='replace') as infile:
                                content = infile.read()
                            outfile.write(content)
                    break

        super(NER_ENGLISH_WEBPAGES, self).__init__(
            data_folder,
            columns,
            train_file='webpages_ner.txt',
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_WNUT_2020(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, since it is the only option for the WNUT corpus.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip"

        for sample in ["train", "test", "dev"]:

            sample_file = data_folder / (sample + ".txt")
            if not sample_file.is_file():

                zip_path = cached_path(
                    f"{github_url}", Path("datasets") / dataset_name
                )

                # unzip the downloaded repo and merge the train, dev and test datasets
                unpack_file(zip_path, data_folder, "zip", False)  # unzipped folder name: WNUT_2020_NER-master

                if sample == "test":
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/")
                else:
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/")
                filenames = os.listdir(file_path)
                with open(data_folder / (sample + '.txt'), 'w') as outfile:
                    for fname in filenames:
                        with open(file_path / fname) as infile:
                            lines = infile.read()
                            outfile.write(lines)

                shutil.rmtree(str(data_folder / "WNUT_2020_NER-master"))  # clean up when done

        super(NER_ENGLISH_WNUT_2020, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )
1609class NER_ENGLISH_WIKIGOLD(ColumnCorpus):
1610 def __init__(
1611 self,
1612 base_path: Union[str, Path] = None,
1613 tag_to_bioes: str = "ner",
1614 in_memory: bool = True,
1615 document_as_sequence: bool = False,
1616 **corpusargs,
1617 ):
1618 """
1619 Initialize the wikigold corpus. The first time you call this constructor it will automatically
1620 download the dataset.
1621 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1622 to point to a different folder but typically this should not be necessary.
1623 :param tag_to_bioes: NER by default, should not be changed
1624 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1625 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
1626 """
1627 if type(base_path) == str:
1628 base_path: Path = Path(base_path)
1630 # column format
1631 columns = {0: "text", 1: "ner"}
1633 # this dataset name
1634 dataset_name = self.__class__.__name__.lower()
1636 # default dataset folder is the cache root
1637 if not base_path:
1638 base_path = flair.cache_root / "datasets"
1639 data_folder = base_path / dataset_name
1641 # download data if necessary
1642 wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/"
1643 cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name)
1645 super(NER_ENGLISH_WIKIGOLD, self).__init__(
1646 data_folder,
1647 columns,
1648 tag_to_bioes=tag_to_bioes,
1649 encoding="utf-8",
1650 in_memory=in_memory,
1651 train_file='wikigold.conll.txt',
1652 document_separator_token=None if not document_as_sequence else "-DOCSTART-",
1653 **corpusargs,
1654 )
1657class NER_FINNISH(ColumnCorpus):
1658 def __init__(
1659 self,
1660 base_path: Union[str, Path] = None,
1661 tag_to_bioes: str = "ner",
1662 in_memory: bool = True,
1663 **corpusargs,
1664 ):
1665 if type(base_path) == str:
1666 base_path: Path = Path(base_path)
1668 # column format
1669 columns = {0: "text", 1: "ner"}
1671 # this dataset name
1672 dataset_name = self.__class__.__name__.lower()
1674 # default dataset folder is the cache root
1675 if not base_path:
1676 base_path = flair.cache_root / "datasets"
1677 data_folder = base_path / dataset_name
1679 # download data if necessary
1680 ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday."
1681 cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name)
1682 cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name)
1683 cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name)
1685 self._remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv"))
1687 super(NER_FINNISH, self).__init__(
1688 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs,
1689 )
1691 def _remove_lines_without_annotations(self, data_file: Union[str, Path] = None):
1692 with open(data_file, 'r') as f:
1693 lines = f.readlines()
1694 with open(data_file, 'w') as f:
1695 for line in lines:
1696 if len(line.split()) != 1:
1697 f.write(line)
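# Illustration of _remove_lines_without_annotations (hypothetical input lines):
# a line that splits into exactly one field carries a token without a tag and
# is dropped; token+tag lines and empty sentence-separator lines are kept.
#
#     "Nokia\tB-ORG\n"  ->  kept    (two fields)
#     "Nokia\n"         ->  dropped (one field)
#     "\n"              ->  kept    (sentence boundary)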
1700class NER_GERMAN_BIOFID(ColumnCorpus):
1701 def __init__(
1702 self,
1703 base_path: Union[str, Path] = None,
1704 tag_to_bioes: str = "ner",
1705 in_memory: bool = True,
1706 **corpusargs,
1707 ):
1708 if type(base_path) == str:
1709 base_path: Path = Path(base_path)
1711 # column format
1712 columns = {0: "text", 1: "lemma", 2: "pos", 3: "ner"}
1714 # this dataset name
1715 dataset_name = self.__class__.__name__.lower()
1717 # default dataset folder is the cache root
1718 if not base_path:
1719 base_path = flair.cache_root / "datasets"
1720 data_folder = base_path / dataset_name
1722 # download data if necessary
1723 biofid_path = "https://raw.githubusercontent.com/texttechnologylab/BIOfid/master/BIOfid-Dataset-NER/"
1724 cached_path(f"{biofid_path}train.conll", Path("datasets") / dataset_name)
1725 cached_path(f"{biofid_path}dev.conll", Path("datasets") / dataset_name)
1726 cached_path(f"{biofid_path}test.conll", Path("datasets") / dataset_name)
1728 super(NER_GERMAN_BIOFID, self).__init__(
1729 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
1730 )
1733class NER_GERMAN_EUROPARL(ColumnCorpus):
1734 def __init__(
1735 self,
1736 base_path: Union[str, Path] = None,
1737 tag_to_bioes: str = "ner",
1738 in_memory: bool = True,
1739 **corpusargs,
1740 ):
1741 """
1742 Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically
1743 download the dataset.
1744 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1745 to point to a different folder but typically this should not be necessary.
1746 :param tag_to_bioes: 'ner' by default, should not be changed.
1747 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
1749 """
1751 if type(base_path) == str:
1752 base_path: Path = Path(base_path)
1754 # column format
1755 columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}
1757 # this dataset name
1758 dataset_name = self.__class__.__name__.lower()
1760 # default dataset folder is the cache root
1761 if not base_path:
1762 base_path = flair.cache_root / "datasets"
1763 data_folder = base_path / dataset_name
1765 # download data if necessary
1766 europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/"
1767 cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name)
1768 cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name)
1770 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4)
1771 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4)
1773 super(NER_GERMAN_EUROPARL, self).__init__(
1774 data_folder,
1775 columns,
1776 tag_to_bioes=tag_to_bioes,
1777 encoding="latin-1",
1778 in_memory=in_memory,
1779 train_file='ep-96-04-16.conll',
1780 test_file='ep-96-04-15.conll',
1781 **corpusargs,
1782 )
1784 def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1):
1785 """
1786 Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead
1787 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
1788 the letter 'O'. Additionally it removes lines with no tags in the data file and can also
1789 be used if the data is only partially IOB tagged.
1790 Parameters
1791 ----------
1792 data_file : Union[str, Path]
1793 Path to the data file.
1794 encoding : str, optional
1795 Encoding used in open function. The default is "utf8".
1796 ner_column : int, optional
1797 Specifies the ner-tagged column. The default is 1 (the second column).
1799 """
1801 def add_I_prefix(current_line: List[str], ner: int, tag: str):
1802 for i in range(len(current_line)):
1803 if i == 0:
1804 f.write(current_line[i])
1805 elif i == ner:
1806 f.write(' I-' + tag)
1807 else:
1808 f.write(' ' + current_line[i])
1809 f.write('\n')
1811 with open(file=data_file, mode='r', encoding=encoding) as f:
1812 lines = f.readlines()
1813 with open(file=data_file, mode='w', encoding=encoding) as f:
1814 pred = 'O' # remembers the ner tag of the preceding line
1815 for line in lines:
1816 line_list = line.split()
1817 if len(line_list) > 2: # word with tags
1818 ner_tag = line_list[ner_column]
1819 if ner_tag in ['0', 'O']: # no chunk
1820 for i in range(0, len(line_list)):
1821 if i == 0:
1822 f.write(line_list[i])
1823 elif i == ner_column:
1824 f.write(' O')
1825 else:
1826 f.write(' ' + line_list[i])
1827 f.write('\n')
1828 pred = 'O'
1829 elif '-' not in ner_tag: # bare chunk name without IOB prefix
1830 # whether this starts a new chunk or continues one, this scheme
1831 # always writes an I- prefix (IOB1-style)
1832 add_I_prefix(line_list, ner_column, ner_tag)
1833 pred = ner_tag
1836 else: # line already has IOB tag (tag contains '-')
1837 f.write(line)
1838 pred = ner_tag.split('-')[1]
1839 elif len(line_list) == 0: # empty line
1840 f.write('\n')
1841 pred = 'O'
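# Illustration of the _add_IOB_tags rewrite rule with ner_column=4 as used above
# (hypothetical input lines): a bare chunk name gets an I- prefix, '0' becomes
# 'O', and lines that already carry an IOB tag pass through unchanged.
#
#     "Brüssel Brüssel NE I-NP LOC"  ->  "Brüssel Brüssel NE I-NP I-LOC"
#     "und und KON I-NP 0"           ->  "und und KON I-NP O"
#
# Note the IOB1-style limitation: because the scheme never emits B-, adjacent
# chunks of the same type are not separated.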
1844class NER_GERMAN_LEGAL(ColumnCorpus):
1845 def __init__(
1846 self,
1847 base_path: Union[str, Path] = None,
1848 tag_to_bioes: str = "ner",
1849 in_memory: bool = True,
1850 **corpusargs,
1851 ):
1852 """
1853 Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically
1854 download the dataset.
1855 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1856 to point to a different folder but typically this should not be necessary.
1857 :param tag_to_bioes: 'ner' by default, should not be changed.
1858 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
1859 """
1861 if type(base_path) == str:
1862 base_path: Path = Path(base_path)
1864 # column format
1865 columns = {0: "text", 1: "ner"}
1867 # this dataset name
1868 dataset_name = self.__class__.__name__.lower()
1870 # default dataset folder is the cache root
1871 if not base_path:
1872 base_path = flair.cache_root / "datasets"
1873 data_folder = base_path / dataset_name
1875 # download data if necessary
1876 ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
1877 cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)
1879 super(NER_GERMAN_LEGAL, self).__init__(
1880 data_folder,
1881 columns,
1882 tag_to_bioes=tag_to_bioes,
1883 in_memory=in_memory,
1884 train_file='ler.conll',
1885 **corpusargs,
1886 )
1889class NER_GERMAN_GERMEVAL(ColumnCorpus):
1890 def __init__(
1891 self,
1892 base_path: Union[str, Path] = None,
1893 tag_to_bioes: str = "ner",
1894 in_memory: bool = True,
1895 **corpusargs,
1896 ):
1897 """
1898 Initialize the GermEval NER corpus for German. The first time you call this constructor it will automatically
1899 download the dataset (via gdown from Google Drive). Alternatively, obtain the corpus manually from
1900 https://sites.google.com/site/germeval2014ner/data, put it into some folder and point the base_path parameter to it.
1901 :param base_path: Default is None, meaning that the corpus gets auto-downloaded. Override to use a manual copy.
1902 :param tag_to_bioes: 'ner' by default, should not be changed.
1903 :param in_memory:If True, keeps dataset in memory giving speedups in training.
1904 """
1905 if type(base_path) == str:
1906 base_path: Path = Path(base_path)
1908 # column format
1909 columns = {1: "text", 2: "ner"}
1911 # this dataset name
1912 dataset_name = self.__class__.__name__.lower()
1914 # default dataset folder is the cache root
1915 if not base_path:
1916 base_path = flair.cache_root / "datasets"
1917 data_folder = base_path / dataset_name
1919 # check if data there
1920 if not data_folder.exists():
1921 # create folder
1922 os.makedirs(data_folder)
1924 # download dataset
1925 import gdown
1926 gdown.download(url="https://drive.google.com/uc?id={}".format("1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P"),
1927 output=str(data_folder / 'train.tsv'))
1928 gdown.download(url="https://drive.google.com/uc?id={}".format("1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH"),
1929 output=str(data_folder / 'test.tsv'))
1930 gdown.download(url="https://drive.google.com/uc?id={}".format("1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm"),
1931 output=str(data_folder / 'dev.tsv'))
1933 super(NER_GERMAN_GERMEVAL, self).__init__(
1934 data_folder,
1935 columns,
1936 tag_to_bioes=tag_to_bioes,
1937 comment_symbol="#",
1938 in_memory=in_memory,
1939 **corpusargs,
1940 )
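# Usage sketch (paths are hypothetical; not part of the original module): the
# corpus auto-downloads on first use, but a manually downloaded copy also works:
#
#     corpus = NER_GERMAN_GERMEVAL()                             # auto-download
#     corpus = NER_GERMAN_GERMEVAL(base_path="/data/germeval")   # manual copy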
1943class NER_GERMAN_POLITICS(ColumnCorpus):
1944 def __init__(
1945 self,
1946 base_path: Union[str, Path] = None,
1947 tag_to_bioes: str = "ner",
1948 column_delimiter: str = r"\s+",
1949 in_memory: bool = True,
1950 **corpusargs,
1951 ):
1952 """
1953 Initialize corpus with Named Entity Model for German, Politics (NEMGP) data from
1954 https://www.thomas-zastrow.de/nlp/. The first time you call this constructor it will automatically download the
1955 dataset.
1956 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1957 to point to a different folder but typically this should not be necessary.
1958 :param tag_to_bioes: NER by default, should not be changed since this corpus only provides NER annotation
1960 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1962 """
1963 if type(base_path) == str:
1964 base_path: Path = Path(base_path)
1966 # column format
1967 columns = {0: "text", 1: "ner"}
1969 # this dataset name
1970 dataset_name = self.__class__.__name__.lower()
1972 # default dataset folder is the cache root
1973 if not base_path:
1974 base_path = flair.cache_root / "datasets"
1975 data_folder = base_path / dataset_name
1977 # download and parse data if necessary
1978 german_politics_path = "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip"
1979 corpus_file_name = "nemgp_trainingdata_01.txt"
1980 parsed_dataset = data_folder / "raw" / corpus_file_name
1982 if not parsed_dataset.exists():
1983 german_politics_zip = cached_path(f"{german_politics_path}", Path("datasets") / dataset_name / "raw")
1984 unpack_file(german_politics_zip, data_folder / "raw", "zip", False)
1985 self._convert_to_column_corpus(parsed_dataset)
1987 # create train test dev if not exist
1988 train_dataset = data_folder / "train.txt"
1989 if not train_dataset.exists():
1990 self._create_datasets(parsed_dataset, data_folder)
1992 super(NER_GERMAN_POLITICS, self).__init__(
1993 data_folder,
1994 columns,
1995 column_delimiter=column_delimiter,
1996 train_file='train.txt',
1997 dev_file='dev.txt',
1998 test_file='test.txt',
1999 tag_to_bioes=tag_to_bioes,
2000 encoding="utf-8",
2001 in_memory=in_memory,
2002 **corpusargs,
2003 )
2005 def _convert_to_column_corpus(self, data_file: Union[str, Path]):
2006 with open(data_file, 'r', encoding='utf-8') as f:
2007 lines = f.readlines()
2008 with open(data_file, 'w', encoding='utf-8') as f:
2009 tag_bool = False
2010 new_sentence = True
2011 for line in lines:
2012 line = re.sub(r'\s{2,}', ' ', line).strip().split(' ')
2013 for substr in line:
2014 if substr == '.':
2015 f.write("\n")
2016 new_sentence = True
2017 elif "<START:" in substr:
2018 tag_bool = True
2019 tag = substr.strip('<START:').strip('>')
2020 if 'loc' in tag:
2021 tag_IOB = '-LOC'
2022 elif 'per' in tag:
2023 tag_IOB = '-PER'
2024 elif 'org' in tag:
2025 tag_IOB = '-ORG'
2026 elif 'misc' in tag:
2027 tag_IOB = '-MISC'
2028 elif "<END>" in substr:
2029 tag_bool = False
2030 new_sentence = True
2031 else:
2032 if tag_bool:
2033 if new_sentence is True:
2034 start = 'B'
2035 new_sentence = False
2036 else:
2037 start = 'I'
2038 f.write(substr.strip(' ') + " " + start + tag_IOB + "\n")
2039 else:
2040 f.write(substr.strip(' ') + " " + 'O' + "\n")
2042 def _create_datasets(self, data_file: Union[str, Path], data_folder: Union[str, Path]):
2043 with open(data_file, 'r') as file:
2044 num_lines = len(file.readlines())
2045 file.seek(0)
2047 train_len = round(num_lines * 0.8)
2048 test_len = round(num_lines * 0.1)
2049 dev_len = num_lines - train_len - test_len
2051 train = open(data_folder / "train.txt", "w")
2052 test = open(data_folder / "test.txt", "w")
2053 dev = open(data_folder / "dev.txt", "w")
2055 k = 0
2056 for line in file.readlines():
2057 k += 1
2058 if k <= train_len:
2059 train.write(line)
2060 elif k > train_len and k <= (train_len + test_len):
2061 test.write(line)
2062 elif k > (train_len + test_len) and k <= num_lines:
2063 dev.write(line)
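# Illustration of _convert_to_column_corpus (hypothetical input): the NEMGP
# inline markup
#
#     "... <START:person> Angela Merkel <END> ..."
#
# is rewritten one token per line with IOB tags:
#
#     Angela B-PER
#     Merkel I-PER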
2066class NER_HUNGARIAN(ColumnCorpus):
2067 def __init__(
2068 self,
2069 base_path: Union[str, Path] = None,
2070 tag_to_bioes: str = "ner",
2071 in_memory: bool = True,
2072 document_as_sequence: bool = False,
2073 **corpusargs,
2074 ):
2075 """
2076 Initialize the NER Business corpus for Hungarian. The first time you call this constructor it will automatically
2077 download the dataset.
2078 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2079 to point to a different folder but typically this should not be necessary.
2080 :param tag_to_bioes: NER by default, should not be changed since this corpus only provides NER annotation
2082 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2083 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
2084 """
2085 if type(base_path) == str:
2086 base_path: Path = Path(base_path)
2088 # column format
2089 columns = {0: "text", 1: "ner"}
2091 # this dataset name
2092 dataset_name = self.__class__.__name__.lower()
2094 # default dataset folder is the cache root
2095 if not base_path:
2096 base_path = flair.cache_root / "datasets"
2097 data_folder = base_path / dataset_name
2099 # If the extracted corpus file is not yet present in dir
2100 if not os.path.isfile(data_folder / 'hun_ner_corpus.txt'):
2101 # download zip if necessary
2102 hun_ner_path = "https://rgai.sed.hu/sites/rgai.sed.hu/files/business_NER.zip"
2103 path_to_zipped_corpus = cached_path(hun_ner_path, Path("datasets") / dataset_name)
2104 # extracted corpus is not present, so unpack it
2105 unpack_file(
2106 path_to_zipped_corpus,
2107 data_folder,
2108 mode="zip",
2109 keep=True
2110 )
2112 super(NER_HUNGARIAN, self).__init__(
2113 data_folder,
2114 columns,
2115 train_file='hun_ner_corpus.txt',
2116 column_delimiter='\t',
2117 tag_to_bioes=tag_to_bioes,
2118 encoding="latin-1",
2119 in_memory=in_memory,
2120 label_name_map={'0': 'O'},
2121 document_separator_token=None if not document_as_sequence else "-DOCSTART-",
2122 **corpusargs,
2123 )
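# Note on label_name_map above: the raw corpus marks non-entity tokens with the
# character '0', which the map rewrites to the 'O' tag expected by the IOB/BIOES
# schemes. Usage sketch (downloads on first use; not part of the original module):
#
#     corpus = NER_HUNGARIAN()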
2126class NER_ICELANDIC(ColumnCorpus):
2127 def __init__(
2128 self,
2129 base_path: Union[str, Path] = None,
2130 tag_to_bioes: str = "ner",
2131 in_memory: bool = True,
2132 **corpusargs,
2133 ):
2134 """
2135 Initialize the ICELANDIC_NER corpus. The first time you call this constructor it will automatically
2136 download the dataset.
2137 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2138 to point to a different folder but typically this should not be necessary.
2139 :param tag_to_bioes: NER by default, should not be changed since this corpus only provides NER annotation
2141 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2143 """
2144 if type(base_path) == str:
2145 base_path: Path = Path(base_path)
2147 # column format
2148 columns = {0: "text", 1: "ner"}
2150 # this dataset name
2151 dataset_name = self.__class__.__name__.lower()
2153 # default dataset folder is the cache root
2154 if not base_path:
2155 base_path = flair.cache_root / "datasets"
2156 data_folder = base_path / dataset_name
2158 if not os.path.isfile(data_folder / 'icelandic_ner.txt'):
2159 # download zip
2160 icelandic_ner = "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip"
2161 icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name)
2163 # unpacking the zip
2164 unpack_file(
2165 icelandic_ner_path,
2166 data_folder,
2167 mode="zip",
2168 keep=True
2169 )
2172 # the zip contains multiple .txt files; merge them into a single corpus file
2174 with open(data_folder / "icelandic_ner.txt", "wb") as outfile:
2175 for root, _, filenames in os.walk(data_folder):
2178 for filename in sorted(filenames):
2179 if filename.endswith('.txt') and filename != "icelandic_ner.txt":
2180 with open(Path(root) / filename, 'rb') as infile:
2182 outfile.write(infile.read())
2184 super(NER_ICELANDIC, self).__init__(
2185 data_folder,
2186 columns,
2187 train_file='icelandic_ner.txt',
2188 tag_to_bioes=tag_to_bioes,
2189 in_memory=in_memory,
2190 **corpusargs,
2191 )
2194class NER_JAPANESE(ColumnCorpus):
2195 def __init__(
2196 self,
2197 base_path: Union[str, Path] = None,
2198 tag_to_bioes: str = "ner",
2199 in_memory: bool = True,
2200 **corpusargs,
2201 ):
2202 """
2203 Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically
2204 download the dataset.
2205 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2206 to point to a different folder but typically this should not be necessary.
2207 :param tag_to_bioes: NER by default.
2208 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2209 """
2210 if type(base_path) == str:
2211 base_path: Path = Path(base_path)
2213 # column format
2214 columns = {0: 'text', 1: 'ner'}
2216 # this dataset name
2217 dataset_name = self.__class__.__name__.lower()
2219 # default dataset folder is the cache root
2220 if not base_path:
2221 base_path = flair.cache_root / "datasets"
2222 data_folder = base_path / dataset_name
2224 # download data from github if necessary (hironsan.txt, ja.wikipedia.conll)
2225 IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/"
2227 # download files if not present locally
2228 cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw')
2229 cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw')
2231 # we need to modify the original files by adding a blank line after the end of each sentence
2232 train_data_file = data_folder / 'train.txt'
2233 if not train_data_file.is_file():
2234 self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt')
2235 self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt')
2237 super(NER_JAPANESE, self).__init__(
2238 data_folder,
2239 columns,
2240 train_file='train.txt',
2241 tag_to_bioes=tag_to_bioes,
2242 in_memory=in_memory,
2243 **corpusargs,
2244 )
2246 @staticmethod
2247 def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
2248 with open(file_in, 'r') as f:
2249 lines = f.readlines()
2250 with open(file_out, 'a') as f:
2251 for line in lines:
2252 if line[0] == "。": # sentence-final token: keep it and add a blank separator line
2253 f.write(line)
2254 f.write("\n")
2255 elif line[0] == "\n":
2256 continue
2257 else:
2258 f.write(line)
2260 @staticmethod
2261 def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
2262 with open(file_in, 'r') as f:
2263 lines = f.readlines()
2264 with open(file_out, 'a') as f:
2265 for line in lines:
2266 sp_line = line.split("\t")
2267 if sp_line[0] == "\n":
2268 f.write("\n")
2269 else:
2270 f.write(sp_line[0] + "\t" + sp_line[-1])
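# Illustration of the two preparation steps above (hypothetical input lines):
# the Wikipedia file lacks blank lines between sentences, so one is inserted
# after each sentence-final "。" token line; the Wikinews file is reduced from
# several tab-separated columns to token + NER tag:
#
#     "。\tO\n"                   ->  "。\tO\n" followed by "\n"
#     "東京\t名詞\t...\tB-LOC\n"  ->  "東京\tB-LOC\n"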
2273class NER_MASAKHANE(MultiCorpus):
2274 def __init__(
2275 self,
2276 languages: Union[str, List[str]] = "luo",
2277 base_path: Union[str, Path] = None,
2278 tag_to_bioes: str = "ner",
2279 in_memory: bool = True,
2280 **corpusargs,
2281 ):
2282 """
2283 Initialize the Masakhane corpus available on https://github.com/masakhane-io/masakhane-ner/tree/main/data.
2284 It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
2285 with the languages you require. If you pass "all", all languages will be initialized.
2286 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2287 to point to a different folder but typically this should not be necessary.
2288 :param tag_to_bioes: NER by default, should not be changed since this corpus only provides NER annotation
2290 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2291 """
2292 if type(base_path) == str:
2293 base_path: Path = Path(base_path)
2295 # if only one language is given
2296 if type(languages) == str:
2297 languages = [languages]
2299 # column format
2300 columns = {0: "text", 1: "ner"}
2302 # this dataset name
2303 dataset_name = self.__class__.__name__.lower()
2305 # default dataset folder is the cache root
2306 if not base_path:
2307 base_path = flair.cache_root / "datasets"
2308 data_folder = base_path / dataset_name
2310 language_to_code = {"amharic": "amh",
2311 "hausa": "hau",
2312 "igbo": "ibo",
2313 "kinyarwanda": "kin",
2314 "luganda": "lug",
2315 "luo": "luo",
2316 "naija": "pcm",
2317 "swahili": "swa",
2318 "yoruba": "yor",
2319 "wolof": "wol",
2320 }
2322 # use all languages if explicitly set to "all"
2323 if languages == ["all"]: languages = language_to_code.values()
2325 corpora = []
2326 for language in languages:
2328 if language in language_to_code.keys():
2329 language = language_to_code[language]
2331 if language not in language_to_code.values():
2332 log.error(f"Language '{language}' is not in list of supported languages!")
2333 log.error(f"Supported are '{language_to_code.values()}'!")
2334 log.error(f"Instantiate this Corpus for instance like so 'corpus = NER_MASAKHANE(languages='luo')'")
2335 raise Exception()
2337 language_folder = data_folder / language
2339 # download data if necessary
2340 data_path = f"https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/{language}/"
2341 cached_path(f"{data_path}dev.txt", language_folder)
2342 cached_path(f"{data_path}test.txt", language_folder)
2343 cached_path(f"{data_path}train.txt", language_folder)
2345 # initialize ColumnCorpus and add it to the list
2346 log.info(f"Reading data for language {language}")
2347 corp = ColumnCorpus(data_folder=language_folder,
2348 column_format=columns,
2349 tag_to_bioes=tag_to_bioes,
2350 encoding="utf-8",
2351 in_memory=in_memory,
2352 name=language,
2353 **corpusargs,
2354 )
2355 corpora.append(corp)
2357 super(NER_MASAKHANE, self).__init__(
2358 corpora,
2359 name='masakhane-' + '-'.join(languages),
2360 )
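# Usage sketch (not part of the original module): language names and codes are
# both accepted, and "all" loads every supported language:
#
#     corpus = NER_MASAKHANE(languages=["luo", "pcm"])
#     corpus = NER_MASAKHANE(languages="all")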
2363class NER_MULTI_WIKIANN(MultiCorpus):
2364 def __init__(
2365 self,
2366 languages: Union[str, List[str]] = "en",
2367 base_path: Union[str, Path] = None,
2368 tag_to_bioes: str = "ner",
2369 in_memory: bool = False,
2370 **corpusargs,
2371 ):
2372 """
2373 WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist
2374 in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their
2375 respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/)
2376 Parameters
2377 ----------
2378 languages : Union[str, List[str]]
2379 Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations.
2380 The datasets of all passed languages will be saved in one MultiCorpus.
2381 (Note that, even though they are listed on https://elisa-ie.github.io/wikiann/, some datasets are empty.
2382 This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".)
2383 base_path : Union[str, Path], optional
2384 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2385 to point to a different folder but typically this should not be necessary.
2386 tag_to_bioes : str, optional
2387 The data is in BIO format. By default (with the string "ner" as value) it will be transformed
2388 into the BIOES format. If you don't want that, set it to None.
2390 """
2391 if type(languages) == str:
2392 languages = [languages]
2394 if type(base_path) == str:
2395 base_path: Path = Path(base_path)
2397 # column format
2398 columns = {0: "text", 1: "ner"}
2400 # this dataset name
2401 dataset_name = self.__class__.__name__.lower()
2403 # default dataset folder is the cache root
2404 if not base_path:
2405 base_path = flair.cache_root / "datasets"
2406 data_folder = base_path / dataset_name
2408 # For each language, the data file is downloaded if not yet present.
2409 # Then a ColumnCorpus of that data is created and appended to a list,
2410 # which is finally handed to the MultiCorpus.
2412 # list that contains the ColumnCorpus objects
2413 corpora = []
2415 google_drive_path = 'https://drive.google.com/uc?id='
2416 # download data if necessary
2417 first = True
2418 for language in languages:
2420 language_folder = data_folder / language
2421 file_name = 'wikiann-' + language + '.bio'
2423 # if language not downloaded yet, download it
2424 if not language_folder.exists():
2425 if first:
2426 import gdown
2427 import tarfile
2428 first = False
2429 # create folder
2430 os.makedirs(language_folder)
2431 # get google drive id from list
2432 google_id = self._google_drive_id_from_language_name(language)
2433 url = google_drive_path + google_id
2435 # download from google drive
2436 gdown.download(url, str(language_folder / language) + '.tar.gz')
2438 # unzip
2439 log.info("Extracting data...")
2440 tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz")
2441 # tar.extractall(language_folder,members=[tar.getmember(file_name)])
2442 tar.extract(file_name, str(language_folder))
2443 tar.close()
2444 log.info('...done.')
2446 # transform data into required format
2447 # the processed dataset has the additional ending "_new"
2448 log.info("Processing dataset...")
2449 self._silver_standard_to_simple_ner_annotation(str(language_folder / file_name))
2450 # remove the unprocessed dataset
2451 os.remove(str(language_folder / file_name))
2452 log.info('...done.')
2454 # initialize ColumnCorpus and add it to the list
2455 log.info(f"Reading data for language {language}")
2456 corp = ColumnCorpus(data_folder=language_folder,
2457 column_format=columns,
2458 train_file=file_name + '_new',
2459 tag_to_bioes=tag_to_bioes,
2460 in_memory=in_memory,
2461 **corpusargs,
2462 )
2463 corpora.append(corp)
2464 log.info("...done.")
2466 super(NER_MULTI_WIKIANN, self).__init__(
2467 corpora, name='wikiann',
2468 )
2470 def _silver_standard_to_simple_ner_annotation(self, data_file: Union[str, Path]):
2471 with open(data_file, 'r', encoding='utf-8') as f_read, \
2472 open(str(data_file) + '_new', 'w+', encoding='utf-8') as f_write:
2473 for line in f_read:
2476 if line == '\n':
2477 f_write.write(line)
2478 else:
2479 # keep only the token (first column) and the NER tag (last column)
2480 fields = line.split()
2481 f_write.write(fields[0] + ' ' + fields[-1] + '\n')
2486 def _google_drive_id_from_language_name(self, language):
2487 languages_ids = {
2488 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # empty
2489 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf',
2490 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG',
2491 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO',
2492 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0',
2493 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm',
2494 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-',
2495 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet',
2496 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2',
2497 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV',
2498 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS',
2499 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb',
2500 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan',
2501 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE',
2502 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II',
2503 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss',
2504 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf',
2505 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-',
2506 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT',
2507 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3',
2508 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci',
2509 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8',
2510 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794',
2511 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08',
2512 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH',
2513 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF',
2514 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh',
2515 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE',
2516 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ',
2517 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR',
2518 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm',
2519 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm',
2520 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2',
2521 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H',
2522 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk',
2523 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam',
2524 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum',
2525 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY',
2526 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI',
2527 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw',
2528 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu',
2529 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk',
2530 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_',
2531 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # empty
2532 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae',
2533 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB',
2534 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV',
2535 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn',
2536 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6',
2537 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE',
2538 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk',
2539 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS',
2540 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7',
2541 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3',
2542 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1',
2543 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8',
2544 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb',
2545 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm',
2546 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY',
2547 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi',
2548 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV',
2549 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz',
2550 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM',
2551 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd',
2552 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-',
2553 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK',
2554 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU',
2555 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy',
2556 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs',
2557 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN',
2558 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX',
2559 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry',
2560 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa',
2561 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR',
2562 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d',
2563 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J',
2564 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn',
2565 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ',
2566 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV',
2567 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY',
2568 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge',
2569 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB',
2570 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a',
2571 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr',
2572 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz',
2573 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE',
2574 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c',
2575 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ',
2576 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs',
2577 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj',
2578 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd',
2579 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm',
2580 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC',
2581 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z',
2582 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7',
2583 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX',
2584 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k',
2585 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV',
2586 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # empty
2587 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif',
2588 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd',
2589 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk',
2590 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J',
2591 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo',
2592 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # empty
2593 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM',
2594 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf',
2595 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-',
2596 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86',
2597 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # empty
2598 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny',
2599 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3',
2600 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp',
2601 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB',
2602 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_',
2603 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX',
2604 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r',
2605 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # empty
2606 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD',
2607 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE',
2608 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD',
2609 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY',
2610 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs',
2611 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7',
2612 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx',
2613 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8',
2614 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # empty
2615 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS',
2616 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf',
2617 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD',
2618 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ',
2619 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm',
2620 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj',
2621 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # empty
2622 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y',
2623 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X',
2624 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt',
2625 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye',
2626 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ',
2627 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW',
2628 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT',
2629 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw',
2630 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08',
2631 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_',
2632 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-',
2633 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR',
2634 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th',
2635 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj',
2636 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW',
2637 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O',
2638 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD',
2639 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3',
2640 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e',
2641 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY',
2642 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD',
2643 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_',
2644 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH',
2645 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V',
2646 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2',
2647 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL',
2648 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT',
2649 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS',
2650 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ',
2651 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_',
2652 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS',
2653 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y',
2654 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp',
2655 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U',
2656 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI',
2657 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN',
2658 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_',
2659 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0',
2660 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # empty
2661 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu',
2662 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM',
2663 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB',
2664 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ',
2665 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz',
2666 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H',
2667 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB',
2668 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye',
2669 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr',
2670 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS',
2671 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH',
2672 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R',
2673 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd',
2674 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv',
2675 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2',
2676 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd',
2677 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ',
2678 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK',
2679 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g',
2680 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG',
2681 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H',
2682 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # empty
2683 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX',
2684 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS',
2685 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij',
2686 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu',
2687 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF',
2688 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC',
2689 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ',
2690 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx',
2691 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw',
2692 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn',
2693 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu',
2694 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF',
2695 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9',
2696 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA',
2697 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb',
2698 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_',
2699 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9',
2700 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY',
2701 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE',
2702 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-',
2703 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF',
2704 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE',
2705 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu',
2706 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy',
2707 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3',
2708 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86',
2709 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK',
2710 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP',
2711 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik',
2712 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR',
2713 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA',
2714 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S',
2715 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR',
2716 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR',
2717 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP',
2718 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_',
2719 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS',
2720 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7',
2721 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq',
2722 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL',
2723 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO',
2724 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t',
2725 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve',
2726 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6',
2727 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX',
2728 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz',
2729 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M',
2730 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp',
2731 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO',
2732 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v',
2733 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO',
2734 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz',
2735 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs',
2736 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW',
2737 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH',
2738 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # empty
2739 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN',
2740 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt',
2741 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w',
2742 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW',
2743 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU',
2744 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD',
2745 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ',
2746 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL',
2747 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT',
2748 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu',
2749 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX',
2750 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl',
2751 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius',
2752 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus',
2753 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne',
2754 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h',
2755 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM',
2756 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb',
2757 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw',
2758 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh',
2759 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk',
2760 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l',
2761 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to',
2762 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0',
2763 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85',
2764 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f',
2765 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp',
2766 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN',
2767 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ',
2768 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd',
2769 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo',
2770 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6',
2771 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt',
2772 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3',
2773 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr',
2774 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu',
2775 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY',
2776 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB',
2777 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ',
2778 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl',
2779 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp',
2780 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE',
2781 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f',
2782 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W'
2783 }
2784 return languages_ids[language]
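# Usage sketch (not part of the original module; downloads can be large, and
# in_memory defaults to False here so data is streamed from disk):
#
#     corpus = NER_MULTI_WIKIANN(languages=["en", "de"])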
2787class NER_MULTI_XTREME(MultiCorpus):
2788 def __init__(
2789 self,
2790 languages: Union[str, List[str]] = "en",
2791 base_path: Union[str, Path] = None,
2792 tag_to_bioes: str = "ner",
2793 in_memory: bool = False,
2794 **corpusargs,
2795 ):
2796 """
2797 XTREME corpus for cross-lingual NER consisting of datasets in a total of 176 languages. The data comes from the Google
2798 research work XTREME https://github.com/google-research/xtreme. All datasets for NER and the respective language abbreviations (e.g.
2799 "en" for English) can be found here: https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1
2800 The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/)
2802 Parameters
2803 ----------
2804 languages : Union[str, List[str]], optional
2805 By default, the 40 languages that are used in XTREME are loaded. Otherwise one can hand over a string or a list of strings
2806 consisting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object.
2807 base_path : Union[str, Path], optional
2808 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2809 to point to a different folder but typically this should not be necessary.
2810 tag_to_bioes : str, optional
2811 The data is in BIO format. By default (with the string "ner" as value) it will be transformed
2812 into the BIOES format. If you don't want that, set it to None.
2814 """
2815 # if no languages are given as argument all languages used in XTREME will be loaded
2816 if not languages:
2817 languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu",
2818 "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta",
2819 "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"]
2821 # if only one language is given
2822 if type(languages) == str:
2823 languages = [languages]
2825 if type(base_path) == str:
2826 base_path: Path = Path(base_path)
2828 # column format
2829 columns = {0: "text", 1: "ner"}
2831 # this dataset name
2832 dataset_name = self.__class__.__name__.lower()
2834 # default dataset folder is the cache root
2835 if not base_path:
2836 base_path = flair.cache_root / "datasets"
2837 data_folder = base_path / dataset_name
2839 # For each language, the data file is downloaded if not yet present.
2840 # Then a ColumnCorpus of that data is created and appended to a list,
2841 # which is finally handed to the MultiCorpus.
2843 # list that contains the ColumnCorpus objects
2844 corpora = []
2846 hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset"
2848 # download data if necessary
2849 for language in languages:
2851 language_folder = data_folder / language
2853 # if language not downloaded yet, download it
2854 if not language_folder.exists():
2856 file_name = language + '.tar.gz'
2857 # create folder
2858 os.makedirs(language_folder)
2860 # download from HU Server
2861 temp_file = cached_path(
2862 hu_path + "/" + file_name,
2863 Path("datasets") / dataset_name / language
2864 )
2866 # unzip
2867 log.info("Extracting data...")
2868 import tarfile
2869 tar = tarfile.open(str(temp_file), "r:gz")
2870 for part in ["train", "test", "dev"]:
2871 tar.extract(part, str(language_folder))
2872 tar.close()
2873 log.info('...done.')
2875 # transform data into required format
2876 log.info("Processing dataset...")
2877 for part in ["train", "test", "dev"]:
2878 self._xtreme_to_simple_ner_annotation(str(language_folder / part))
2879 log.info('...done.')
2881 # initialize ColumnCorpus and add it to the list
2882 log.info(f"Reading data for language {language}")
2883 corp = ColumnCorpus(data_folder=language_folder,
2884 column_format=columns,
2885 tag_to_bioes=tag_to_bioes,
2886 in_memory=in_memory,
2887 **corpusargs,
2888 )
2889 corpora.append(corp)
2891 super(NER_MULTI_XTREME, self).__init__(
2892 corpora, name='xtreme',
2893 )
2895 def _xtreme_to_simple_ner_annotation(self, data_file: Union[str, Path]):
2896 with open(data_file, 'r', encoding='utf-8') as f:
2897 lines = f.readlines()
2898 with open(data_file, 'w', encoding='utf-8') as f:
2899 for line in lines:
2900 if line == '\n':
2901 f.write(line)
2902 else:
2903 fields = line.split()
2904 f.write(fields[0].split(':', 1)[1] + ' ' + fields[1] + '\n')
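# Illustration of _xtreme_to_simple_ner_annotation (hypothetical input line):
# in the raw data every token is prefixed with its language code, which is
# stripped off:
#
#     "en:Obama B-PER"  ->  "Obama B-PER"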
2907class NER_MULTI_WIKINER(MultiCorpus):
2908 def __init__(
2909 self,
2910 languages: Union[str, List[str]] = "en",
2911 base_path: Union[str, Path] = None,
2912 tag_to_bioes: str = "ner",
2913 in_memory: bool = False,
2914 **corpusargs,
2915 ):
2916 if type(base_path) == str:
2917 base_path: Path = Path(base_path)
2919 # if only one language is given
2920 if type(languages) == str:
2921 languages = [languages]
2923 # column format
2924 columns = {0: "text", 1: "pos", 2: "ner"}
2926 # this dataset name
2927 dataset_name = self.__class__.__name__.lower()
2929 # default dataset folder is the cache root
2930 if not base_path:
2931 base_path = flair.cache_root / "datasets"
2932 data_folder = base_path / dataset_name
2934 corpora = []
2935 for language in languages:
2936 language_folder = data_folder / language
2938 # download data if necessary
2939 self._download_wikiner(language, language_folder)
2941 # initialize ColumnCorpus and add it to the list
2942 log.info(f"Read data for language {language}")
2943 corp = ColumnCorpus(data_folder=language_folder,
2944 column_format=columns,
2945 tag_to_bioes=tag_to_bioes,
2946 in_memory=in_memory,
2947 **corpusargs,
2948 )
2949 corpora.append(corp)
2951 super(NER_MULTI_WIKINER, self).__init__(
2952 corpora, name='wikiner',
2953 )
2955 def _download_wikiner(self, language_code: str, dataset_name: Union[str, Path]):
2956 # download data if necessary
2957 wikiner_path = (
2958 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/"
2959 )
2960 lc = language_code
2962 data_file = (
2963 flair.cache_root
2964 / "datasets"
2965 / dataset_name
2966 / f"aij-wikiner-{lc}-wp3.train"
2967 )
2968 if not data_file.is_file():
2970 cached_path(
2971 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name
2972 )
2973 import bz2 # shutil is already imported at module level
2975 # unpack and write out in CoNLL column-like format
2976 bz_file = bz2.BZ2File(
2977 flair.cache_root
2978 / "datasets"
2979 / dataset_name
2980 / f"aij-wikiner-{lc}-wp3.bz2",
2981 "rb",
2982 )
2983 with bz_file as f, open(
2984 flair.cache_root
2985 / "datasets"
2986 / dataset_name
2987 / f"aij-wikiner-{lc}-wp3.train",
2988 "w",
2989 encoding="utf-8"
2990 ) as out:
2991 for line in f:
2992 line = line.decode("utf-8")
2993 words = line.split(" ")
2994 for word in words:
2995 out.write("\t".join(word.split("|")) + "\n")
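# Illustration of the conversion above (hypothetical input): the raw WikiNER
# file packs one sentence per line as pipe-separated token|pos|tag triples,
# which are rewritten one token per line, tab-separated:
#
#     "Paris|NNP|I-LOC is|VBZ|O"  ->  "Paris\tNNP\tI-LOC" and "is\tVBZ\tO"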
2998class NER_SWEDISH(ColumnCorpus):
2999 def __init__(
3000 self,
3001 base_path: Union[str, Path] = None,
3002 tag_to_bioes: str = "ner",
3003 in_memory: bool = True,
3004 **corpusargs,
3005 ):
3006 """
3007 Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically
3008 download the dataset.
3009 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3010 to point to a different folder but typically this should not be necessary.
3011 :param tag_to_bioes: NER by default, should not be changed.
3012 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3013 """
3015 if type(base_path) == str:
3016 base_path: Path = Path(base_path)
3018 # column format
3019 columns = {0: "text", 1: "ner"}
3021 # this dataset name
3022 dataset_name = self.__class__.__name__.lower()
3024 # default dataset folder is the cache root
3025 if not base_path:
3026 base_path = flair.cache_root / "datasets"
3027 data_folder = base_path / dataset_name
3029 # download data if necessary
3030 ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/"
3031 cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name)
3032 cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name)
3034 # data is not in IOB2 format. Thus we transform it to IOB2
3035 self._add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt"))
3036 self._add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt"))
3038 super(NER_SWEDISH, self).__init__(
3039 data_folder,
3040 columns,
3041 tag_to_bioes=tag_to_bioes,
3042 in_memory=in_memory,
3043 **corpusargs,
3044 )
3046 def _add_IOB2_tags(self, data_file: Union[str, Path], encoding: str = "utf8"):
3047 """
3048 Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead
3049 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
3050 the letter 'O'. Additionally it removes lines with no tags in the data file and can also
3051 be used if the data is only partially IOB tagged.
3052 Parameters
3053 ----------
3054 data_file : Union[str, Path]
3055 Path to the data file.
3056 encoding : str, optional
3057 Encoding used in open function. The default is "utf8".
3059 """
3060 with open(file=data_file, mode='r', encoding=encoding) as f:
3061 lines = f.readlines()
3062 with open(file=data_file, mode='w', encoding=encoding) as f:
3063 pred = 'O' # remembers the tag of the preceding line
3064 for line in lines:
3065 line_list = line.split()
3066 if len(line_list) == 2: # word with tag
3067 word = line_list[0]
3068 tag = line_list[1]
3069 if tag in ['0', 'O']: # no chunk
3070 f.write(word + ' O\n')
3071 pred = 'O'
3072 elif '-' not in tag: # no IOB tags
3073 if pred == 'O': # found a new chunk
3074 f.write(word + ' B-' + tag + '\n')
3075 pred = tag
3076 else: # found further part of chunk or new chunk directly after old chunk
3077 if pred == tag:
3078 f.write(word + ' I-' + tag + '\n')
3079 else:
3080 f.write(word + ' B-' + tag + '\n')
3081 pred = tag
3082 else: # line already has IOB tag (tag contains '-')
3083 f.write(line)
3084 pred = tag.split('-')[1]
3085 elif len(line_list) == 0: # empty line
3086 f.write('\n')
3087 pred = 'O'
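# Illustration of _add_IOB2_tags (hypothetical input lines): unlike the
# IOB1-style helper used for the Europarl corpus above, this one emits B- for
# the first token of a chunk and I- for continuations of the same tag:
#
#     "Sven PER" followed by "Svensson PER"  ->  "Sven B-PER" / "Svensson I-PER"
#     "Malmö LOC"                            ->  "Malmö B-LOC"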
3090class NER_TURKU(ColumnCorpus):
3091 def __init__(
3092 self,
3093 base_path: Union[str, Path] = None,
3094 tag_to_bioes: str = "ner",
3095 in_memory: bool = True,
3096 **corpusargs,
3097 ):
3098 """
3099 Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically
3100 download the dataset.
3101 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3102 to point to a different folder but typically this should not be necessary.
3103 :param tag_to_bioes: NER by default, should not need to be changed (this corpus has no POS column)
3105 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3107 """
3108 if isinstance(base_path, str):
3109 base_path: Path = Path(base_path)
3111 # column format
3112 columns = {0: "text", 1: "ner"}
3114 # this dataset name
3115 dataset_name = self.__class__.__name__.lower()
3117 # default dataset folder is the cache root
3118 if not base_path:
3119 base_path = flair.cache_root / "datasets"
3120 data_folder = base_path / dataset_name
3122 # download data if necessary
3123 conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll"
3124 dev_file = "dev.tsv"
3125 test_file = "test.tsv"
3126 train_file = "train.tsv"
3127 cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name)
3128 cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name)
3129 cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name)
3131 super(NER_TURKU, self).__init__(
3132 data_folder,
3133 columns,
3134 dev_file=dev_file,
3135 test_file=test_file,
3136 train_file=train_file,
3137 column_delimiter="\t",
3138 tag_to_bioes=tag_to_bioes,
3139 encoding="utf-8",  # the TurkuNLP conll files are UTF-8; latin-1 would garble ä/ö
3140 in_memory=in_memory,
3141 document_separator_token="-DOCSTART-",
3142 **corpusargs,
3143 )
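# A minimal usage sketch (hypothetical; the first call downloads the data, and
# make_tag_dictionary is the Corpus helper assumed available in this flair
# version):
#
#     from flair.datasets import NER_TURKU
#
#     corpus = NER_TURKU()
#     print(corpus)  # train/dev/test split sizes
#     tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")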
3146 class KEYPHRASE_SEMEVAL2017(ColumnCorpus):
3147 def __init__(
3148 self,
3149 base_path: Union[str, Path] = None,
3150 tag_to_bioes: str = "keyword",
3151 in_memory: bool = True,
3152 **corpusargs,
3153 ):
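"""
Initialize the SemEval-2017 Task 10 corpus for keyphrase extraction from scientific documents. The first
time you call this constructor it will automatically download the dataset.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: "keyword" by default, should not need to be changed.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""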
3155 if isinstance(base_path, str):
3156 base_path: Path = Path(base_path)
3158 # column format
3159 columns = {0: "text", 1: "keyword"}
3161 # this dataset name
3162 dataset_name = self.__class__.__name__.lower()
3164 # default dataset folder is the cache root
3165 if not base_path:
3166 base_path = flair.cache_root / "datasets"
3167 data_folder = base_path / dataset_name
3169 semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017"
3170 cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name)
3171 cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name)
3172 cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name)
3174 super(KEYPHRASE_SEMEVAL2017, self).__init__(
3175 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3176 )
3179 class KEYPHRASE_INSPEC(ColumnCorpus):
3180 def __init__(
3181 self,
3182 base_path: Union[str, Path] = None,
3183 tag_to_bioes: str = "keyword",
3184 in_memory: bool = True,
3185 **corpusargs,
3186 ):
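"""
Initialize the Inspec corpus for keyphrase extraction (scientific abstracts from the Inspec database). The
first time you call this constructor it will automatically download the dataset.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: "keyword" by default, should not need to be changed.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""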
3188 if isinstance(base_path, str):
3189 base_path: Path = Path(base_path)
3191 # column format
3192 columns = {0: "text", 1: "keyword"}
3194 # this dataset name
3195 dataset_name = self.__class__.__name__.lower()
3197 # default dataset folder is the cache root
3198 if not base_path:
3199 base_path = flair.cache_root / "datasets"
3200 data_folder = base_path / dataset_name
3202 inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec"
3203 cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name)
3204 cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name)
3205 if not "dev.txt" in os.listdir(data_folder):
3206 cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name)
3207 # rename according to train - test - dev - convention
3208 os.rename(data_folder / "valid.txt", data_folder / "dev.txt")
3210 super(KEYPHRASE_INSPEC, self).__init__(
3211 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3212 )
3215 class KEYPHRASE_SEMEVAL2010(ColumnCorpus):
3216 def __init__(
3217 self,
3218 base_path: Union[str, Path] = None,
3219 tag_to_bioes: str = "keyword",
3220 in_memory: bool = True,
3221 **corpusargs,
3222 ):
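"""
Initialize the processed SemEval-2010 corpus for keyphrase extraction. Only train and test splits are
downloaded, so the dev split is sampled from train (see MultiFileColumnCorpus). The first time you call
this constructor it will automatically download the dataset.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: "keyword" by default, should not need to be changed.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""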
3224 if isinstance(base_path, str):
3225 base_path: Path = Path(base_path)
3227 # column format
3228 columns = {0: "text", 1: "keyword"}
3230 # this dataset name
3231 dataset_name = self.__class__.__name__.lower()
3233 # default dataset folder is the cache root
3234 if not base_path:
3235 base_path = flair.cache_root / "datasets"
3236 data_folder = base_path / dataset_name
3238 semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010"
3239 cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name)
3240 cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name)
3242 super(KEYPHRASE_SEMEVAL2010, self).__init__(
3243 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3244 )
3247 class UP_CHINESE(ColumnCorpus):
3248 def __init__(
3249 self,
3250 base_path: Union[str, Path] = None,
3251 in_memory: bool = True,
3252 document_as_sequence: bool = False,
3253 **corpusargs,
3254 ):
3255 """
3256 Initialize the Chinese dataset from the Universal Propositions Bank, coming from this webpage:
3257 https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese
3259 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3260 to point to a different folder but typically this should not be necessary.
3261 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3262 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3263 """
3264 if isinstance(base_path, str):
3265 base_path: Path = Path(base_path)
3267 # column format
3268 columns = {1: "text", 9: "frame"}
3270 # this dataset name
3271 dataset_name = self.__class__.__name__.lower()
3273 # default dataset folder is the cache root
3274 if not base_path:
3275 base_path = flair.cache_root / "datasets"
3276 data_folder = base_path / dataset_name
3278 # download data if necessary
3279 up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/"
3280 cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name)
3281 cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name)
3282 cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name)
3284 super(UP_CHINESE, self).__init__(
3285 data_folder,
3286 columns,
3287 encoding="utf-8",
3288 train_file="zh-up-train.conllu",
3289 test_file="zh-up-test.conllu",
3290 dev_file="zh-up-dev.conllu",
3291 in_memory=in_memory,
3292 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3293 comment_symbol="#",
3294 **corpusargs,
3295 )
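# How the column map above reads the CoNLL-U style files: fields are
# whitespace-separated, field 1 holds the token form and field 9 holds the
# frame annotation, so columns = {1: "text", 9: "frame"} turns a line such as
# the following (illustrative, not verbatim corpus data)
#
#     2  喜欢  喜欢  VERB  VV  _  0  root  _  like.01
#
# into a token with text "喜欢" and "frame" label "like.01". The setting
# comment_symbol="#" skips CoNLL-U metadata lines such as "# sent_id = 1".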
3298 class UP_ENGLISH(ColumnCorpus):
3299 def __init__(
3300 self,
3301 base_path: Union[str, Path] = None,
3302 in_memory: bool = True,
3303 document_as_sequence: bool = False,
3304 **corpusargs,
3305 ):
3306 """
3307 Initialize the English dataset from the Universal Propositions Bank, coming from this webpage:
3308 https://github.com/System-T/UniversalPropositions.
3310 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3311 to point to a different folder but typically this should not be necessary.
3312 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3313 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3314 """
3315 if isinstance(base_path, str):
3316 base_path: Path = Path(base_path)
3318 # column format
3319 columns = {1: "text", 10: "frame"}
3321 # this dataset name
3322 dataset_name = self.__class__.__name__.lower()
3324 # default dataset folder is the cache root
3325 if not base_path:
3326 base_path = flair.cache_root / "datasets"
3327 data_folder = base_path / dataset_name
3329 # download data if necessary
3330 up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/"
3331 cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name)
3332 cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name)
3333 cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name)
3335 super(UP_ENGLISH, self).__init__(
3336 data_folder,
3337 columns,
3338 encoding="utf-8",
3339 train_file="en_ewt-up-train.conllu",
3340 test_file="en_ewt-up-test.conllu",
3341 dev_file="en_ewt-up-dev.conllu",
3342 in_memory=in_memory,
3343 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3344 comment_symbol="#",
3345 label_name_map={"_": "O"},
3346 **corpusargs,
3347 )
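# Note on label_name_map above: tokens whose frame field holds the CoNLL-U
# placeholder "_" (i.e. no predicate frame) are relabeled as "O" rather than
# treated as a frame class named "_": {"_": "O"} maps frame value "_" to the
# no-label tag "O".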
3350 class UP_FRENCH(ColumnCorpus):
3351 def __init__(
3352 self,
3353 base_path: Union[str, Path] = None,
3354 in_memory: bool = True,
3355 document_as_sequence: bool = False,
3356 **corpusargs,
3357 ):
3358 """
3359 Initialize the French dataset from the Universal Propositions Bank, coming from this webpage:
3360 https://github.com/System-T/UniversalPropositions.
3362 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3363 to point to a different folder but typically this should not be necessary.
3364 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3365 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3366 """
3367 if isinstance(base_path, str):
3368 base_path: Path = Path(base_path)
3370 # column format
3371 columns = {1: "text", 9: "frame"}
3373 # this dataset name
3374 dataset_name = self.__class__.__name__.lower()
3376 # default dataset folder is the cache root
3377 if not base_path:
3378 base_path = flair.cache_root / "datasets"
3379 data_folder = base_path / dataset_name
3381 # download data if necessary
3382 up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/"
3383 cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name)
3384 cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name)
3385 cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name)
3387 super(UP_FRENCH, self).__init__(
3388 data_folder,
3389 columns,
3390 encoding="utf-8",
3391 train_file="fr-up-train.conllu",
3392 test_file="fr-up-test.conllu",
3393 dev_file="fr-up-dev.conllu",
3394 in_memory=in_memory,
3395 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3396 comment_symbol="#",
3397 **corpusargs,
3398 )
3401 class UP_FINNISH(ColumnCorpus):
3402 def __init__(
3403 self,
3404 base_path: Union[str, Path] = None,
3405 in_memory: bool = True,
3406 document_as_sequence: bool = False,
3407 **corpusargs,
3408 ):
3409 """
3410 Initialize the Finnish dataset from the Universal Propositions Bank, coming from this webpage:
3411 https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish
3413 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3414 to point to a different folder but typically this should not be necessary.
3415 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3416 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3417 """
3418 if isinstance(base_path, str):
3419 base_path: Path = Path(base_path)
3421 # column format
3422 columns = {1: "text", 9: "frame"}
3424 # this dataset name
3425 dataset_name = self.__class__.__name__.lower()
3427 # default dataset folder is the cache root
3428 if not base_path:
3429 base_path = flair.cache_root / "datasets"
3430 data_folder = base_path / dataset_name
3432 # download data if necessary
3433 up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/"
3434 cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name)
3435 cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name)
3436 cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name)
3438 super(UP_FINNISH, self).__init__(
3439 data_folder,
3440 columns,
3441 encoding="utf-8",
3442 train_file="fi-up-train.conllu",
3443 test_file="fi-up-test.conllu",
3444 dev_file="fi-up-dev.conllu",
3445 in_memory=in_memory,
3446 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3447 comment_symbol="#",
3448 **corpusargs,
3449 )
3452 class UP_GERMAN(ColumnCorpus):
3453 def __init__(
3454 self,
3455 base_path: Union[str, Path] = None,
3456 in_memory: bool = True,
3457 document_as_sequence: bool = False,
3458 **corpusargs,
3459 ):
3460 """
3461 Initialize the German dataset from the Universal Propositions Bank, coming from this webpage:
3462 https://github.com/System-T/UniversalPropositions.
3464 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3465 to point to a different folder but typically this should not be necessary.
3466 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3467 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3468 """
3469 if isinstance(base_path, str):
3470 base_path: Path = Path(base_path)
3472 # column format
3473 columns = {1: "text", 9: "frame"}
3475 # this dataset name
3476 dataset_name = self.__class__.__name__.lower()
3478 # default dataset folder is the cache root
3479 if not base_path:
3480 base_path = flair.cache_root / "datasets"
3481 data_folder = base_path / dataset_name
3483 # download data if necessary
3484 up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/"
3485 cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name)
3486 cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name)
3487 cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name)
3489 super(UP_GERMAN, self).__init__(
3490 data_folder,
3491 columns,
3492 encoding="utf-8",
3493 train_file="de-up-train.conllu",
3494 test_file="de-up-test.conllu",
3495 dev_file="de-up-dev.conllu",
3496 in_memory=in_memory,
3497 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3498 comment_symbol="#",
3499 **corpusargs,
3500 )
3503 class UP_ITALIAN(ColumnCorpus):
3504 def __init__(
3505 self,
3506 base_path: Union[str, Path] = None,
3507 in_memory: bool = True,
3508 document_as_sequence: bool = False,
3509 **corpusargs,
3510 ):
3511 """
3512 Initialize the Italian dataset from the Universal Propositions Bank, coming from this webpage:
3513 https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian
3515 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3516 to point to a different folder but typically this should not be necessary.
3517 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3518 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3519 """
3520 if isinstance(base_path, str):
3521 base_path: Path = Path(base_path)
3523 # column format
3524 columns = {1: "text", 9: "frame"}
3526 # this dataset name
3527 dataset_name = self.__class__.__name__.lower()
3529 # default dataset folder is the cache root
3530 if not base_path:
3531 base_path = flair.cache_root / "datasets"
3532 data_folder = base_path / dataset_name
3534 # download data if necessary
3535 up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/"
3536 cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name)
3537 cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name)
3538 cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name)
3540 super(UP_ITALIAN, self).__init__(
3541 data_folder,
3542 columns,
3543 encoding="utf-8",
3544 train_file="it-up-train.conllu",
3545 test_file="it-up-test.conllu",
3546 dev_file="it-up-dev.conllu",
3547 in_memory=in_memory,
3548 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3549 comment_symbol="#",
3550 **corpusargs,
3551 )
3554 class UP_SPANISH(ColumnCorpus):
3555 def __init__(
3556 self,
3557 base_path: Union[str, Path] = None,
3558 in_memory: bool = True,
3559 document_as_sequence: bool = False,
3560 **corpusargs,
3561 ):
3562 """
3563 Initialize the Spanish dataset from the Universal Propositions Bank, coming from this webpage:
3564 https://github.com/System-T/UniversalPropositions
3566 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3567 to point to a different folder but typically this should not be necessary.
3568 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3569 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3570 """
3571 if isinstance(base_path, str):
3572 base_path: Path = Path(base_path)
3574 # column format
3575 columns = {1: "text", 9: "frame"}
3577 # this dataset name
3578 dataset_name = self.__class__.__name__.lower()
3580 # default dataset folder is the cache root
3581 if not base_path:
3582 base_path = flair.cache_root / "datasets"
3583 data_folder = base_path / dataset_name
3585 # download data if necessary
3586 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/"
3587 cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name)
3588 cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name)
3589 cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name)
3591 super(UP_SPANISH, self).__init__(
3592 data_folder,
3593 columns,
3594 encoding="utf-8",
3595 train_file="es-up-train.conllu",
3596 test_file="es-up-test.conllu",
3597 dev_file="es-up-dev.conllu",
3598 in_memory=in_memory,
3599 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3600 comment_symbol="#",
3601 **corpusargs,
3602 )
3605 class UP_SPANISH_ANCORA(ColumnCorpus):
3606 def __init__(
3607 self,
3608 base_path: Union[str, Path] = None,
3609 in_memory: bool = True,
3610 document_as_sequence: bool = False,
3611 **corpusargs,
3612 ):
3613 """
3614 Initialize the Spanish AnCora dataset from the Universal Propositions Bank, coming from this webpage:
3615 https://github.com/System-T/UniversalPropositions
3617 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
3618 to point to a different folder but typically this should not be necessary.
3619 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3620 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
3621 """
3622 if isinstance(base_path, str):
3623 base_path: Path = Path(base_path)
3625 # column format
3626 columns = {1: "text", 9: "frame"}
3628 # this dataset name
3629 dataset_name = self.__class__.__name__.lower()
3631 # default dataset folder is the cache root
3632 if not base_path:
3633 base_path = flair.cache_root / "datasets"
3634 data_folder = base_path / dataset_name
3636 # download data if necessary
3637 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/"
3638 cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name)
3639 cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name)
3640 cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name)
3642 super(UP_SPANISH_ANCORA, self).__init__(
3643 data_folder,
3644 columns,
3645 encoding="utf-8",
3646 train_file="es_ancora-up-train.conllu",
3647 test_file="es_ancora-up-test.conllu",
3648 dev_file="es_ancora-up-dev.conllu",
3649 in_memory=in_memory,
3650 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3651 comment_symbol="#",
3652 **corpusargs,
3653 )
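# A minimal sketch for combining several Universal Propositions corpora into
# one training corpus (hypothetical usage; the first calls download the data):
#
#     from flair.data import MultiCorpus
#     from flair.datasets import UP_ENGLISH, UP_FRENCH, UP_GERMAN
#
#     multi_corpus = MultiCorpus([UP_ENGLISH(), UP_FRENCH(), UP_GERMAN()])
#     print(multi_corpus)  # aggregated train/dev/test sizes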