Coverage for flair/flair/datasets/sequence_labeling.py: 19%
import logging
import os
import re
import shutil
from pathlib import Path
from typing import Union, Dict, List, Optional

import flair
from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token
from flair.datasets.base import find_train_dev_test_files
from flair.file_utils import cached_path, unpack_file

log = logging.getLogger("flair")


class ColumnCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        encoding: str = "utf-8",
        document_separator_token: str = None,
        skip_first_line: bool = False,
        in_memory: bool = True,
        label_name_map: Dict[str, str] = None,
        banned_sentences: List[str] = None,
        autofind_splits: bool = True,
        name: Optional[str] = None,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; if None, dev data is sampled from train
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite this, for instance with "\t"
        to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param document_separator_token: If provided, sentences that function as document boundaries are marked as such
        :param skip_first_line: set to True if your dataset has a header line
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param label_name_map: Optionally map tag names to a different schema.
        :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is True
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits)

        # get train data
        train = ColumnDataset(
            train_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if train_file is not None else None

        # read in test file if it exists
        test = ColumnDataset(
            test_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if test_file is not None else None

        # read in dev file if it exists
        dev = ColumnDataset(
            dev_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            banned_sentences=banned_sentences,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if dev_file is not None else None

        corpus_name = str(data_folder) if not name else name
        super(ColumnCorpus, self).__init__(train, dev, test, name=corpus_name, **corpusargs)
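
# Example usage (editor's sketch, not part of the original module): loading a custom
# CoNLL-style NER dataset with ColumnCorpus. The folder and file names below are
# hypothetical; column 0 holds the token text, column 1 the NER tag.
#
#     from flair.datasets import ColumnCorpus
#     corpus = ColumnCorpus(
#         Path("resources/tasks/my_ner"),   # hypothetical data folder
#         {0: "text", 1: "ner"},            # column_format
#         train_file="train.txt",           # dev/test are auto-found if present
#         in_memory=False,                  # stream from disk for large corpora
#     )
#     print(len(corpus.train))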


class ColumnDataset(FlairDataset):
    # special key for space after
    SPACE_AFTER_KEY = "space-after"

    def __init__(
        self,
        path_to_column_file: Union[str, Path],
        column_name_map: Dict[int, str],
        tag_to_bioes: str = None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        banned_sentences: List[str] = None,
        in_memory: bool = True,
        document_separator_token: str = None,
        encoding: str = "utf-8",
        skip_first_line: bool = False,
        label_name_map: Dict[str, str] = None,
    ):
        """
        Instantiates a column dataset (typically used for sequence labeling or word-level prediction).
        :param path_to_column_file: path to the file with the column-formatted data
        :param column_name_map: a map specifying the column format
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite this, for instance with "\t"
        to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param document_separator_token: If provided, sentences that function as document boundaries are marked as such
        :param skip_first_line: set to True if your dataset has a header line
        :param label_name_map: Optionally map tag names to a different schema.
        :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is True
        :return: a dataset with annotated data
        """
        if type(path_to_column_file) is str:
            path_to_column_file = Path(path_to_column_file)
        assert path_to_column_file.exists()
        self.path_to_column_file = path_to_column_file
        self.tag_to_bioes = tag_to_bioes
        self.column_name_map = column_name_map
        self.column_delimiter = column_delimiter
        self.comment_symbol = comment_symbol
        self.document_separator_token = document_separator_token
        self.label_name_map = label_name_map
        self.banned_sentences = banned_sentences

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory

        self.total_sentence_count: int = 0

        # most data sets have the token text in the first column, if not, pass 'text' as column
        self.text_column: int = 0
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column

        # determine encoding of text file
        self.encoding = encoding

        with open(str(self.path_to_column_file), encoding=self.encoding) as file:

            # skip the first line if so selected
            if skip_first_line:
                file.readline()

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                sentence_started = False
                while line:
                    if sentence_started and self.__line_completes_sentence(line):
                        self.indices.append(position)
                        position = file.tell()
                        sentence_started = False

                    elif not line.isspace():
                        sentence_started = True
                    line = file.readline()

                if sentence_started:
                    self.indices.append(position)

                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            if self.in_memory:
                self.sentences: List[Sentence] = []

                # pointer to previous
                previous_sentence = None
                while True:
                    sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))
                    if not sentence: break
                    if self.banned_sentences is not None and any(
                            [d in sentence.to_plain_string() for d in self.banned_sentences]):
                        continue
                    sentence._previous_sentence = previous_sentence
                    sentence._next_sentence = None

                    if previous_sentence: previous_sentence._next_sentence = sentence

                    self.sentences.append(sentence)
                    previous_sentence = sentence

                self.total_sentence_count = len(self.sentences)
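
            # Editor's note: the _previous_sentence/_next_sentence pointers chain all
            # sentences into a doubly linked list, so downstream code can retrieve a
            # sentence's document context without re-reading the file.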

    def _read_next_sentence(self, file):
        lines = []
        line = file.readline()
        while line:
            if not line.isspace():
                lines.append(line)

            # if sentence ends, break
            if len(lines) > 0 and self.__line_completes_sentence(line):
                break

            line = file.readline()
        return lines

    def _convert_lines_to_sentence(self, lines):

        sentence: Sentence = Sentence()
        for line in lines:
            # skip comments
            if self.comment_symbol is not None and line.startswith(self.comment_symbol):
                continue

            # if sentence ends, convert and return
            if self.__line_completes_sentence(line):
                if len(sentence) > 0:
                    if self.tag_to_bioes is not None:
                        sentence.convert_tag_scheme(
                            tag_type=self.tag_to_bioes, target_scheme="iobes"
                        )
                    # check if this sentence is a document boundary
                    if sentence.to_original_text() == self.document_separator_token:
                        sentence.is_document_boundary = True
                    return sentence

            # otherwise, this line is a token. parse and add to sentence
            else:
                token = self._parse_token(line)
                sentence.add_token(token)

        # check if this sentence is a document boundary
        if sentence.to_original_text() == self.document_separator_token: sentence.is_document_boundary = True

        if self.tag_to_bioes is not None:
            sentence.convert_tag_scheme(
                tag_type=self.tag_to_bioes, target_scheme="iobes"
            )

        if len(sentence) > 0: return sentence

    def _parse_token(self, line: str) -> Token:
        fields: List[str] = re.split(self.column_delimiter, line.rstrip())
        token = Token(fields[self.text_column])
        for column in self.column_name_map:
            if len(fields) > column:
                if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY:
                    task = self.column_name_map[column]  # for example 'pos'
                    tag = fields[column]
                    if tag.count("-") >= 1:  # tag with prefix, for example tag='B-OBJ'
                        split_at_first_hyphen = tag.split("-", 1)
                        tagging_format_prefix = split_at_first_hyphen[0]
                        tag_without_tagging_format = split_at_first_hyphen[1]
                        if self.label_name_map and tag_without_tagging_format in self.label_name_map.keys():
                            tag = tagging_format_prefix + "-" + self.label_name_map[tag_without_tagging_format]
                            # for example, transforming 'B-OBJ' to 'B-part-of-speech-object'
                            if self.label_name_map[tag_without_tagging_format] == 'O': tag = 'O'
                    else:  # tag without prefix, for example tag='PPER'
                        if self.label_name_map and tag in self.label_name_map.keys():
                            tag = self.label_name_map[tag]  # for example, transforming 'PPER' to 'person'

                    token.add_label(task, tag)
                if self.column_name_map[column] == self.SPACE_AFTER_KEY and fields[column] == '-':
                    token.whitespace_after = False
        return token
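
    # Example (editor's sketch): with column_name_map = {0: "text", 1: "ner"} and
    # label_name_map = {"OBJ": "object"}, the line "Flair B-OBJ" parses into a Token
    # with text "Flair" and ner label "B-object": the "B-"/"I-" prefix is kept and
    # only the tag name after the first hyphen is remapped.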

    def __line_completes_sentence(self, line: str) -> bool:
        sentence_completed = line.isspace() or line == ''
        return sentence_completed

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else skip to position in file where sentence begins
        else:
            with open(str(self.path_to_column_file), encoding=self.encoding) as file:
                file.seek(self.indices[index])
                sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))

        # set sentence context using partials
        sentence._position_in_dataset = (self, index)

        return sentence
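
    # Editor's note: with in_memory=False, __getitem__ above seeks to the stored byte
    # offset and re-parses the sentence from disk on every access, trading speed for
    # a much smaller memory footprint on large corpora.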


class CONLL_03(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, .testb, .train
        files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the path to the
        parent directory where the conll_03 folder resides.
        If using entity linking, the CoNLL-03 dataset is reduced by about 20 documents that are not part of the YAGO dataset.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data is there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )
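
# Example (editor's sketch): expected layout for the manually downloaded data, with
# base_path pointing at the parent of the 'conll_03' folder (paths are hypothetical):
#
#     /data/conll_03/eng.train
#     /data/conll_03/eng.testa
#     /data/conll_03/eng.testb
#
#     corpus = CONLL_03(base_path="/data")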


class CONLL_03_GERMAN(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the respective files in a folder called
        'conll_03_german'. Then set the base_path parameter in the constructor to the path to the parent directory where
        the conll_03_german folder resides.
        :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03_german' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict
        word lemmas, POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data is there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03_GERMAN, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )


class CONLL_03_DUTCH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"

        # download files if not present locally
        cached_path(f"{conll_02_path}ned.testa", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.testb", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.train", data_folder / 'raw')

        # we need to slightly modify the original files by adding some new lines after document separators
        train_data_file = data_folder / 'train.txt'
        if not train_data_file.is_file():
            self.__offset_docstarts(data_folder / 'raw' / "ned.train", data_folder / 'train.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testa", data_folder / 'dev.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testb", data_folder / 'test.txt')

        super(CONLL_03_DUTCH, self).__init__(
            data_folder,
            columns,
            train_file='train.txt',
            dev_file='dev.txt',
            test_file='test.txt',
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )

    @staticmethod
    def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]):
        with open(file_in, 'r', encoding="latin-1") as f:
            lines = f.readlines()
        with open(file_out, 'w', encoding="latin-1") as f:
            for line in lines:
                f.write(line)
                if line.startswith('-DOCSTART-'):
                    f.write("\n")
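
    # Example (editor's sketch) of the rewrite above: an input line such as
    # "-DOCSTART- -DOCSTART- O" is copied through followed by an extra blank line,
    # so each document separator becomes its own one-token sentence that the corpus
    # reader can mark via document_separator_token="-DOCSTART-".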


class CONLL_03_SPANISH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, should not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
        cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)

        super(CONLL_03_SPANISH, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )


class CONLL_2000(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "np",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-2000 corpus for English chunking.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/"
        data_file = flair.cache_root / "datasets" / dataset_name / "train.txt"
        if not data_file.is_file():
            cached_path(
                f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name
            )
            cached_path(
                f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name
            )
            import gzip, shutil

            with gzip.open(
                flair.cache_root / "datasets" / dataset_name / "train.txt.gz",
                "rb",
            ) as f_in:
                with open(
                    flair.cache_root / "datasets" / dataset_name / "train.txt",
                    "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)
            with gzip.open(
                flair.cache_root / "datasets" / dataset_name / "test.txt.gz", "rb"
            ) as f_in:
                with open(
                    flair.cache_root / "datasets" / dataset_name / "test.txt",
                    "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)

        super(CONLL_2000, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )


class WNUT_17(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        wnut_path = "https://noisy-text.github.io/2017/files/"
        cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name)
        cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name)
        cached_path(
            f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name
        )

        super(WNUT_17, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )


class BIOSCOPE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "tag"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/"
        cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name)

        super(BIOSCOPE, self).__init__(
            data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs,
        )


class NER_ARABIC_ANER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available
        from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar.
        See also http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp.
        Note that the column order is swapped compared to the original dataset.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/"
        # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_ANER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_ARABIC_AQMAR(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed and modified version of the American and Qatari Modeling of Arabic (AQMAR) dataset, available
        from http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
        via http://www.cs.cmu.edu/~ark/AQMAR/

        - Modifications from the original dataset: the miscellaneous tags (MIS0, MIS1, MIS2, MIS3) are merged into a single tag "MISC", as these categories deviate across the original dataset
        - The 28 original Wikipedia articles are merged into a single file containing the articles in alphabetical order

        The first time you call this constructor it will automatically download the dataset.

        This dataset is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
        Please cite: "Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah A. Smith (2012),
        Recall-Oriented Learning of Named Entities in Arabic Wikipedia. Proceedings of EACL."

        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        aqmar_path = "https://megantosh.s3.eu-central-1.amazonaws.com/AQMAR/"
        # cached_path(f"{aqmar_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{aqmar_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_AQMAR, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_BASQUE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ner_basque_path = "http://ixa2.si.ehu.eus/eiec/"
        data_path = flair.cache_root / "datasets" / dataset_name
        data_file = data_path / "named_ent_eu.train"
        if not data_file.is_file():
            cached_path(
                f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name
            )
            import tarfile, shutil

            with tarfile.open(
                flair.cache_root / "datasets" / dataset_name / "eiec_v1.0.tgz",
                "r:gz",
            ) as f_in:
                corpus_files = (
                    "eiec_v1.0/named_ent_eu.train",
                    "eiec_v1.0/named_ent_eu.test",
                )
                for corpus_file in corpus_files:
                    f_in.extract(corpus_file, data_path)
                    shutil.move(f"{data_path}/{corpus_file}", data_path)

        super(NER_BASQUE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )


class NER_CHINESE_WEIBO(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WEIBO_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/"
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name)

        super(NER_CHINESE_WEIBO, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file="weiboNER_2nd_conll_format.train",
            test_file="weiboNER_2nd_conll_format.test",
            dev_file="weiboNER_2nd_conll_format.dev",
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_DANISH_DANE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {1: 'text', 3: 'pos', 9: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        data_path = flair.cache_root / "datasets" / dataset_name
        train_data_file = data_path / "ddt.train.conllu"
        if not train_data_file.is_file():
            temp_file = cached_path(
                'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip',
                Path("datasets") / dataset_name
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, 'r') as zip_file:
                zip_file.extractall(path=data_path)

            # Remove CoNLL-U meta information in the last column
            for part in ['train', 'dev', 'test']:
                lines = []
                data_file = "ddt.{}.conllu".format(part)
                with open(data_path / data_file, 'r') as file:
                    for line in file:
                        # keep comment and blank lines as-is, strip meta info from token lines
                        if line.startswith("#") or line == "\n":
                            lines.append(line)
                        else:
                            lines.append(line.replace("name=", "").replace("|SpaceAfter=No", ""))

                with open(data_path / data_file, 'w') as file:
                    file.writelines(lines)

                print(data_path / data_file)

        super(NER_DANISH_DANE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes,
            in_memory=in_memory, comment_symbol="#",
            **corpusargs,
        )


class NER_ENGLISH_MOVIE_SIMPLE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus)
        in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "engtrain.bio"
        test_file = "engtest.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_SIMPLE, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_MOVIE_COMPLEX(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus)
        in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "trivia10k13train.bio"
        test_file = "trivia10k13test.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_COMPLEX, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus):
    """
    Initialize corpus of SEC filings annotated with English NER tags. See paper "Domain Adaption of Named Entity
    Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/
    :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
    to point to a different folder but typically this should not be necessary.
    :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
    POS tags or chunks respectively
    :param in_memory: If True, keeps dataset in memory giving speedups in training.
    :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/"
        cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name)
        cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_SEC_FILLINGS, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file='FIN5.txt',
            test_file="FIN3.txt",
            skip_first_line=True,
            **corpusargs,
        )


class NER_ENGLISH_RESTAURANT(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/"
        cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_RESTAURANT, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_STACKOVERFLOW(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the STACKOVERFLOW_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        """
        The datasets are represented in the CoNLL format.
        In this format each line of the dataset is in the following format:
        <word>+"\t"+<NE>"\t"+<word>+"\t"<markdown>
        The end of sentence is marked with an empty line.
        In each line NE represents the human-annotated named entity
        and <markdown> represents the code tags provided by the users who wrote the posts.
        """
        # column format
        columns = {0: "word", 1: "ner", 3: "markdown"}

        # entity_mapping
        entity_mapping = {"Library_Function": "Function",
                          "Function_Name": "Function",
                          "Class_Name": "Class",
                          "Library_Class": "Class",
                          "Organization": "Website",
                          "Library_Variable": "Variable",
                          "Variable_Name": "Variable",
                          "Error_Name": "O",
                          "Keyboard_IP": "O",
                          "Value": "O",
                          "Output_Block": "O"
                          }
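
        # Editor's note: entity_mapping is passed as label_name_map below, so
        # fine-grained StackOverflow types collapse into coarser ones (for example
        # 'B-Library_Function' becomes 'B-Function') and types mapped to "O" are
        # treated as unannotated.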

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/"

        # data validation
        banned_sentences = ["code omitted for annotation",
                            "omitted for annotation",
                            "CODE_BLOCK :",
                            "OP_BLOCK :",
                            "Question_URL :",
                            "Question_ID :"
                            ]

        files = ["train", "test", "dev"]

        for file in files:
            questions = 0
            answers = 0

            cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name)
            for line in open(data_folder / (file + ".txt"), mode="r", encoding="utf-8"):
                if line.startswith("Question_ID"):
                    questions += 1

                if line.startswith("Answer_to_Question_ID"):
                    answers += 1
            log.info(f"File {file} has {questions} questions and {answers} answers.")

        super(NER_ENGLISH_STACKOVERFLOW, self).__init__(
            data_folder,
            columns,
            train_file="train.txt",
            test_file="test.txt",
            dev_file="dev.txt",
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            label_name_map=entity_mapping,
            **corpusargs
        )


class NER_ENGLISH_TWITTER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize a dataset called twitter_ner which can be found on the following page:
        https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt.

        The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/"
        cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_TWITTER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            train_file="ner.txt",
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_PERSON(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
    ):
        """
        Initialize the PERSON_NER corpus for person names. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_path = "https://raw.githubusercontent.com/das-sudeshna/genid/master/"

        # download files if not present locally
        cached_path(f"{conll_path}conll-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}ieer-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}textbook-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}wiki-g.conll", data_folder / 'raw')

        self.__concatAllFiles(data_folder)

        super(NER_ENGLISH_PERSON, self).__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            train_file='bigFile.conll'
        )

    @staticmethod
    def __concatAllFiles(data_folder):
        arr = os.listdir(data_folder / 'raw')

        with open(data_folder / 'bigFile.conll', 'w') as outfile:
            for fname in arr:
                with open(data_folder / 'raw' / fname) as infile:
                    outfile.write(infile.read())


class NER_ENGLISH_WEBPAGES(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the WEBPAGES_NER corpus introduced in the paper "Design Challenges and Misconceptions in Named Entity
        Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "ner", 5: "text"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name
        import tarfile
        if not os.path.isfile(data_folder / 'webpages_ner.txt'):
            # download the tgz archive
            tar_file = "https://cogcomp.seas.upenn.edu/Data/NERWebpagesColumns.tgz"
            webpages_ner_path = cached_path(tar_file, Path("datasets") / dataset_name)
            tf = tarfile.open(webpages_ner_path)
            tf.extractall(data_folder)
            tf.close()
            outputfile = os.path.abspath(data_folder)

            # merge the files into one, as the archive contains multiple files
            with open(outputfile / data_folder / "webpages_ner.txt", "w+") as outfile:
                for files in os.walk(outputfile):
                    f = files[1]
                    ff = os.listdir(outputfile / data_folder / f[-1])
                    for i, file in enumerate(ff):
                        if file.endswith('.gold'):
                            with open(outputfile / data_folder / f[-1] / file, 'r+', errors='replace') as infile:
                                content = infile.read()
                            outfile.write(content)
                    break

        super(NER_ENGLISH_WEBPAGES, self).__init__(
            data_folder,
            columns,
            train_file='webpages_ner.txt',
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_ENGLISH_WNUT_2020(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip"

        for sample in ["train", "test", "dev"]:

            sample_file = data_folder / (sample + ".txt")
            if not sample_file.is_file():

                zip_path = cached_path(
                    f"{github_url}", Path("datasets") / dataset_name
                )

                # unzip the downloaded repo and merge the train, dev and test datasets
                unpack_file(zip_path, data_folder, "zip", False)  # unzipped folder name: WNUT_2020_NER-master

                if sample == "test":
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/")
                else:
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/")
                filenames = os.listdir(file_path)
                with open(data_folder / (sample + '.txt'), 'w') as outfile:
                    for fname in filenames:
                        with open(file_path / fname) as infile:
                            lines = infile.read()
                            outfile.write(lines)

                shutil.rmtree(str(data_folder / "WNUT_2020_NER-master"))  # clean up when done

        super(NER_ENGLISH_WNUT_2020, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_ENGLISH_WIKIGOLD(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the wikigold corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, should not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/"
        cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_WIKIGOLD, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file='wikigold.conll.txt',
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )


class NER_FINNISH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday."
        cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name)
        cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name)
        cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name)

        self._remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv"))

        super(NER_FINNISH, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs,
        )

    def _remove_lines_without_annotations(self, data_file: Union[str, Path] = None):
        with open(data_file, 'r') as f:
            lines = f.readlines()
        with open(data_file, 'w') as f:
            for line in lines:
                if len(line.split()) != 1:
                    f.write(line)
1564class NER_GERMAN_BIOFID(ColumnCorpus):
1565 def __init__(
1566 self,
1567 base_path: Union[str, Path] = None,
1568 tag_to_bioes: str = "ner",
1569 in_memory: bool = True,
1570 **corpusargs,
1571 ):
1572 if type(base_path) == str:
1573 base_path: Path = Path(base_path)
1575 # column format
1576 columns = {0: "text", 1: "lemma", 2: "pos", 3: "ner"}
1578 # this dataset name
1579 dataset_name = self.__class__.__name__.lower()
1581 # default dataset folder is the cache root
1582 if not base_path:
1583 base_path = flair.cache_root / "datasets"
1584 data_folder = base_path / dataset_name
1586 # download data if necessary
1587 biofid_path = "https://raw.githubusercontent.com/texttechnologylab/BIOfid/master/BIOfid-Dataset-NER/"
1588 cached_path(f"{biofid_path}train.conll", Path("datasets") / dataset_name)
1589 cached_path(f"{biofid_path}dev.conll", Path("datasets") / dataset_name)
1590 cached_path(f"{biofid_path}test.conll", Path("datasets") / dataset_name)
1592 super(NER_GERMAN_BIOFID, self).__init__(
1593 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
1594 )
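# Illustrative sketch (hypothetical) of reading the extra annotation layers that
# the four-column format above exposes; the tag names match column_format.
#
#     corpus = NER_GERMAN_BIOFID()
#     sentence = corpus.train[0]
#     for token in sentence:
#         print(token.text, token.get_tag("lemma").value,
#               token.get_tag("pos").value, token.get_tag("ner").value)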
1597class NER_GERMAN_EUROPARL(ColumnCorpus):
1598 def __init__(
1599 self,
1600 base_path: Union[str, Path] = None,
1601 tag_to_bioes: str = "ner",
1602 in_memory: bool = True,
1603 **corpusargs,
1604 ):
1605 """
1606 Initialize the NER_GERMAN_EUROPARL corpus. The first time you call this constructor it will automatically
1607 download the dataset.
1608 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1609 to point to a different folder but typically this should not be necessary.
1610 :param tag_to_bioes: 'ner' by default, should not be changed.
1611 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
1613 """
1615 if type(base_path) == str:
1616 base_path: Path = Path(base_path)
1618 # column format
1619 columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}
1621 # this dataset name
1622 dataset_name = self.__class__.__name__.lower()
1624 # default dataset folder is the cache root
1625 if not base_path:
1626 base_path = flair.cache_root / "datasets"
1627 data_folder = base_path / dataset_name
1629 # download data if necessary
1630 europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/"
1631 cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name)
1632 cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name)
1634 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4)
1635 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4)
1637 super(NER_GERMAN_EUROPARL, self).__init__(
1638 data_folder,
1639 columns,
1640 tag_to_bioes=tag_to_bioes,
1641 encoding="latin-1",
1642 in_memory=in_memory,
1643 train_file='ep-96-04-16.conll',
1644 test_file='ep-96-04-15.conll',
1645 **corpusargs,
1646 )
1648 def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1):
1649 """
1650 Function that adds the IOB 'I-' prefix to bare chunk tags (e.g. words tagged PER become
1651 I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
1652 the letter 'O'. Additionally, it removes lines with no tags in the data file and can also
1653 be used if the data is only partially IOB tagged.
1654 Parameters
1655 ----------
1656 data_file : Union[str, Path]
1657 Path to the data file.
1658 encoding : str, optional
1659 Encoding used in open function. The default is "utf8".
1660 ner_column : int, optional
1661 Specifies the ner-tagged column. The default is 1 (the second column).
1663 """
1665 def add_I_prefix(current_line: List[str], ner: int, tag: str):
1666 for i in range(0, len(current_line)):
1667 if i == 0:
1668 f.write(current_line[i])  # use the passed-in line, not the outer variable
1669 elif i == ner:
1670 f.write(' I-' + tag)
1671 else:
1672 f.write(' ' + current_line[i])
1673 f.write('\n')
1675 with open(file=data_file, mode='r', encoding=encoding) as f:
1676 lines = f.readlines()
1677 with open(file=data_file, mode='w', encoding=encoding) as f:
1678 pred = 'O' # remembers the NER tag of the preceding line
1679 for line in lines:
1680 line_list = line.split()
1681 if len(line_list) > 2: # word with tags
1682 ner_tag = line_list[ner_column]
1683 if ner_tag in ['0', 'O']: # no chunk
1684 for i in range(0, len(line_list)):
1685 if i == 0:
1686 f.write(line_list[i])
1687 elif i == ner_column:
1688 f.write(' O')
1689 else:
1690 f.write(' ' + line_list[i])
1691 f.write('\n')
1692 pred = 'O'
1693 elif '-' not in ner_tag: # bare tag without IOB prefix
1694 # whether this token starts a new chunk or continues one, it receives an 'I-' prefix
1695 add_I_prefix(line_list, ner_column, ner_tag)
1696 pred = ner_tag
1700 else: # line already has IOB tag (tag contains '-')
1701 f.write(line)
1702 pred = ner_tag.split('-')[1]
1703 elif len(line_list) == 0: # empty line
1704 f.write('\n')
1705 pred = 'O'
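# Illustrative effect of _add_IOB_tags on five-column Europarl lines (made-up
# tokens; ner_column=4 as in the calls above). Bare chunk tags receive an 'I-'
# prefix, '0' becomes 'O', and lines whose tag already contains '-' pass through:
#
#     "Fischler Fischler NE I-NP PER"  ->  "Fischler Fischler NE I-NP I-PER"
#     "und und KON I-NP 0"             ->  "und und KON I-NP O"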
1708class NER_GERMAN_LEGAL(ColumnCorpus):
1709 def __init__(
1710 self,
1711 base_path: Union[str, Path] = None,
1712 tag_to_bioes: str = "ner",
1713 in_memory: bool = True,
1714 **corpusargs,
1715 ):
1716 """
1717 Initialize the NER_GERMAN_LEGAL (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically
1718 download the dataset.
1719 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1720 to point to a different folder but typically this should not be necessary.
1721 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
1723 """
1725 if type(base_path) == str:
1726 base_path: Path = Path(base_path)
1728 # column format
1729 columns = {0: "text", 1: "ner"}
1731 # this dataset name
1732 dataset_name = self.__class__.__name__.lower()
1734 # default dataset folder is the cache root
1735 if not base_path:
1736 base_path = flair.cache_root / "datasets"
1737 data_folder = base_path / dataset_name
1739 # download data if necessary
1740 ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
1741 cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)
1743 super(NER_GERMAN_LEGAL, self).__init__(
1744 data_folder,
1745 columns,
1746 tag_to_bioes=tag_to_bioes,
1747 in_memory=in_memory,
1748 train_file='ler.conll',
1749 **corpusargs,
1750 )
1753class NER_GERMAN_GERMEVAL(ColumnCorpus):
1754 def __init__(
1755 self,
1756 base_path: Union[str, Path] = None,
1757 tag_to_bioes: str = "ner",
1758 in_memory: bool = True,
1759 **corpusargs,
1760 ):
1761 """
1762 Initialize the GermEval NER corpus for German. If the corpus is not yet present in the base_path folder, this
1763 constructor will attempt to download it automatically. Alternatively, obtain the corpus manually from
1764 https://sites.google.com/site/germeval2014ner/data, put it into some folder and point the base_path parameter to it.
1765 :param base_path: Path to the GermEval corpus on your machine
1766 :param tag_to_bioes: 'ner' by default, should not be changed.
1767 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1768 """
1769 if type(base_path) == str:
1770 base_path: Path = Path(base_path)
1772 # column format
1773 columns = {1: "text", 2: "ner"}
1775 # this dataset name
1776 dataset_name = self.__class__.__name__.lower()
1778 # default dataset folder is the cache root
1779 if not base_path:
1780 base_path = flair.cache_root / "datasets"
1781 data_folder = base_path / dataset_name
1783 # check if data there
1784 if not data_folder.exists():
1785 # create folder
1786 os.makedirs(data_folder)
1788 # download dataset
1789 import gdown
1790 gdown.download(url="https://drive.google.com/uc?id={}".format("1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P"),
1791 output=str(data_folder / 'train.tsv'))
1792 gdown.download(url="https://drive.google.com/uc?id={}".format("1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH"),
1793 output=str(data_folder / 'test.tsv'))
1794 gdown.download(url="https://drive.google.com/uc?id={}".format("1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm"),
1795 output=str(data_folder / 'dev.tsv'))
1797 super(NER_GERMAN_GERMEVAL, self).__init__(
1798 data_folder,
1799 columns,
1800 tag_to_bioes=tag_to_bioes,
1801 comment_symbol="#",
1802 in_memory=in_memory,
1803 **corpusargs,
1804 )
1807class NER_GERMAN_POLITICS(ColumnCorpus):
1808 def __init__(
1809 self,
1810 base_path: Union[str, Path] = None,
1811 tag_to_bioes: str = "ner",
1812 column_delimiter: str = r"\s+",
1813 in_memory: bool = True,
1814 **corpusargs,
1815 ):
1816 """
1817 Initialize corpus with Named Entity Model for German, Politics (NEMGP) data from
1818 https://www.thomas-zastrow.de/nlp/. The first time you call this constructor it will automatically download the
1819 dataset.
1820 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1821 to point to a different folder but typically this should not be necessary.
1822 :param tag_to_bioes: NER by default, should not be changed since this corpus only carries NER annotations
1824 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1826 """
1827 if type(base_path) == str:
1828 base_path: Path = Path(base_path)
1830 # column format
1831 columns = {0: "text", 1: "ner"}
1833 # this dataset name
1834 dataset_name = self.__class__.__name__.lower()
1836 # default dataset folder is the cache root
1837 if not base_path:
1838 base_path = flair.cache_root / "datasets"
1839 data_folder = base_path / dataset_name
1841 # download and parse data if necessary
1842 german_politics_path = "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip"
1843 corpus_file_name = "nemgp_trainingdata_01.txt"
1844 parsed_dataset = data_folder / "raw" / corpus_file_name
1846 if not parsed_dataset.exists():
1847 german_politics_zip = cached_path(f"{german_politics_path}", Path("datasets") / dataset_name / "raw")
1848 unpack_file(german_politics_zip, data_folder / "raw", "zip", False)
1849 self._convert_to_column_corpus(parsed_dataset)
1851 # create train test dev if not exist
1852 train_dataset = data_folder / "train.txt"
1853 if not train_dataset.exists():
1854 self._create_datasets(parsed_dataset, data_folder)
1856 super(NER_GERMAN_POLITICS, self).__init__(
1857 data_folder,
1858 columns,
1859 column_delimiter=column_delimiter,
1860 train_file='train.txt',
1861 dev_file='dev.txt',
1862 test_file='test.txt',
1863 tag_to_bioes=tag_to_bioes,
1864 encoding="utf-8",
1865 in_memory=in_memory,
1866 **corpusargs,
1867 )
1869 def _convert_to_column_corpus(self, data_file: Union[str, Path]):
1870 with open(data_file, 'r', encoding='utf-8') as f:
1871 lines = f.readlines()
1872 with open(data_file, 'w', encoding='utf-8') as f:
1873 tag_bool = False
1874 new_sentence = True
1875 for line in lines:
1876 line = re.sub(r'\s{2,}', ' ', line).strip().split(' ')
1877 for substr in line:
1878 if substr == '.':
1879 f.write("\n")
1880 new_sentence = True
1881 elif "<START:" in substr:
1882 tag_bool = True
1883 tag = substr[len('<START:'):].rstrip('>')  # e.g. '<START:person>' -> 'person'
1884 if 'loc' in tag:
1885 tag_IOB = '-LOC'
1886 elif 'per' in tag:
1887 tag_IOB = '-PER'
1888 elif 'org' in tag:
1889 tag_IOB = '-ORG'
1890 elif 'misc' in tag:
1891 tag_IOB = '-MISC'
1892 elif "<END>" in substr:
1893 tag_bool = False
1894 new_sentence = True
1895 else:
1896 if tag_bool:
1897 if new_sentence is True:
1898 start = 'B'
1899 new_sentence = False
1900 else:
1901 start = 'I'
1902 f.write(substr.strip(' ') + " " + start + tag_IOB + "\n")
1903 else:
1904 f.write(substr.strip(' ') + " " + 'O' + "\n")
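# Illustrative conversion performed by _convert_to_column_corpus on made-up
# NEMGP-style input (the raw format wraps entities in <START:tag> ... <END>):
#
#     raw:    Angela <START:person> Merkel <END> besucht Paris .
#     output: Angela O
#             Merkel B-PER
#             besucht O
#             Paris O
#             (a '.' token triggers an empty line, i.e. a sentence break)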
1906 def _create_datasets(self, data_file: Union[str, Path], data_folder: Union[str, Path]):
1907 with open(data_file, 'r') as file:
1908 num_lines = len(file.readlines())
1909 file.seek(0)
1911 # split into roughly 80% train, 10% test and 10% dev
1912 train_len = round(num_lines * 0.8)
1913 test_len = round(num_lines * 0.1)
1915 with open(data_folder / "train.txt", "w") as train, \
1916 open(data_folder / "test.txt", "w") as test, \
1917 open(data_folder / "dev.txt", "w") as dev:
1920 for k, line in enumerate(file, start=1):
1922 if k <= train_len:
1923 train.write(line)
1924 elif k <= train_len + test_len:
1925 test.write(line)
1926 else:
1927 dev.write(line)
1930class NER_HUNGARIAN(ColumnCorpus):
1931 def __init__(
1932 self,
1933 base_path: Union[str, Path] = None,
1934 tag_to_bioes: str = "ner",
1935 in_memory: bool = True,
1936 document_as_sequence: bool = False,
1937 **corpusargs,
1938 ):
1939 """
1940 Initialize the NER Business corpus for Hungarian. The first time you call this constructor it will automatically
1941 download the dataset.
1942 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
1943 to point to a different folder but typically this should not be necessary.
1944 :param tag_to_bioes: NER by default, should not be changed since this corpus only carries NER annotations
1946 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1947 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
1948 """
1949 if type(base_path) == str:
1950 base_path: Path = Path(base_path)
1952 # column format
1953 columns = {0: "text", 1: "ner"}
1955 # this dataset name
1956 dataset_name = self.__class__.__name__.lower()
1958 # default dataset folder is the cache root
1959 if not base_path:
1960 base_path = flair.cache_root / "datasets"
1961 data_folder = base_path / dataset_name
1963 # If the extracted corpus file is not yet present in dir
1964 if not os.path.isfile(data_folder / 'hun_ner_corpus.txt'):
1965 # download zip if necessary
1966 hun_ner_path = "https://rgai.sed.hu/sites/rgai.sed.hu/files/business_NER.zip"
1967 path_to_zipped_corpus = cached_path(hun_ner_path, Path("datasets") / dataset_name)
1968 # extracted corpus is not present, so unpack it
1969 unpack_file(
1970 path_to_zipped_corpus,
1971 data_folder,
1972 mode="zip",
1973 keep=True
1974 )
1976 super(NER_HUNGARIAN, self).__init__(
1977 data_folder,
1978 columns,
1979 train_file='hun_ner_corpus.txt',
1980 column_delimiter='\t',
1981 tag_to_bioes=tag_to_bioes,
1982 encoding="latin-1",
1983 in_memory=in_memory,
1984 label_name_map={'0': 'O'},
1985 document_separator_token=None if not document_as_sequence else "-DOCSTART-",
1986 **corpusargs,
1987 )
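# Note on label_name_map above: the raw corpus marks untagged tokens with '0',
# which the mapping rewrites to the 'O' tag that ColumnCorpus expects. A
# hypothetical usage sketch:
#
#     corpus = NER_HUNGARIAN()
#     print(corpus.train[0].to_tagged_string("ner"))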
1990class NER_ICELANDIC(ColumnCorpus):
1991 def __init__(
1992 self,
1993 base_path: Union[str, Path] = None,
1994 tag_to_bioes: str = "ner",
1995 in_memory: bool = True,
1996 **corpusargs,
1997 ):
1998 """
1999 Initialize the ICELANDIC_NER corpus. The first time you call this constructor it will automatically
2000 download the dataset.
2001 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2002 to point to a different folder but typically this should not be necessary.
2003 :param tag_to_bioes: NER by default, should not be changed since this corpus only carries NER annotations
2005 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2007 """
2008 if type(base_path) == str:
2009 base_path: Path = Path(base_path)
2011 # column format
2012 columns = {0: "text", 1: "ner"}
2014 # this dataset name
2015 dataset_name = self.__class__.__name__.lower()
2017 # default dataset folder is the cache root
2018 if not base_path:
2019 base_path = flair.cache_root / "datasets"
2020 data_folder = base_path / dataset_name
2022 if not os.path.isfile(data_folder / 'icelandic_ner.txt'):
2023 # download zip
2024 icelandic_ner = "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip"
2025 icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name)
2027 # unpacking the zip
2028 unpack_file(
2029 icelandic_ner_path,
2030 data_folder,
2031 mode="zip",
2032 keep=True
2033 )
2036 # merge the extracted files into one training file, as the zip contains multiple files
2038 with open(data_folder / "icelandic_ner.txt", "wb") as outfile:
2039 for root, _, files in os.walk(data_folder):
2042 for file_name in files:
2043 # skip the output file itself so it is not concatenated into itself
2044 if file_name.endswith('.txt') and file_name != 'icelandic_ner.txt':
2045 with open(Path(root) / file_name, 'rb') as infile:
2046 outfile.write(infile.read())
2048 super(NER_ICELANDIC, self).__init__(
2049 data_folder,
2050 columns,
2051 train_file='icelandic_ner.txt',
2052 tag_to_bioes=tag_to_bioes,
2053 in_memory=in_memory,
2054 **corpusargs,
2055 )
2058class NER_JAPANESE(ColumnCorpus):
2059 def __init__(
2060 self,
2061 base_path: Union[str, Path] = None,
2062 tag_to_bioes: str = "ner",
2063 in_memory: bool = True,
2064 **corpusargs,
2065 ):
2066 """
2067 Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically
2068 download the dataset.
2069 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2070 to point to a different folder but typically this should not be necessary.
2071 :param tag_to_bioes: NER by default.
2072 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2073 """
2074 if type(base_path) == str:
2075 base_path: Path = Path(base_path)
2077 # column format
2078 columns = {0: 'text', 1: 'ner'}
2080 # this dataset name
2081 dataset_name = self.__class__.__name__.lower()
2083 # default dataset folder is the cache root
2084 if not base_path:
2085 base_path = flair.cache_root / "datasets"
2086 data_folder = base_path / dataset_name
2088 # download data from github if necessary (hironsan.txt, ja.wikipedia.conll)
2089 IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/"
2091 # download files if not present locally
2092 cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw')
2093 cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw')
2095 # we need to modify the original files by adding new lines after the end of each sentence
2096 train_data_file = data_folder / 'train.txt'
2097 if not train_data_file.is_file():
2098 self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt')
2099 self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt')
2101 super(NER_JAPANESE, self).__init__(
2102 data_folder,
2103 columns,
2104 train_file='train.txt',
2105 tag_to_bioes=tag_to_bioes,
2106 in_memory=in_memory,
2107 **corpusargs,
2108 )
2110 @staticmethod
2111 def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
2112 with open(file_in, 'r') as f:
2113 lines = f.readlines()
2114 with open(file_out, 'a') as f:
2115 for line in lines:
2116 if (line[0] == "。"):
2117 f.write(line)
2118 f.write("\n")
2119 elif (line[0] == "\n"):
2120 continue
2121 else:
2122 f.write(line)
2124 @staticmethod
2125 def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
2126 with open(file_in, 'r') as f:
2127 lines = f.readlines()
2128 with open(file_out, 'a') as f:
2129 for line in lines:
2130 sp_line = line.split("\t")
2131 if (sp_line[0] == "\n"):
2132 f.write("\n")
2133 else:
2134 f.write(sp_line[0] + "\t" + sp_line[len(sp_line) - 1])
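# Illustrative effect of the two helpers above (made-up line): the Wikipedia
# file gains an empty line after each sentence-final "。" token, and the
# WikiNews file keeps only the first (token) and last (NER tag) columns:
#
#     "皇居\tNNP\tB-LOC"  ->  "皇居\tB-LOC"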
2137class NER_MASAKHANE(MultiCorpus):
2138 def __init__(
2139 self,
2140 languages: Union[str, List[str]] = "luo",
2141 base_path: Union[str, Path] = None,
2142 tag_to_bioes: str = "ner",
2143 in_memory: bool = True,
2144 **corpusargs,
2145 ):
2146 """
2147 Initialize the Masakhane corpus available on https://github.com/masakhane-io/masakhane-ner/tree/main/data.
2148 It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus
2149 with the languages you require. If you pass "all", all languages will be initialized.
2150 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2151 to point to a different folder but typically this should not be necessary.
2152 :param tag_to_bioes: NER by default, should not be changed since this corpus only carries NER annotations
2154 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2155 """
2156 if type(base_path) == str:
2157 base_path: Path = Path(base_path)
2159 # if only one language is given
2160 if type(languages) == str:
2161 languages = [languages]
2163 # column format
2164 columns = {0: "text", 1: "ner"}
2166 # this dataset name
2167 dataset_name = self.__class__.__name__.lower()
2169 # default dataset folder is the cache root
2170 if not base_path:
2171 base_path = flair.cache_root / "datasets"
2172 data_folder = base_path / dataset_name
2174 language_to_code = {"amharic": "amh",
2175 "hausa": "hau",
2176 "igbo": "ibo",
2177 "kinyarwanda": "kin",
2178 "luganda": "lug",
2179 "luo": "luo",
2180 "naija": "pcm",
2181 "swahili": "swa",
2182 "yoruba": "yor",
2183 "wolof": "wol",
2184 }
2186 # use all languages if explicitly set to "all"
2187 if languages == ["all"]: languages = list(language_to_code.values())
2189 corpora = []
2190 for language in languages:
2192 if language in language_to_code.keys():
2193 language = language_to_code[language]
2195 if language not in language_to_code.values():
2196 log.error(f"Language '{language}' is not in list of supported languages!")
2197 log.error(f"Supported are '{language_to_code.values()}'!")
2198 log.error(f"Instantiate this Corpus for instance like so 'corpus = NER_MASAKHANE(languages='luo')'")
2199 raise Exception()
2201 language_folder = data_folder / language
2203 # download data if necessary
2204 data_path = f"https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/{language}/"
2205 cached_path(f"{data_path}dev.txt", language_folder)
2206 cached_path(f"{data_path}test.txt", language_folder)
2207 cached_path(f"{data_path}train.txt", language_folder)
2209 # initialize ColumnCorpus and add it to list
2210 log.info(f"Reading data for language {language}")
2211 corp = ColumnCorpus(data_folder=language_folder,
2212 column_format=columns,
2213 tag_to_bioes=tag_to_bioes,
2214 encoding="utf-8",
2215 in_memory=in_memory,
2216 name=language,
2217 **corpusargs,
2218 )
2219 corpora.append(corp)
2221 super(NER_MASAKHANE, self).__init__(
2222 corpora,
2223 name='masakhane-' + '-'.join(languages),
2224 )
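# Usage sketch (hypothetical; codes as in language_to_code above):
#
#     corpus = NER_MASAKHANE(languages=["luo", "yor"])  # two languages
#     corpus = NER_MASAKHANE(languages="all")           # all ten languages
#     print(corpus)  # a MultiCorpus combining one ColumnCorpus per language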
2227class NER_MULTI_WIKIANN(MultiCorpus):
2228 def __init__(
2229 self,
2230 languages: Union[str, List[str]] = "en",
2231 base_path: Union[str, Path] = None,
2232 tag_to_bioes: str = "ner",
2233 in_memory: bool = False,
2234 **corpusargs,
2235 ):
2236 """
2237 WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist
2238 in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their
2239 respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/)
2240 Parameters
2241 ----------
2242 languages : Union[str, List[str]]
2243 Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations.
2244 The datasets of all passed languages will be saved in one MultiCorpus.
2245 (Note that, even though listed on https://elisa-ie.github.io/wikiann/, some datasets are empty.
2246 This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".)
2247 base_path : Union[str, Path], optional
2248 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2249 to point to a different folder but typically this should not be necessary.
2250 tag_to_bioes : str, optional
2251 The data is in BIO format. It will by default (with the string "ner" as value) be transformed
2252 into the BIOES format. If you don't want that, set it to None.
2254 """
2255 if type(languages) == str:
2256 languages = [languages]
2258 if type(base_path) == str:
2259 base_path: Path = Path(base_path)
2261 # column format
2262 columns = {0: "text", 1: "ner"}
2264 # this dataset name
2265 dataset_name = self.__class__.__name__.lower()
2267 # default dataset folder is the cache root
2268 if not base_path:
2269 base_path = flair.cache_root / "datasets"
2270 data_folder = base_path / dataset_name
2272 # For each language, the corresponding file is downloaded if not yet present
2273 # Then a ColumnCorpus of that data is created and appended to a list
2274 # which is finally handed to the MultiCorpus
2276 # list that contains the column corpora
2277 corpora = []
2279 google_drive_path = 'https://drive.google.com/uc?id='
2280 # download data if necessary
2281 first = True
2282 for language in languages:
2284 language_folder = data_folder / language
2285 file_name = 'wikiann-' + language + '.bio'
2287 # if language not downloaded yet, download it
2288 if not language_folder.exists():
2289 if first:
2290 import gdown
2291 import tarfile
2292 first = False
2293 # create folder
2294 os.makedirs(language_folder)
2295 # get google drive id from list
2296 google_id = self._google_drive_id_from_language_name(language)
2297 url = google_drive_path + google_id
2299 # download from google drive
2300 gdown.download(url, str(language_folder / language) + '.tar.gz')
2302 # unzip
2303 log.info("Extracting data...")
2304 tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz")
2305 # tar.extractall(language_folder,members=[tar.getmember(file_name)])
2306 tar.extract(file_name, str(language_folder))
2307 tar.close()
2308 log.info('...done.')
2310 # transform data into required format
2311 # the processed dataset has the additional ending "_new"
2312 log.info("Processing dataset...")
2313 self._silver_standard_to_simple_ner_annotation(str(language_folder / file_name))
2314 # remove the unprocessed dataset
2315 os.remove(str(language_folder / file_name))
2316 log.info('...done.')
2318 # initialize ColumnCorpus and add it to list
2319 log.info(f"Reading data for language {language}")
2320 corp = ColumnCorpus(data_folder=language_folder,
2321 column_format=columns,
2322 train_file=file_name + '_new',
2323 tag_to_bioes=tag_to_bioes,
2324 in_memory=in_memory,
2325 **corpusargs,
2326 )
2327 corpora.append(corp)
2328 log.info("...done.")
2330 super(NER_MULTI_WIKIANN, self).__init__(
2331 corpora, name='wikiann',
2332 )
2334 def _silver_standard_to_simple_ner_annotation(self, data_file: Union[str, Path]):
2335 # keep only the token (first column) and the NER tag (last column)
2336 with open(data_file, 'r', encoding='utf-8') as f_read, \
2337 open(str(data_file) + '_new', 'w', encoding='utf-8') as f_write:
2338 for line in f_read:
2340 if line == '\n':
2341 f_write.write(line)
2342 else:
2343 fields = line.split()
2344 f_write.write(fields[0] + ' ' + fields[-1] + '\n')
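# Illustrative effect of the conversion above (made-up silver-standard line):
# only the token and the final NER column survive.
#
#     "Berlin  <intermediate columns>  B-LOC"  ->  "Berlin B-LOC"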
2350 def _google_drive_id_from_language_name(self, language):
2351 languages_ids = {
2352 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # empty
2353 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf',
2354 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG',
2355 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO',
2356 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0',
2357 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm',
2358 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-',
2359 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet',
2360 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2',
2361 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV',
2362 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS',
2363 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb',
2364 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan',
2365 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE',
2366 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II',
2367 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss',
2368 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf',
2369 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-',
2370 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT',
2371 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3',
2372 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci',
2373 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8',
2374 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794',
2375 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08',
2376 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH',
2377 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF',
2378 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh',
2379 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE',
2380 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ',
2381 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR',
2382 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm',
2383 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm',
2384 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2',
2385 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H',
2386 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk',
2387 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam',
2388 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum',
2389 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY',
2390 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI',
2391 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw',
2392 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu',
2393 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk',
2394 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_',
2395 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # empty
2396 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae',
2397 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB',
2398 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV',
2399 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn',
2400 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6',
2401 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE',
2402 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk',
2403 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS',
2404 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7',
2405 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3',
2406 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1',
2407 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8',
2408 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb',
2409 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm',
2410 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY',
2411 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi',
2412 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV',
2413 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz',
2414 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM',
2415 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd',
2416 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-',
2417 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK',
2418 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU',
2419 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy',
2420 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs',
2421 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN',
2422 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX',
2423 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry',
2424 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa',
2425 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR',
2426 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d',
2427 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J',
2428 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn',
2429 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ',
2430 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV',
2431 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY',
2432 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge',
2433 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB',
2434 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a',
2435 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr',
2436 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz',
2437 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE',
2438 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c',
2439 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ',
2440 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs',
2441 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj',
2442 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd',
2443 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm',
2444 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC',
2445 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z',
2446 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7',
2447 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX',
2448 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k',
2449 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV',
2450 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # empty
2451 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif',
2452 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd',
2453 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk',
2454 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J',
2455 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo',
2456 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # empty
2457 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM',
2458 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf',
2459 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-',
2460 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86',
2461 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # empty
2462 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny',
2463 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3',
2464 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp',
2465 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB',
2466 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_',
2467 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX',
2468 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r',
2469 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # empty
2470 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD',
2471 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE',
2472 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD',
2473 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY',
2474 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs',
2475 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7',
2476 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx',
2477 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8',
2478 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # empty
2479 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS',
2480 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf',
2481 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD',
2482 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ',
2483 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm',
2484 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj',
2485 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # empty
2486 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y',
2487 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X',
2488 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt',
2489 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye',
2490 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ',
2491 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW',
2492 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT',
2493 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw',
2494 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08',
2495 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_',
2496 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-',
2497 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR',
2498 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th',
2499 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj',
2500 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW',
2501 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O',
2502 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD',
2503 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3',
2504 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e',
2505 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY',
2506 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD',
2507 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_',
2508 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH',
2509 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V',
2510 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2',
2511 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL',
2512 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT',
2513 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS',
2514 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ',
2515 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_',
2516 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS',
2517 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y',
2518 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp',
2519 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U',
2520 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI',
2521 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN',
2522 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_',
2523 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0',
2524 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # empty
2525 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu',
2526 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM',
2527 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB',
2528 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ',
2529 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz',
2530 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H',
2531 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB',
2532 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye',
2533 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr',
2534 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS',
2535 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH',
2536 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R',
2537 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd',
2538 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv',
2539 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2',
2540 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd',
2541 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ',
2542 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK',
2543 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g',
2544 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG',
2545 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H',
2546 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # empty
2547 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX',
2548 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS',
2549 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij',
2550 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu',
2551 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF',
2552 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC',
2553 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ',
2554 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx',
2555 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw',
2556 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn',
2557 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu',
2558 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF',
2559 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9',
2560 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA',
2561 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb',
2562 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_',
2563 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9',
2564 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY',
2565 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE',
2566 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-',
2567 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF',
2568 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE',
2569 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu',
2570 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy',
2571 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3',
2572 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86',
2573 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK',
2574 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP',
2575 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik',
2576 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR',
2577 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA',
2578 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S',
2579 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR',
2580 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR',
2581 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP',
2582 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_',
2583 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS',
2584 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7',
2585 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq',
2586 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL',
2587 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO',
2588 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t',
2589 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve',
2590 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6',
2591 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX',
2592 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz',
2593 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M',
2594 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp',
2595 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO',
2596 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v',
2597 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO',
2598 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz',
2599 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs',
2600 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW',
2601 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH',
2602 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # empty
2603 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN',
2604 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt',
2605 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w',
2606 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW',
2607 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU',
2608 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD',
2609 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ',
2610 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL',
2611 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT',
2612 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu',
2613 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX',
2614 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl',
2615 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius',
2616 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus',
2617 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne',
2618 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h',
2619 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM',
2620 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb',
2621 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw',
2622 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh',
2623 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk',
2624 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l',
2625 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to',
2626 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0',
2627 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85',
2628 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f',
2629 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp',
2630 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN',
2631 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ',
2632 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd',
2633 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo',
2634 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6',
2635 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt',
2636 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3',
2637 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr',
2638 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu',
2639 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY',
2640 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB',
2641 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ',
2642 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl',
2643 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp',
2644 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE',
2645 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f',
2646 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W'
2647 }
2648 return languages_ids[language]
2651class NER_MULTI_XTREME(MultiCorpus):
2652 def __init__(
2653 self,
2654 languages: Union[str, List[str]] = "en",
2655 base_path: Union[str, Path] = None,
2656 tag_to_bioes: str = "ner",
2657 in_memory: bool = False,
2658 **corpusargs,
2659 ):
2660 """
2661 XTREME corpus for cross-lingual NER consisting of datasets in a total of 176 languages. The data comes from the Google
2662 research work XTREME https://github.com/google-research/xtreme. All datasets for NER and the respective language abbreviations (e.g.
2663 "en" for English) can be found here: https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1
2664 The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/)
2666 Parameters
2667 ----------
2668 languages : Union[str, List[str]], optional
2669 If set to None, the 40 languages that are used in XTREME are loaded. Otherwise one can hand over a string or a list of strings
2670 consisting of language abbreviations. All datasets will be loaded in a MultiCorpus object.
2671 base_path : Union[str, Path], optional
2672 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2673 to point to a different folder but typically this should not be necessary.
2674 tag_to_bioes : str, optional
2675 The data is in BIO format. It will by default (with the string "ner" as value) be transformed
2676 into the BIOES format. If you don't want that, set it to None.
2678 """
2679 # if no languages are given as argument, all languages used in XTREME will be loaded
2680 if not languages:
2681 languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu",
2682 "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta",
2683 "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"]
2685 # if only one language is given
2686 if type(languages) == str:
2687 languages = [languages]
2689 if type(base_path) == str:
2690 base_path: Path = Path(base_path)
2692 # column format
2693 columns = {0: "text", 1: "ner"}
2695 # this dataset name
2696 dataset_name = self.__class__.__name__.lower()
2698 # default dataset folder is the cache root
2699 if not base_path:
2700 base_path = flair.cache_root / "datasets"
2701 data_folder = base_path / dataset_name
2703 # For each language, the corresponding file is downloaded if not yet present
2704 # Then a ColumnCorpus of that data is created and appended to a list
2705 # This list is handed to the MultiCorpus
2707 # list that contains the column corpora
2708 corpora = []
2710 hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset"
2712 # download data if necessary
2713 for language in languages:
2715 language_folder = data_folder / language
2717 # if language not downloaded yet, download it
2718 if not language_folder.exists():
2720 file_name = language + '.tar.gz'
2721 # create folder
2722 os.makedirs(language_folder)
2724 # download from HU Server
2725 temp_file = cached_path(
2726 hu_path + "/" + file_name,
2727 Path("datasets") / dataset_name / language
2728 )
2730 # unzip
2731 log.info("Extracting data...")
2732 import tarfile
2733 tar = tarfile.open(str(temp_file), "r:gz")
2734 for part in ["train", "test", "dev"]:
2735 tar.extract(part, str(language_folder))
2736 tar.close()
2737 log.info('...done.')
2739 # transform data into required format
2740 log.info("Processing dataset...")
2741 for part in ["train", "test", "dev"]:
2742 self._xtreme_to_simple_ner_annotation(str(language_folder / part))
2743 log.info('...done.')
2745 # initialize ColumnCorpus and add it to list
2746 log.info(f"Reading data for language {language}")
2747 corp = ColumnCorpus(data_folder=language_folder,
2748 column_format=columns,
2749 tag_to_bioes=tag_to_bioes,
2750 in_memory=in_memory,
2751 **corpusargs,
2752 )
2753 corpora.append(corp)
2755 super(NER_MULTI_XTREME, self).__init__(
2756 corpora, name='xtreme',
2757 )
2759 def _xtreme_to_simple_ner_annotation(self, data_file: Union[str, Path]):
2760 with open(data_file, 'r', encoding='utf-8') as f:
2761 lines = f.readlines()
2762 with open(data_file, 'w', encoding='utf-8') as f:
2763 for line in lines:
2764 if line == '\n':
2765 f.write(line)
2766 else:
2767 fields = line.split()
2768 f.write(fields[0].split(':', 1)[1] + ' ' + fields[1] + '\n')  # strip the 'lang:' prefix from the token
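# Illustrative effect (made-up line): tokens in the raw XTREME/panx files
# carry a language-code prefix, which is stripped here:
#
#     "en:Berlin B-LOC"  ->  "Berlin B-LOC"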
2771class NER_MULTI_WIKINER(MultiCorpus):
2772 def __init__(
2773 self,
2774 languages: Union[str, List[str]] = "en",
2775 base_path: Union[str, Path] = None,
2776 tag_to_bioes: str = "ner",
2777 in_memory: bool = False,
2778 **corpusargs,
2779 ):
2780 if type(base_path) == str:
2781 base_path: Path = Path(base_path)
2783 # if only one language is given
2784 if type(languages) == str:
2785 languages = [languages]
2787 # column format
2788 columns = {0: "text", 1: "pos", 2: "ner"}
2790 # this dataset name
2791 dataset_name = self.__class__.__name__.lower()
2793 # default dataset folder is the cache root
2794 if not base_path:
2795 base_path = flair.cache_root / "datasets"
2796 data_folder = base_path / dataset_name
2798 corpora = []
2799 for language in languages:
2800 language_folder = data_folder / language
2802 # download data if necessary
2803 self._download_wikiner(language, language_folder)
2805 # initialize ColumnCorpus and add it to list
2806 log.info(f"Reading data for language {language}")
2807 corp = ColumnCorpus(data_folder=language_folder,
2808 column_format=columns,
2809 tag_to_bioes=tag_to_bioes,
2810 in_memory=in_memory,
2811 **corpusargs,
2812 )
2813 corpora.append(corp)
2815 super(NER_MULTI_WIKINER, self).__init__(
2816 corpora, name='wikiner',
2817 )
2819 def _download_wikiner(self, language_code: str, dataset_name: Union[str, Path]):
2820 # download data if necessary
2821 wikiner_path = (
2822 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/"
2823 )
2824 lc = language_code
2826 data_file = (
2827 flair.cache_root
2828 / "datasets"
2829 / dataset_name
2830 / f"aij-wikiner-{lc}-wp3.train"
2831 )
2832 if not data_file.is_file():
2834 cached_path(
2835 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name
2836 )
2837 import bz2  # shutil is already imported at module level
2839 # unpack and write out in CoNLL column-like format
2840 bz_file = bz2.BZ2File(
2841 flair.cache_root
2842 / "datasets"
2843 / dataset_name
2844 / f"aij-wikiner-{lc}-wp3.bz2",
2845 "rb",
2846 )
2847 with bz_file as f, open(
2848 flair.cache_root
2849 / "datasets"
2850 / dataset_name
2851 / f"aij-wikiner-{lc}-wp3.train",
2852 "w",
2853 encoding="utf-8"
2854 ) as out:
2855 for line in f:
2856 line = line.decode("utf-8")
2857 words = line.split(" ")
2858 for word in words:
2859 out.write("\t".join(word.split("|")) + "\n")
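# Illustrative WikiNER record (made-up): each space-separated token carries
# pipe-delimited word|pos|ner fields, which the loop above writes out as one
# tab-separated token per line:
#
#     "Albert|NNP|B-PER Einstein|NNP|I-PER"
#     ->  "Albert\tNNP\tB-PER" and "Einstein\tNNP\tI-PER" on separate lines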
2862class NER_SWEDISH(ColumnCorpus):
2863 def __init__(
2864 self,
2865 base_path: Union[str, Path] = None,
2866 tag_to_bioes: str = "ner",
2867 in_memory: bool = True,
2868 **corpusargs,
2869 ):
2870 """
2871 Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically
2872 download the dataset.
2873 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2874 to point to a different folder but typically this should not be necessary.
2875 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2877 """
2879 if type(base_path) == str:
2880 base_path: Path = Path(base_path)
2882 # column format
2883 columns = {0: "text", 1: "ner"}
2885 # this dataset name
2886 dataset_name = self.__class__.__name__.lower()
2888 # default dataset folder is the cache root
2889 if not base_path:
2890 base_path = flair.cache_root / "datasets"
2891 data_folder = base_path / dataset_name
2893 # download data if necessary
2894 ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/"
2895 cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name)
2896 cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name)
2898 # data is not in IOB2 format. Thus we transform it to IOB2
2899 self._add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt"))
2900 self._add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt"))
2902 super(NER_SWEDISH, self).__init__(
2903 data_folder,
2904 columns,
2905 tag_to_bioes=tag_to_bioes,
2906 in_memory=in_memory,
2907 **corpusargs,
2908 )
2910 def _add_IOB2_tags(self, data_file: Union[str, Path], encoding: str = "utf8"):
2911 """
2912 Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead
2913 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
2914 the letter 'O'. Additionally it removes lines with no tags in the data file and can also
2915 be used if the data is only partially IOB tagged.
2916 Parameters
2917 ----------
2918 data_file : Union[str, Path]
2919 Path to the data file.
2920 encoding : str, optional
2921 Encoding used in open function. The default is "utf8".
2923 """
2924 with open(file=data_file, mode='r', encoding=encoding) as f:
2925 lines = f.readlines()
2926 with open(file=data_file, mode='w', encoding=encoding) as f:
2927 pred = 'O' # remembers the tag of the preceding line
2928 for line in lines:
2929 line_list = line.split()
2930 if len(line_list) == 2: # word with tag
2931 word = line_list[0]
2932 tag = line_list[1]
2933 if tag in ['0', 'O']: # no chunk
2934 f.write(word + ' O\n')
2935 pred = 'O'
2936 elif '-' not in tag: # no IOB tags
2937 if pred == 'O': # found a new chunk
2938 f.write(word + ' B-' + tag + '\n')
2939 pred = tag
2940 else: # found further part of chunk or new chunk directly after old chunk
2941 if pred == tag:
2942 f.write(word + ' I-' + tag + '\n')
2943 else:
2944 f.write(word + ' B-' + tag + '\n')
2945 pred = tag
2946 else: # line already has IOB tag (tag contains '-')
2947 f.write(line)
2948 pred = tag.split('-')[1]
2949 elif len(line_list) == 0: # empty line
2950 f.write('\n')
2951 pred = 'O'
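# Illustrative effect of _add_IOB2_tags (made-up lines): the first tag after
# 'O' opens a chunk with 'B-', an identical following tag continues it with
# 'I-', and '0' is rewritten to 'O':
#
#     "Stockholm LOC"  ->  "Stockholm B-LOC"
#     "Sverige LOC"    ->  "Sverige I-LOC"
#     "och 0"          ->  "och O"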
2954class NER_TURKU(ColumnCorpus):
2955 def __init__(
2956 self,
2957 base_path: Union[str, Path] = None,
2958 tag_to_bioes: str = "ner",
2959 in_memory: bool = True,
2960 **corpusargs,
2961 ):
2962 """
2963 Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically
2964 download the dataset.
2965 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
2966 to point to a different folder but typically this should not be necessary.
2967 :param tag_to_bioes: NER by default, should not be changed since this corpus only carries NER annotations
2969 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2971 """
2972 if type(base_path) == str:
2973 base_path: Path = Path(base_path)
2975 # column format
2976 columns = {0: "text", 1: "ner"}
2978 # this dataset name
2979 dataset_name = self.__class__.__name__.lower()
2981 # default dataset folder is the cache root
2982 if not base_path:
2983 base_path = flair.cache_root / "datasets"
2984 data_folder = base_path / dataset_name
2986 # download data if necessary
2987 conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll"
2988 dev_file = "dev.tsv"
2989 test_file = "test.tsv"
2990 train_file = "train.tsv"
2991 cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name)
2992 cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name)
2993 cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name)
2995 super(NER_TURKU, self).__init__(
2996 data_folder,
2997 columns,
2998 dev_file=dev_file,
2999 test_file=test_file,
3000 train_file=train_file,
3001 column_delimiter="\t",
3002 tag_to_bioes=tag_to_bioes,
3003 encoding="latin-1",
3004 in_memory=in_memory,
3005 document_separator_token="-DOCSTART-",
3006 **corpusargs,
3007 )
3010class KEYPHRASE_SEMEVAL2017(ColumnCorpus):
3011 def __init__(
3012 self,
3013 base_path: Union[str, Path] = None,
3014 tag_to_bioes: str = "keyword",
3015 in_memory: bool = True,
3016 **corpusargs,
3017 ):
3019 if type(base_path) == str:
3020 base_path: Path = Path(base_path)
3022 # column format
3023 columns = {0: "text", 1: "keyword"}
3025 # this dataset name
3026 dataset_name = self.__class__.__name__.lower()
3028 # default dataset folder is the cache root
3029 if not base_path:
3030 base_path = flair.cache_root / "datasets"
3031 data_folder = base_path / dataset_name
3033 semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017"
3034 cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name)
3035 cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name)
3036 cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name)
3038 super(KEYPHRASE_SEMEVAL2017, self).__init__(
3039 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3040 )
3043class KEYPHRASE_INSPEC(ColumnCorpus):
3044 def __init__(
3045 self,
3046 base_path: Union[str, Path] = None,
3047 tag_to_bioes: str = "keyword",
3048 in_memory: bool = True,
3049 **corpusargs,
3050 ):
3052 if type(base_path) == str:
3053 base_path: Path = Path(base_path)
3055 # column format
3056 columns = {0: "text", 1: "keyword"}
3058 # this dataset name
3059 dataset_name = self.__class__.__name__.lower()
3061 # default dataset folder is the cache root
3062 if not base_path:
3063 base_path = flair.cache_root / "datasets"
3064 data_folder = base_path / dataset_name
3066 inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec"
3067 cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name)
3068 cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name)
3069 if not "dev.txt" in os.listdir(data_folder):
3070 cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name)
3071 # rename according to train - test - dev - convention
3072 os.rename(data_folder / "valid.txt", data_folder / "dev.txt")
3074 super(KEYPHRASE_INSPEC, self).__init__(
3075 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3076 )
3079class KEYPHRASE_SEMEVAL2010(ColumnCorpus):
3080 def __init__(
3081 self,
3082 base_path: Union[str, Path] = None,
3083 tag_to_bioes: str = "keyword",
3084 in_memory: bool = True,
3085 **corpusargs,
3086 ):
3088 if type(base_path) == str:
3089 base_path: Path = Path(base_path)
3091 # column format
3092 columns = {0: "text", 1: "keyword"}
3094 # this dataset name
3095 dataset_name = self.__class__.__name__.lower()
3097 # default dataset folder is the cache root
3098 if not base_path:
3099 base_path = flair.cache_root / "datasets"
3100 data_folder = base_path / dataset_name
3102 semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010"
3103 cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name)
3104 cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name)
3106 super(KEYPHRASE_SEMEVAL2010, self).__init__(
3107 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
3108 )
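# Usage sketch for the three keyphrase corpora above (hypothetical; note that
# the tag type is "keyword" rather than "ner"):
#
#     corpus = KEYPHRASE_SEMEVAL2017()
#     keyword_dictionary = corpus.make_tag_dictionary(tag_type="keyword")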
3111class UP_CHINESE(ColumnCorpus):
3112 def __init__(
3113 self,
3114 base_path: Union[str, Path] = None,
3115 in_memory: bool = True,
3116 document_as_sequence: bool = False,
3117 **corpusargs,
3118 ):
3119 """
3120 Initialize the Chinese dataset from the Universal Propositions Bank, available at:
3121 https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese
3123 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3124 this to point to a different folder, but typically this should not be necessary.
3125 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3126 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3127 """
3128 if isinstance(base_path, str):
3129 base_path: Path = Path(base_path)
3131 # column format
3132 columns = {1: "text", 9: "frame"}
3134 # this dataset name
3135 dataset_name = self.__class__.__name__.lower()
3137 # default dataset folder is the cache root
3138 if not base_path:
3139 base_path = flair.cache_root / "datasets"
3140 data_folder = base_path / dataset_name
3142 # download data if necessary
3143 up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/"
3144 cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name)
3145 cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name)
3146 cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name)
3148 super(UP_CHINESE, self).__init__(
3149 data_folder,
3150 columns,
3151 encoding="utf-8",
3152 train_file="zh-up-train.conllu",
3153 test_file="zh-up-test.conllu",
3154 dev_file="zh-up-dev.conllu",
3155 in_memory=in_memory,
3156 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3157 comment_symbol="#",
3158 **corpusargs,
3159 )
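# A hedged usage sketch for the UP corpora (the printed labels depend on the downloaded
# data; nothing below is part of this module):
#
#     corpus = UP_CHINESE()
#     print(corpus)
#     sentence = corpus.train[0]
#     print(sentence.to_tagged_string("frame"))  # tokens with their frame labels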
3162class UP_ENGLISH(ColumnCorpus):
3163 def __init__(
3164 self,
3165 base_path: Union[str, Path] = None,
3166 in_memory: bool = True,
3167 document_as_sequence: bool = False,
3168 **corpusargs,
3169 ):
3170 """
3171 Initialize the English dataset from the Universal Propositions Bank, available at:
3172 https://github.com/System-T/UniversalPropositions.
3174 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3175 this to point to a different folder, but typically this should not be necessary.
3176 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3177 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3178 """
3179 if isinstance(base_path, str):
3180 base_path: Path = Path(base_path)
3182 # column format
3183 columns = {1: "text", 10: "frame"}
3185 # this dataset name
3186 dataset_name = self.__class__.__name__.lower()
3188 # default dataset folder is the cache root
3189 if not base_path:
3190 base_path = flair.cache_root / "datasets"
3191 data_folder = base_path / dataset_name
3193 # download data if necessary
3194 up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/"
3195 cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name)
3196 cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name)
3197 cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name)
3199 super(UP_ENGLISH, self).__init__(
3200 data_folder,
3201 columns,
3202 encoding="utf-8",
3203 train_file="en_ewt-up-train.conllu",
3204 test_file="en_ewt-up-test.conllu",
3205 dev_file="en_ewt-up-dev.conllu",
3206 in_memory=in_memory,
3207 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3208 comment_symbol="#",
3209 label_name_map={"_": "O"},
3210 **corpusargs,
3211 )
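# Note on label_name_map above: tokens without a frame carry the CoNLL-U placeholder "_",
# and mapping it to "O" turns the column into a conventional outside tag. A sketch
# (assuming the first train token happens to be frame-less):
#
#     corpus = UP_ENGLISH()
#     token = corpus.train[0].tokens[0]
#     print(token.get_tag("frame").value)  # "O" rather than "_"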
3214class UP_FRENCH(ColumnCorpus):
3215 def __init__(
3216 self,
3217 base_path: Union[str, Path] = None,
3218 in_memory: bool = True,
3219 document_as_sequence: bool = False,
3220 **corpusargs,
3221 ):
3222 """
3223 Initialize the French dataset from the Universal Propositions Bank, available at:
3224 https://github.com/System-T/UniversalPropositions.
3226 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3227 this to point to a different folder, but typically this should not be necessary.
3228 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3229 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3230 """
3231 if isinstance(base_path, str):
3232 base_path: Path = Path(base_path)
3234 # column format
3235 columns = {1: "text", 9: "frame"}
3237 # this dataset name
3238 dataset_name = self.__class__.__name__.lower()
3240 # default dataset folder is the cache root
3241 if not base_path:
3242 base_path = flair.cache_root / "datasets"
3243 data_folder = base_path / dataset_name
3245 # download data if necessary
3246 up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/"
3247 cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name)
3248 cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name)
3249 cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name)
3251 super(UP_FRENCH, self).__init__(
3252 data_folder,
3253 columns,
3254 encoding="utf-8",
3255 train_file="fr-up-train.conllu",
3256 test_file="fr-up-test.conllu",
3257 dev_file="fr-up-dev.conllu",
3258 in_memory=in_memory,
3259 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3260 comment_symbol="#",
3261 **corpusargs,
3262 )
3265class UP_FINNISH(ColumnCorpus):
3266 def __init__(
3267 self,
3268 base_path: Union[str, Path] = None,
3269 in_memory: bool = True,
3270 document_as_sequence: bool = False,
3271 **corpusargs,
3272 ):
3273 """
3274 Initialize the Finnish dataset from the Universal Propositions Bank, available at:
3275 https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish
3277 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3278 this to point to a different folder, but typically this should not be necessary.
3279 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3280 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3281 """
3282 if isinstance(base_path, str):
3283 base_path: Path = Path(base_path)
3285 # column format
3286 columns = {1: "text", 9: "frame"}
3288 # this dataset name
3289 dataset_name = self.__class__.__name__.lower()
3291 # default dataset folder is the cache root
3292 if not base_path:
3293 base_path = flair.cache_root / "datasets"
3294 data_folder = base_path / dataset_name
3296 # download data if necessary
3297 up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/"
3298 cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name)
3299 cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name)
3300 cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name)
3302 super(UP_FINNISH, self).__init__(
3303 data_folder,
3304 columns,
3305 encoding="utf-8",
3306 train_file="fi-up-train.conllu",
3307 test_file="fi-up-test.conllu",
3308 dev_file="fi-up-dev.conllu",
3309 in_memory=in_memory,
3310 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3311 comment_symbol="#",
3312 **corpusargs,
3313 )
3316class UP_GERMAN(ColumnCorpus):
3317 def __init__(
3318 self,
3319 base_path: Union[str, Path] = None,
3320 in_memory: bool = True,
3321 document_as_sequence: bool = False,
3322 **corpusargs,
3323 ):
3324 """
3325 Initialize the German dataset from the Universal Propositions Bank, available at:
3326 https://github.com/System-T/UniversalPropositions.
3328 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3329 this to point to a different folder, but typically this should not be necessary.
3330 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3331 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3332 """
3333 if isinstance(base_path, str):
3334 base_path: Path = Path(base_path)
3336 # column format
3337 columns = {1: "text", 9: "frame"}
3339 # this dataset name
3340 dataset_name = self.__class__.__name__.lower()
3342 # default dataset folder is the cache root
3343 if not base_path:
3344 base_path = flair.cache_root / "datasets"
3345 data_folder = base_path / dataset_name
3347 # download data if necessary
3348 up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/"
3349 cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name)
3350 cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name)
3351 cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name)
3353 super(UP_GERMAN, self).__init__(
3354 data_folder,
3355 columns,
3356 encoding="utf-8",
3357 train_file="de-up-train.conllu",
3358 test_file="de-up-test.conllu",
3359 dev_file="de-up-dev.conllu",
3360 in_memory=in_memory,
3361 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3362 comment_symbol="#",
3363 **corpusargs,
3364 )
3367class UP_ITALIAN(ColumnCorpus):
3368 def __init__(
3369 self,
3370 base_path: Union[str, Path] = None,
3371 in_memory: bool = True,
3372 document_as_sequence: bool = False,
3373 **corpusargs,
3374 ):
3375 """
3376 Initialize the Italian dataset from the Universal Propositions Bank, available at:
3377 https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian
3379 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3380 this to point to a different folder, but typically this should not be necessary.
3381 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3382 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3383 """
3384 if isinstance(base_path, str):
3385 base_path: Path = Path(base_path)
3387 # column format
3388 columns = {1: "text", 9: "frame"}
3390 # this dataset name
3391 dataset_name = self.__class__.__name__.lower()
3393 # default dataset folder is the cache root
3394 if not base_path:
3395 base_path = flair.cache_root / "datasets"
3396 data_folder = base_path / dataset_name
3398 # download data if necessary
3399 up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/"
3400 cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name)
3401 cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name)
3402 cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name)
3404 super(UP_ITALIAN, self).__init__(
3405 data_folder,
3406 columns,
3407 encoding="utf-8",
3408 train_file="it-up-train.conllu",
3409 test_file="it-up-test.conllu",
3410 dev_file="it-up-dev.conllu",
3411 in_memory=in_memory,
3412 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3413 comment_symbol="#",
3414 **corpusargs,
3415 )
3418class UP_SPANISH(ColumnCorpus):
3419 def __init__(
3420 self,
3421 base_path: Union[str, Path] = None,
3422 in_memory: bool = True,
3423 document_as_sequence: bool = False,
3424 **corpusargs,
3425 ):
3426 """
3427 Initialize the Spanish dataset from the Universal Propositions Bank, available at:
3428 https://github.com/System-T/UniversalPropositions
3430 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3431 this to point to a different folder, but typically this should not be necessary.
3432 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3433 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3434 """
3435 if isinstance(base_path, str):
3436 base_path: Path = Path(base_path)
3438 # column format
3439 columns = {1: "text", 9: "frame"}
3441 # this dataset name
3442 dataset_name = self.__class__.__name__.lower()
3444 # default dataset folder is the cache root
3445 if not base_path:
3446 base_path = flair.cache_root / "datasets"
3447 data_folder = base_path / dataset_name
3449 # download data if necessary
3450 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/"
3451 cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name)
3452 cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name)
3453 cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name)
3455 super(UP_SPANISH, self).__init__(
3456 data_folder,
3457 columns,
3458 encoding="utf-8",
3459 train_file="es-up-train.conllu",
3460 test_file="es-up-test.conllu",
3461 dev_file="es-up-dev.conllu",
3462 in_memory=in_memory,
3463 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3464 comment_symbol="#",
3465 **corpusargs,
3466 )
3469class UP_SPANISH_ANCORA(ColumnCorpus):
3470 def __init__(
3471 self,
3472 base_path: Union[str, Path] = None,
3473 in_memory: bool = True,
3474 document_as_sequence: bool = False,
3475 **corpusargs,
3476 ):
3477 """
3478 Initialize the Spanish AnCora dataset from the Universal Propositions Bank, available at:
3479 https://github.com/System-T/UniversalPropositions
3481 :param base_path: Default is None, meaning the corpus is automatically downloaded and loaded. You can override
3482 this to point to a different folder, but typically this should not be necessary.
3483 :param in_memory: If True, keeps the dataset in memory, which speeds up training.
3484 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object.
3485 """
3486 if isinstance(base_path, str):
3487 base_path: Path = Path(base_path)
3489 # column format
3490 columns = {1: "text", 9: "frame"}
3492 # this dataset name
3493 dataset_name = self.__class__.__name__.lower()
3495 # default dataset folder is the cache root
3496 if not base_path:
3497 base_path = flair.cache_root / "datasets"
3498 data_folder = base_path / dataset_name
3500 # download data if necessary
3501 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/"
3502 cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name)
3503 cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name)
3504 cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name)
3506 super(UP_SPANISH_ANCORA, self).__init__(
3507 data_folder,
3508 columns,
3509 encoding="utf-8",
3510 train_file="es_ancora-up-train.conllu",
3511 test_file="es_ancora-up-test.conllu",
3512 dev_file="es_ancora-up-dev.conllu",
3513 in_memory=in_memory,
3514 document_separator_token="-DOCSTART-" if document_as_sequence else None,
3515 comment_symbol="#",
3516 **corpusargs,
3517 )
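# The UP_* classes share one column layout, so they compose naturally. A hedged sketch
# using the MultiCorpus class imported at the top of this module (whether to combine
# these particular corpora is an assumption, not a recommendation from this file):
#
#     corpus = MultiCorpus([UP_SPANISH(), UP_SPANISH_ANCORA()])
#     print(corpus)  # combined train/dev/test counts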