Coverage for flair/flair/datasets/sequence_labeling.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1197 statements  

1import logging 

2import os 

3import re 

4import shutil 

5from pathlib import Path 

6from typing import Union, Dict, List, Optional 

7 

8from torch.utils.data import ConcatDataset 

9 

10import flair 

11from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token 

12from flair.datasets.base import find_train_dev_test_files 

13from flair.file_utils import cached_path, unpack_file 

14 

15log = logging.getLogger("flair") 

16 

17 

18class MultiFileColumnCorpus(Corpus): 

19 def __init__( 

20 self, 

21 column_format: Dict[int, str], 

22 train_files=None, 

23 test_files=None, 

24 dev_files=None, 

25 tag_to_bioes=None, 

26 column_delimiter: str = r"\s+", 

27 comment_symbol: str = None, 

28 encoding: str = "utf-8", 

29 document_separator_token: str = None, 

30 skip_first_line: bool = False, 

31 in_memory: bool = True, 

32 label_name_map: Dict[str, str] = None, 

33 banned_sentences: List[str] = None, 

34 **corpusargs, 

35 ): 

36 """ 

37 Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 

38 :param data_folder: base folder with the task data 

39 :param column_format: a map specifying the column format 

40 :param train_files: the name of the train files 

41 :param test_files: the name of the test files 

42 :param dev_files: the name of the dev files, if empty, dev data is sampled from train 

43 :param tag_to_bioes: whether to convert to BIOES tagging scheme 

44 :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" 

45 to split only on tabs 

46 :param comment_symbol: if set, lines that begin with this symbol are treated as comments 

47 :param document_separator_token: If provided, sentences that function as document boundaries are so marked 

48 :param skip_first_line: set to True if your dataset has a header line 

49 :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads 

50 :param label_name_map: Optionally map tag names to different schema. 

51 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true 

52 :return: a Corpus with annotated train, dev and test data 

53 """ 

54 # get train data 

55 train = ConcatDataset([ 

56 ColumnDataset( 

57 train_file, 

58 column_format, 

59 tag_to_bioes, 

60 encoding=encoding, 

61 comment_symbol=comment_symbol, 

62 column_delimiter=column_delimiter, 

63 banned_sentences=banned_sentences, 

64 in_memory=in_memory, 

65 document_separator_token=document_separator_token, 

66 skip_first_line=skip_first_line, 

67 label_name_map=label_name_map, 

68 ) for train_file in train_files 

69 ]) if train_files and train_files[0] else None 

70 

71 # read in test file if exists 

72 test = ConcatDataset([ 

73 ColumnDataset( 

74 test_file, 

75 column_format, 

76 tag_to_bioes, 

77 encoding=encoding, 

78 comment_symbol=comment_symbol, 

79 column_delimiter=column_delimiter, 

80 banned_sentences=banned_sentences, 

81 in_memory=in_memory, 

82 document_separator_token=document_separator_token, 

83 skip_first_line=skip_first_line, 

84 label_name_map=label_name_map, 

85 ) for test_file in test_files 

86 ]) if test_files and test_files[0] else None 

87 

88 # read in dev file if exists 

89 dev = ConcatDataset([ 

90 ColumnDataset( 

91 dev_file, 

92 column_format, 

93 tag_to_bioes, 

94 encoding=encoding, 

95 comment_symbol=comment_symbol, 

96 column_delimiter=column_delimiter, 

97 banned_sentences=banned_sentences, 

98 in_memory=in_memory, 

99 document_separator_token=document_separator_token, 

100 skip_first_line=skip_first_line, 

101 label_name_map=label_name_map, 

102 ) for dev_file in dev_files 

103 ]) if dev_files and dev_files[0] else None 

104 

105 super(MultiFileColumnCorpus, self).__init__(train, dev, test, **corpusargs) 

106 

107 

108class ColumnCorpus(MultiFileColumnCorpus): 

109 def __init__( 

110 self, 

111 data_folder: Union[str, Path], 

112 column_format: Dict[int, str], 

113 train_file=None, 

114 test_file=None, 

115 dev_file=None, 

116 autofind_splits: bool = True, 

117 name: Optional[str] = None, 

118 **corpusargs, 

119 ): 

120 """ 

121 Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 

122 :param data_folder: base folder with the task data 

123 :param column_format: a map specifying the column format 

124 :param train_file: the name of the train file 

125 :param test_file: the name of the test file 

126 :param dev_file: the name of the dev file, if None, dev data is sampled from train 

127 :param tag_to_bioes: whether to convert to BIOES tagging scheme 

128 :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" 

129 to split only on tabs 

130 :param comment_symbol: if set, lines that begin with this symbol are treated as comments 

131 :param document_separator_token: If provided, sentences that function as document boundaries are so marked 

132 :param skip_first_line: set to True if your dataset has a header line 

133 :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads 

134 :param label_name_map: Optionally map tag names to different schema. 

135 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true 

136 :return: a Corpus with annotated train, dev and test data 

137 """ 

138 

139 # find train, dev and test files if not specified 

140 dev_file, test_file, train_file = \ 

141 find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits) 

142 super(ColumnCorpus, self).__init__( 

143 column_format, 

144 dev_files=[dev_file] if dev_file else [], 

145 train_files=[train_file] if train_file else [], 

146 test_files=[test_file] if test_file else [], 

147 name=name if data_folder is None else str(data_folder), 

148 **corpusargs 

149 ) 

150 

151 

152class ColumnDataset(FlairDataset): 

153 # special key for space after 

154 SPACE_AFTER_KEY = "space-after" 

155 

156 def __init__( 

157 self, 

158 path_to_column_file: Union[str, Path], 

159 column_name_map: Dict[int, str], 

160 tag_to_bioes: str = None, 

161 column_delimiter: str = r"\s+", 

162 comment_symbol: str = None, 

163 banned_sentences: List[str] = None, 

164 in_memory: bool = True, 

165 document_separator_token: str = None, 

166 encoding: str = "utf-8", 

167 skip_first_line: bool = False, 

168 label_name_map: Dict[str, str] = None, 

169 ): 

170 """ 

171 Instantiates a column dataset (typically used for sequence labeling or word-level prediction). 

172 :param path_to_column_file: path to the file with the column-formatted data 

173 :param column_name_map: a map specifying the column format 

174 :param tag_to_bioes: whether to convert to BIOES tagging scheme 

175 :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" 

176 to split only on tabs 

177 :param comment_symbol: if set, lines that begin with this symbol are treated as comments 

178 :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads 

179 :param document_separator_token: If provided, sentences that function as document boundaries are so marked 

180 :param skip_first_line: set to True if your dataset has a header line 

181 :param label_name_map: Optionally map tag names to different schema. 

182 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true 

183 :return: a dataset with annotated data 

184 """ 

185 if type(path_to_column_file) is str: 

186 path_to_column_file = Path(path_to_column_file) 

187 assert path_to_column_file.exists() 

188 self.path_to_column_file = path_to_column_file 

189 self.tag_to_bioes = tag_to_bioes 

190 self.column_name_map = column_name_map 

191 self.column_delimiter = column_delimiter 

192 self.comment_symbol = comment_symbol 

193 self.document_separator_token = document_separator_token 

194 self.label_name_map = label_name_map 

195 self.banned_sentences = banned_sentences 

196 

197 # store either Sentence objects in memory, or only file offsets 

198 self.in_memory = in_memory 

199 

200 self.total_sentence_count: int = 0 

201 

202 # most data sets have the token text in the first column, if not, pass 'text' as column 

203 self.text_column: int = 0 

204 for column in self.column_name_map: 

205 if column_name_map[column] == "text": 

206 self.text_column = column 

207 

208 # determine encoding of text file 

209 self.encoding = encoding 

210 

211 with open(str(self.path_to_column_file), encoding=self.encoding) as file: 

212 

213 # skip first line if to selected 

214 if skip_first_line: 

215 file.readline() 

216 

217 # option 1: read only sentence boundaries as offset positions 

218 if not self.in_memory: 

219 self.indices: List[int] = [] 

220 

221 line = file.readline() 

222 position = 0 

223 sentence_started = False 

224 while line: 

225 if sentence_started and self.__line_completes_sentence(line): 

226 self.indices.append(position) 

227 position = file.tell() 

228 sentence_started = False 

229 

230 elif not line.isspace(): 

231 sentence_started = True 

232 line = file.readline() 

233 

234 if sentence_started: 

235 self.indices.append(position) 

236 

237 self.total_sentence_count = len(self.indices) 

238 

239 # option 2: keep everything in memory 

240 if self.in_memory: 

241 self.sentences: List[Sentence] = [] 

242 

243 # pointer to previous 

244 previous_sentence = None 

245 while True: 

246 sentence = self._convert_lines_to_sentence(self._read_next_sentence(file)) 

247 if not sentence: break 

248 if self.banned_sentences is not None and any( 

249 [d in sentence.to_plain_string() for d in self.banned_sentences]): 

250 continue 

251 sentence._previous_sentence = previous_sentence 

252 sentence._next_sentence = None 

253 

254 if previous_sentence: previous_sentence._next_sentence = sentence 

255 

256 self.sentences.append(sentence) 

257 previous_sentence = sentence 

258 

259 self.total_sentence_count = len(self.sentences) 

260 

261 def _read_next_sentence(self, file): 

262 lines = [] 

263 line = file.readline() 

264 while line: 

265 if not line.isspace(): 

266 lines.append(line) 

267 

268 # if sentence ends, break 

269 if len(lines) > 0 and self.__line_completes_sentence(line): 

270 break 

271 

272 line = file.readline() 

273 return lines 

274 

275 def _convert_lines_to_sentence(self, lines): 

276 

277 sentence: Sentence = Sentence() 

278 for line in lines: 

279 # skip comments 

280 if self.comment_symbol is not None and line.startswith(self.comment_symbol): 

281 continue 

282 

283 # if sentence ends, convert and return 

284 if self.__line_completes_sentence(line): 

285 if len(sentence) > 0: 

286 if self.tag_to_bioes is not None: 

287 sentence.convert_tag_scheme( 

288 tag_type=self.tag_to_bioes, target_scheme="iobes" 

289 ) 

290 # check if this sentence is a document boundary 

291 if sentence.to_original_text() == self.document_separator_token: 

292 sentence.is_document_boundary = True 

293 return sentence 

294 

295 # otherwise, this line is a token. parse and add to sentence 

296 else: 

297 token = self._parse_token(line) 

298 sentence.add_token(token) 

299 

300 # check if this sentence is a document boundary 

301 if sentence.to_original_text() == self.document_separator_token: sentence.is_document_boundary = True 

302 

303 if self.tag_to_bioes is not None: 

304 sentence.convert_tag_scheme( 

305 tag_type=self.tag_to_bioes, target_scheme="iobes" 

306 ) 

307 

308 if len(sentence) > 0: return sentence 

309 

310 def _parse_token(self, line: str) -> Token: 

311 fields: List[str] = re.split(self.column_delimiter, line.rstrip()) 

312 token = Token(fields[self.text_column]) 

313 for column in self.column_name_map: 

314 if len(fields) > column: 

315 if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY: 

316 task = self.column_name_map[column] # for example 'pos' 

317 tag = fields[column] 

318 if tag.count("-") >= 1: # tag with prefix, for example tag='B-OBJ' 

319 split_at_first_hyphen = tag.split("-", 1) 

320 tagging_format_prefix = split_at_first_hyphen[0] 

321 tag_without_tagging_format = split_at_first_hyphen[1] 

322 if self.label_name_map and tag_without_tagging_format in self.label_name_map.keys(): 

323 tag = tagging_format_prefix + "-" + self.label_name_map[tag_without_tagging_format] 

324 # for example, transforming 'B-OBJ' to 'B-part-of-speech-object' 

325 if self.label_name_map[tag_without_tagging_format] == 'O': tag = 'O' 

326 else: # tag without prefix, for example tag='PPER' 

327 if self.label_name_map and tag in self.label_name_map.keys(): 

328 tag = self.label_name_map[tag] # for example, transforming 'PPER' to 'person' 

329 

330 token.add_label(task, tag) 

331 if self.column_name_map[column] == self.SPACE_AFTER_KEY and fields[column] == '-': 

332 token.whitespace_after = False 

333 return token 

334 

335 def __line_completes_sentence(self, line: str) -> bool: 

336 sentence_completed = line.isspace() or line == '' 

337 return sentence_completed 

338 

339 def is_in_memory(self) -> bool: 

340 return self.in_memory 

341 

342 def __len__(self): 

343 return self.total_sentence_count 

344 

345 def __getitem__(self, index: int = 0) -> Sentence: 

346 

347 # if in memory, retrieve parsed sentence 

348 if self.in_memory: 

349 sentence = self.sentences[index] 

350 

351 # else skip to position in file where sentence begins 

352 else: 

353 with open(str(self.path_to_column_file), encoding=self.encoding) as file: 

354 file.seek(self.indices[index]) 

355 sentence = self._convert_lines_to_sentence(self._read_next_sentence(file)) 

356 

357 # set sentence context using partials 

358 sentence._position_in_dataset = (self, index) 

359 

360 return sentence 

361 

362 

363class MultiCoNer(MultiFileColumnCorpus): 

364 def __init__( 

365 self, 

366 task: str = "multi", 

367 base_path: Union[str, Path] = None, 

368 tag_to_bioes: str = "ner", 

369 in_memory: bool = True, 

370 use_dev_as_test: bool = True, 

371 **corpusargs, 

372 ): 

373 """ 

374 Initialize the MultiCoNer corpus. This is only possible if you've applied and downloaded it to your machine. 

375 Apply for the corpus from here https://multiconer.github.io/dataset and unpack the .zip file's content into 

376 a folder called 'multiconer'. Then set the base_path parameter in the constructor to the path to the 

377 parent directory where the multiconer folder resides. You can also create the multiconer in 

378 the {FLAIR_CACHE_ROOT}/datasets folder to leave the path empty. 

379 :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine 

380 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict 

381 POS tags or chunks respectively 

382 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

383 :param use_dev_as_test: If True, it uses the dev set as test set and samples random training data for a dev split. 

384 :param task: either 'multi', 'code-switch', or the language code for one of the mono tasks. 

385 """ 

386 if type(base_path) == str: 

387 base_path: Path = Path(base_path) 

388 

389 folders = { 

390 "bn": "BN-Bangla", 

391 "de": "DE-German", 

392 "en": "EN-English", 

393 "es": "ES-Espanish", 

394 "fa": "FA-Farsi", 

395 "hi": "HI-Hindi", 

396 "ko": "KO-Korean", 

397 "nl": "NL-Dutch", 

398 "ru": "RU-Russian", 

399 "tr": "TR-Turkish", 

400 "zh": "ZH-Chinese", 

401 } 

402 

403 possible_tasks = ["multi", "code-switch"] + list(folders.keys()) 

404 task = task.lower() 

405 

406 if task not in possible_tasks: 

407 raise ValueError(f"task has to be one of {possible_tasks}, but is '{task}'") 

408 

409 # column format 

410 columns = {0: "text", 3: "ner"} 

411 

412 # this dataset name 

413 dataset_name = self.__class__.__name__.lower() 

414 

415 # default dataset folder is the cache root 

416 if not base_path: 

417 base_path = flair.cache_root / "datasets" 

418 data_folder = base_path / dataset_name 

419 

420 # check if data there 

421 if not data_folder.exists(): 

422 log.warning("-" * 100) 

423 log.warning(f'WARNING: MultiCoNer dataset not found at "{data_folder}".') 

424 log.warning( 

425 'Instructions for obtaining the data can be found here: https://multiconer.github.io/dataset"' 

426 ) 

427 log.warning("-" * 100) 

428 

429 if task in ["multi", "code-switch"]: 

430 # code-switch uses the same training data than multi but provides a different test set. 

431 # as the test set is not published, those two tasks are the same. 

432 train_files = list(data_folder.rglob("*_train.conll")) 

433 dev_files = list(data_folder.rglob("*_dev.conll")) 

434 else: 

435 train_files = [data_folder / folders[task] / f"{task}_train.conll"] 

436 dev_files = [data_folder / folders[task] / f"{task}_dev.conll"] 

437 

438 if use_dev_as_test: 

439 test_files = dev_files 

440 dev_files = [] 

441 else: 

442 test_files = [] 

443 

444 super(MultiCoNer, self).__init__( 

445 train_files=train_files, 

446 dev_files=dev_files, 

447 test_files=test_files, 

448 column_format=columns, 

449 tag_to_bioes=tag_to_bioes, 

450 comment_symbol="# id ", 

451 in_memory=in_memory, 

452 **corpusargs, 

453 ) 

454 

455 

456class CONLL_03(ColumnCorpus): 

457 def __init__( 

458 self, 

459 base_path: Union[str, Path] = None, 

460 tag_to_bioes: str = "ner", 

461 in_memory: bool = True, 

462 **corpusargs, 

463 ): 

464 """ 

465 Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine. 

466 Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, .testb, .train 

467 files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the path to the 

468 parent directory where the conll_03 folder resides. 

469 If using entity linking, the conll03 dateset is reduced by about 20 Documents, which are not part of the yago dataset. 

470 :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine 

471 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict 

472 POS tags or chunks respectively 

473 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

474 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

475 """ 

476 if type(base_path) == str: 

477 base_path: Path = Path(base_path) 

478 

479 # column format 

480 columns = {0: "text", 1: "pos", 2: "np", 3: "ner"} 

481 

482 # this dataset name 

483 dataset_name = self.__class__.__name__.lower() 

484 

485 # default dataset folder is the cache root 

486 if not base_path: 

487 base_path = flair.cache_root / "datasets" 

488 data_folder = base_path / dataset_name 

489 

490 # check if data there 

491 if not data_folder.exists(): 

492 log.warning("-" * 100) 

493 log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".') 

494 log.warning( 

495 'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/"' 

496 ) 

497 log.warning("-" * 100) 

498 

499 super(CONLL_03, self).__init__( 

500 data_folder, 

501 columns, 

502 tag_to_bioes=tag_to_bioes, 

503 in_memory=in_memory, 

504 document_separator_token="-DOCSTART-", 

505 **corpusargs, 

506 ) 

507 

508 

509class CONLL_03_GERMAN(ColumnCorpus): 

510 def __init__( 

511 self, 

512 base_path: Union[str, Path] = None, 

513 tag_to_bioes: str = "ner", 

514 in_memory: bool = True, 

515 **corpusargs, 

516 ): 

517 """ 

518 Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine. 

519 Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the respective files in a folder called 

520 'conll_03_german'. Then set the base_path parameter in the constructor to the path to the parent directory where 

521 the conll_03_german folder resides. 

522 :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03_german' folder) on your machine 

523 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict 

524 word lemmas, POS tags or chunks respectively 

525 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

526 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

527 """ 

528 if type(base_path) == str: 

529 base_path: Path = Path(base_path) 

530 

531 # column format 

532 columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"} 

533 

534 # this dataset name 

535 dataset_name = self.__class__.__name__.lower() 

536 

537 # default dataset folder is the cache root 

538 if not base_path: 

539 base_path = flair.cache_root / "datasets" 

540 data_folder = base_path / dataset_name 

541 

542 # check if data there 

543 if not data_folder.exists(): 

544 log.warning("-" * 100) 

545 log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".') 

546 log.warning( 

547 'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/"' 

548 ) 

549 log.warning("-" * 100) 

550 

551 super(CONLL_03_GERMAN, self).__init__( 

552 data_folder, 

553 columns, 

554 tag_to_bioes=tag_to_bioes, 

555 in_memory=in_memory, 

556 document_separator_token="-DOCSTART-", 

557 **corpusargs, 

558 ) 

559 

560 

561class CONLL_03_DUTCH(ColumnCorpus): 

562 def __init__( 

563 self, 

564 base_path: Union[str, Path] = None, 

565 tag_to_bioes: str = "ner", 

566 in_memory: bool = True, 

567 **corpusargs, 

568 ): 

569 """ 

570 Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically 

571 download the dataset. 

572 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

573 to point to a different folder but typically this should not be necessary. 

574 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

575 POS tags instead 

576 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

577 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

578 """ 

579 if type(base_path) == str: 

580 base_path: Path = Path(base_path) 

581 

582 # column format 

583 columns = {0: "text", 1: "pos", 2: "ner"} 

584 

585 # this dataset name 

586 dataset_name = self.__class__.__name__.lower() 

587 

588 # default dataset folder is the cache root 

589 if not base_path: 

590 base_path = flair.cache_root / "datasets" 

591 data_folder = base_path / dataset_name 

592 

593 # download data if necessary 

594 conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" 

595 

596 # download files if not present locally 

597 cached_path(f"{conll_02_path}ned.testa", data_folder / 'raw') 

598 cached_path(f"{conll_02_path}ned.testb", data_folder / 'raw') 

599 cached_path(f"{conll_02_path}ned.train", data_folder / 'raw') 

600 

601 # we need to slightly modify the original files by adding some new lines after document separators 

602 train_data_file = data_folder / 'train.txt' 

603 if not train_data_file.is_file(): 

604 self.__offset_docstarts(data_folder / 'raw' / "ned.train", data_folder / 'train.txt') 

605 self.__offset_docstarts(data_folder / 'raw' / "ned.testa", data_folder / 'dev.txt') 

606 self.__offset_docstarts(data_folder / 'raw' / "ned.testb", data_folder / 'test.txt') 

607 

608 super(CONLL_03_DUTCH, self).__init__( 

609 data_folder, 

610 columns, 

611 train_file='train.txt', 

612 dev_file='dev.txt', 

613 test_file='test.txt', 

614 tag_to_bioes=tag_to_bioes, 

615 encoding="latin-1", 

616 in_memory=in_memory, 

617 document_separator_token="-DOCSTART-", 

618 **corpusargs, 

619 ) 

620 

621 @staticmethod 

622 def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]): 

623 with open(file_in, 'r', encoding="latin-1") as f: 

624 lines = f.readlines() 

625 with open(file_out, 'w', encoding="latin-1") as f: 

626 for line in lines: 

627 f.write(line) 

628 if line.startswith('-DOCSTART-'): 

629 f.write("\n") 

630 

631 

632class CONLL_03_SPANISH(ColumnCorpus): 

633 def __init__( 

634 self, 

635 base_path: Union[str, Path] = None, 

636 tag_to_bioes: str = "ner", 

637 in_memory: bool = True, 

638 **corpusargs, 

639 ): 

640 """ 

641 Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically 

642 download the dataset. 

643 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

644 to point to a different folder but typically this should not be necessary. 

645 :param tag_to_bioes: NER by default, should not be changed 

646 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

647 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

648 """ 

649 if type(base_path) == str: 

650 base_path: Path = Path(base_path) 

651 

652 # column format 

653 columns = {0: "text", 1: "ner"} 

654 

655 # this dataset name 

656 dataset_name = self.__class__.__name__.lower() 

657 

658 # default dataset folder is the cache root 

659 if not base_path: 

660 base_path = flair.cache_root / "datasets" 

661 data_folder = base_path / dataset_name 

662 

663 # download data if necessary 

664 conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" 

665 cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) 

666 cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) 

667 cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) 

668 

669 super(CONLL_03_SPANISH, self).__init__( 

670 data_folder, 

671 columns, 

672 tag_to_bioes=tag_to_bioes, 

673 encoding="latin-1", 

674 in_memory=in_memory, 

675 **corpusargs, 

676 ) 

677 

678 

679class CONLL_2000(ColumnCorpus): 

680 def __init__( 

681 self, 

682 base_path: Union[str, Path] = None, 

683 tag_to_bioes: str = "np", 

684 in_memory: bool = True, 

685 **corpusargs, 

686 ): 

687 """ 

688 Initialize the CoNLL-2000 corpus for English chunking. 

689 The first time you call this constructor it will automatically download the dataset. 

690 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

691 to point to a different folder but typically this should not be necessary. 

692 :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags 

693 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

694 """ 

695 if type(base_path) == str: 

696 base_path: Path = Path(base_path) 

697 

698 # column format 

699 columns = {0: "text", 1: "pos", 2: "np"} 

700 

701 # this dataset name 

702 dataset_name = self.__class__.__name__.lower() 

703 

704 # default dataset folder is the cache root 

705 if not base_path: 

706 base_path = flair.cache_root / "datasets" 

707 data_folder = base_path / dataset_name 

708 

709 # download data if necessary 

710 conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" 

711 data_file = flair.cache_root / "datasets" / dataset_name / "train.txt" 

712 if not data_file.is_file(): 

713 cached_path( 

714 f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name 

715 ) 

716 cached_path( 

717 f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name 

718 ) 

719 import gzip, shutil 

720 

721 with gzip.open( 

722 flair.cache_root / "datasets" / dataset_name / "train.txt.gz", 

723 "rb", 

724 ) as f_in: 

725 with open( 

726 flair.cache_root / "datasets" / dataset_name / "train.txt", 

727 "wb", 

728 ) as f_out: 

729 shutil.copyfileobj(f_in, f_out) 

730 with gzip.open( 

731 flair.cache_root / "datasets" / dataset_name / "test.txt.gz", "rb" 

732 ) as f_in: 

733 with open( 

734 flair.cache_root / "datasets" / dataset_name / "test.txt", 

735 "wb", 

736 ) as f_out: 

737 shutil.copyfileobj(f_in, f_out) 

738 

739 super(CONLL_2000, self).__init__( 

740 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

741 ) 

742 

743 

744class WNUT_17(ColumnCorpus): 

745 def __init__( 

746 self, 

747 base_path: Union[str, Path] = None, 

748 tag_to_bioes: str = "ner", 

749 in_memory: bool = True, 

750 **corpusargs, 

751 ): 

752 if type(base_path) == str: 

753 base_path: Path = Path(base_path) 

754 

755 # column format 

756 columns = {0: "text", 1: "ner"} 

757 

758 # this dataset name 

759 dataset_name = self.__class__.__name__.lower() 

760 

761 # default dataset folder is the cache root 

762 if not base_path: 

763 base_path = flair.cache_root / "datasets" 

764 data_folder = base_path / dataset_name 

765 

766 # download data if necessary 

767 wnut_path = "https://noisy-text.github.io/2017/files/" 

768 cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) 

769 cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) 

770 cached_path( 

771 f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name 

772 ) 

773 

774 super(WNUT_17, self).__init__( 

775 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

776 ) 

777 

778 

779class BIOSCOPE(ColumnCorpus): 

780 def __init__( 

781 self, 

782 base_path: Union[str, Path] = None, 

783 in_memory: bool = True, 

784 **corpusargs, 

785 ): 

786 if type(base_path) == str: 

787 base_path: Path = Path(base_path) 

788 

789 # column format 

790 columns = {0: "text", 1: "tag"} 

791 

792 # this dataset name 

793 dataset_name = self.__class__.__name__.lower() 

794 

795 # default dataset folder is the cache root 

796 if not base_path: 

797 base_path = flair.cache_root / "datasets" 

798 data_folder = base_path / dataset_name 

799 

800 # download data if necessary 

801 bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" 

802 cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) 

803 

804 super(BIOSCOPE, self).__init__( 

805 data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs, 

806 ) 

807 

808 

809class NER_ARABIC_ANER(ColumnCorpus): 

810 def __init__( 

811 self, 

812 base_path: Union[str, Path] = None, 

813 tag_to_bioes: str = "ner", 

814 in_memory: bool = True, 

815 document_as_sequence: bool = False, 

816 **corpusargs, 

817 ): 

818 """ 

819 Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available 

820 from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. 

821 http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp 

822 Column order is swapped 

823 The first time you call this constructor it will automatically download the dataset. 

824 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

825 to point to a different folder but typically this should not be necessary. 

826 :param tag_to_bioes: NER by default, need not be changed. 

827 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

828 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

829 """ 

830 if type(base_path) == str: 

831 base_path: Path = Path(base_path) 

832 

833 # column format 

834 columns = {0: "text", 1: "ner"} 

835 

836 # this dataset name 

837 dataset_name = self.__class__.__name__.lower() 

838 

839 # default dataset folder is the cache root 

840 if not base_path: 

841 base_path = flair.cache_root / "datasets" 

842 data_folder = base_path / dataset_name 

843 

844 # download data if necessary 

845 anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" 

846 # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) 

847 cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) 

848 

849 super(NER_ARABIC_ANER, self).__init__( 

850 data_folder, 

851 columns, 

852 tag_to_bioes=tag_to_bioes, 

853 encoding="utf-8", 

854 in_memory=in_memory, 

855 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

856 **corpusargs, 

857 ) 

858 

859 

860class NER_ARABIC_AQMAR(ColumnCorpus): 

861 def __init__( 

862 self, 

863 base_path: Union[str, Path] = None, 

864 tag_to_bioes: str = "ner", 

865 in_memory: bool = True, 

866 document_as_sequence: bool = False, 

867 **corpusargs, 

868 ): 

869 """ 

870 Initialize a preprocessed and modified version of the American and Qatari Modeling of Arabic (AQMAR) dataset available 

871 from http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip. 

872 via http://www.cs.cmu.edu/~ark/AQMAR/ 

873 

874 - Modifications from original dataset: Miscellaneous tags (MIS0, MIS1, MIS2, MIS3) are merged to one tag "MISC" as these categories deviate across the original dataset 

875 - The 28 original Wikipedia articles are merged into a single file containing the articles in alphabetical order 

876 

877 The first time you call this constructor it will automatically download the dataset. 

878 

879 This dataset is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. 

880 please cite: "Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah A. Smith (2012), 

881 Recall-Oriented Learning of Named Entities in Arabic Wikipedia. Proceedings of EACL." 

882 

883 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 

884 :param tag_to_bioes: NER by default 

885 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

886 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

887 """ 

888 if type(base_path) == str: 

889 base_path: Path = Path(base_path) 

890 

891 # column format 

892 columns = {0: "text", 1: "ner"} 

893 

894 # this dataset name 

895 dataset_name = self.__class__.__name__.lower() 

896 

897 # default dataset folder is the cache root 

898 if not base_path: 

899 base_path = flair.cache_root / "datasets" 

900 data_folder = base_path / dataset_name 

901 

902 # download data if necessary 

903 aqmar_path = "https://megantosh.s3.eu-central-1.amazonaws.com/AQMAR/" 

904 # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) 

905 cached_path(f"{aqmar_path}train.txt", Path("datasets") / dataset_name) 

906 

907 super(NER_ARABIC_AQMAR, self).__init__( 

908 data_folder, 

909 columns, 

910 tag_to_bioes=tag_to_bioes, 

911 encoding="utf-8", 

912 in_memory=in_memory, 

913 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

914 **corpusargs, 

915 ) 

916 

917 

918class NER_BASQUE(ColumnCorpus): 

919 def __init__( 

920 self, 

921 base_path: Union[str, Path] = None, 

922 tag_to_bioes: str = "ner", 

923 in_memory: bool = True, 

924 **corpusargs, 

925 ): 

926 if type(base_path) == str: 

927 base_path: Path = Path(base_path) 

928 

929 # column format 

930 columns = {0: "text", 1: "ner"} 

931 

932 # this dataset name 

933 dataset_name = self.__class__.__name__.lower() 

934 

935 # default dataset folder is the cache root 

936 if not base_path: 

937 base_path = flair.cache_root / "datasets" 

938 data_folder = base_path / dataset_name 

939 

940 # download data if necessary 

941 ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" 

942 data_path = flair.cache_root / "datasets" / dataset_name 

943 data_file = data_path / "named_ent_eu.train" 

944 if not data_file.is_file(): 

945 cached_path( 

946 f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name 

947 ) 

948 import tarfile, shutil 

949 

950 with tarfile.open( 

951 flair.cache_root / "datasets" / dataset_name / "eiec_v1.0.tgz", 

952 "r:gz", 

953 ) as f_in: 

954 corpus_files = ( 

955 "eiec_v1.0/named_ent_eu.train", 

956 "eiec_v1.0/named_ent_eu.test", 

957 ) 

958 for corpus_file in corpus_files: 

959 f_in.extract(corpus_file, data_path) 

960 shutil.move(f"{data_path}/{corpus_file}", data_path) 

961 

962 super(NER_BASQUE, self).__init__( 

963 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

964 ) 

965 

966 

967class NER_CHINESE_WEIBO(ColumnCorpus): 

968 def __init__( 

969 self, 

970 base_path: Union[str, Path] = None, 

971 tag_to_bioes: str = "ner", 

972 in_memory: bool = True, 

973 document_as_sequence: bool = False, 

974 **corpusargs, 

975 ): 

976 """ 

977 Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically 

978 download the dataset. 

979 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

980 to point to a different folder but typically this should not be necessary. 

981 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

982 POS tags instead 

983 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

984 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

985 """ 

986 if type(base_path) == str: 

987 base_path: Path = Path(base_path) 

988 

989 # column format 

990 columns = {0: 'text', 1: 'ner'} 

991 

992 # this dataset name 

993 dataset_name = self.__class__.__name__.lower() 

994 

995 # default dataset folder is the cache root 

996 if not base_path: 

997 base_path = flair.cache_root / "datasets" 

998 data_folder = base_path / dataset_name 

999 

1000 # download data if necessary 

1001 weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" 

1002 cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) 

1003 cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) 

1004 cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) 

1005 

1006 super(NER_CHINESE_WEIBO, self).__init__( 

1007 data_folder, 

1008 columns, 

1009 tag_to_bioes=tag_to_bioes, 

1010 encoding="utf-8", 

1011 in_memory=in_memory, 

1012 train_file="weiboNER_2nd_conll_format.train", 

1013 test_file="weiboNER_2nd_conll_format.test", 

1014 dev_file="weiboNER_2nd_conll_format.dev", 

1015 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

1016 **corpusargs, 

1017 ) 

1018 

1019 

1020class NER_DANISH_DANE(ColumnCorpus): 

1021 def __init__( 

1022 self, 

1023 base_path: Union[str, Path] = None, 

1024 tag_to_bioes: str = "ner", 

1025 in_memory: bool = True, 

1026 **corpusargs, 

1027 ): 

1028 if type(base_path) == str: 

1029 base_path: Path = Path(base_path) 

1030 

1031 # column format 

1032 columns = {1: 'text', 3: 'pos', 9: 'ner'} 

1033 

1034 # this dataset name 

1035 dataset_name = self.__class__.__name__.lower() 

1036 

1037 # default dataset folder is the cache root 

1038 if not base_path: 

1039 base_path = flair.cache_root / "datasets" 

1040 data_folder = base_path / dataset_name 

1041 

1042 # download data if necessary 

1043 data_path = flair.cache_root / "datasets" / dataset_name 

1044 train_data_file = data_path / "ddt.train.conllu" 

1045 if not train_data_file.is_file(): 

1046 temp_file = cached_path( 

1047 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', 

1048 Path("datasets") / dataset_name 

1049 ) 

1050 from zipfile import ZipFile 

1051 

1052 with ZipFile(temp_file, 'r') as zip_file: 

1053 zip_file.extractall(path=data_path) 

1054 

1055 # Remove CoNLL-U meta information in the last column 

1056 for part in ['train', 'dev', 'test']: 

1057 lines = [] 

1058 data_file = "ddt.{}.conllu".format(part) 

1059 with open(data_path / data_file, 'r') as file: 

1060 for line in file: 

1061 if line.startswith("#") or line == "\n": 

1062 lines.append(line) 

1063 lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) 

1064 

1065 with open(data_path / data_file, 'w') as file: 

1066 file.writelines(lines) 

1067 

1068 print(data_path / data_file) 

1069 

1070 super(NER_DANISH_DANE, self).__init__( 

1071 data_folder, columns, tag_to_bioes=tag_to_bioes, 

1072 in_memory=in_memory, comment_symbol="#", 

1073 **corpusargs, 

1074 ) 

1075 

1076 

1077class NER_ENGLISH_MOVIE_SIMPLE(ColumnCorpus): 

1078 def __init__( 

1079 self, 

1080 base_path: Union[str, Path] = None, 

1081 tag_to_bioes: str = "ner", 

1082 in_memory: bool = True, 

1083 **corpusargs, 

1084 ): 

1085 """ 

1086 Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) 

1087 in BIO format. The first time you call this constructor it will automatically download the dataset. 

1088 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1089 to point to a different folder but typically this should not be necessary. 

1090 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1091 POS tags instead 

1092 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1093 """ 

1094 # column format 

1095 columns = {0: "ner", 1: "text"} 

1096 

1097 # dataset name 

1098 dataset_name = self.__class__.__name__.lower() 

1099 

1100 # data folder: default dataset folder is the cache root 

1101 if type(base_path) == str: 

1102 base_path: Path = Path(base_path) 

1103 if not base_path: 

1104 base_path: Path = flair.cache_root / "datasets" 

1105 data_folder = base_path / dataset_name 

1106 

1107 # download data if necessary 

1108 mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" 

1109 train_file = "engtrain.bio" 

1110 test_file = "engtest.bio" 

1111 cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) 

1112 cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) 

1113 

1114 super(NER_ENGLISH_MOVIE_SIMPLE, self).__init__( 

1115 data_folder, 

1116 columns, 

1117 train_file=train_file, 

1118 test_file=test_file, 

1119 tag_to_bioes=tag_to_bioes, 

1120 in_memory=in_memory, 

1121 **corpusargs, 

1122 ) 

1123 

1124 

1125class NER_ENGLISH_MOVIE_COMPLEX(ColumnCorpus): 

1126 def __init__( 

1127 self, 

1128 base_path: Union[str, Path] = None, 

1129 tag_to_bioes: str = "ner", 

1130 in_memory: bool = True, 

1131 **corpusargs, 

1132 ): 

1133 """ 

1134 Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) 

1135 in BIO format. The first time you call this constructor it will automatically download the dataset. 

1136 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1137 to point to a different folder but typically this should not be necessary. 

1138 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1139 POS tags instead 

1140 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1141 """ 

1142 # column format 

1143 columns = {0: "ner", 1: "text"} 

1144 

1145 # dataset name 

1146 dataset_name = self.__class__.__name__.lower() 

1147 

1148 # data folder: default dataset folder is the cache root 

1149 if type(base_path) == str: 

1150 base_path: Path = Path(base_path) 

1151 if not base_path: 

1152 base_path: Path = flair.cache_root / "datasets" 

1153 data_folder = base_path / dataset_name 

1154 

1155 # download data if necessary 

1156 mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" 

1157 train_file = "trivia10k13train.bio" 

1158 test_file = "trivia10k13test.bio" 

1159 cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) 

1160 cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) 

1161 

1162 super(NER_ENGLISH_MOVIE_COMPLEX, self).__init__( 

1163 data_folder, 

1164 columns, 

1165 train_file=train_file, 

1166 test_file=test_file, 

1167 tag_to_bioes=tag_to_bioes, 

1168 in_memory=in_memory, 

1169 **corpusargs, 

1170 ) 

1171 

1172 

1173class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus): 

1174 """ 

1175 Initialize corpus of SEC-fillings annotated with English NER tags. See paper "Domain Adaption of Named Entity 

1176 Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/ 

1177 :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine 

1178 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict 

1179 POS tags or chunks respectively 

1180 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1181 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1182 """ 

1183 

1184 def __init__( 

1185 self, 

1186 base_path: Union[str, Path] = None, 

1187 tag_to_bioes: str = "ner", 

1188 in_memory: bool = True, 

1189 **corpusargs, 

1190 ): 

1191 

1192 if type(base_path) == str: 

1193 base_path: Path = Path(base_path) 

1194 

1195 # column format 

1196 columns = {0: "text", 1: "pos", 3: "ner"} 

1197 

1198 # this dataset name 

1199 dataset_name = self.__class__.__name__.lower() 

1200 

1201 # default dataset folder is the cache root 

1202 if not base_path: 

1203 base_path = flair.cache_root / "datasets" 

1204 data_folder = base_path / dataset_name 

1205 

1206 # download data if necessary 

1207 SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" 

1208 cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) 

1209 cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) 

1210 

1211 super(NER_ENGLISH_SEC_FILLINGS, self).__init__( 

1212 data_folder, 

1213 columns, 

1214 tag_to_bioes=tag_to_bioes, 

1215 encoding="utf-8", 

1216 in_memory=in_memory, 

1217 train_file='FIN5.txt', 

1218 test_file="FIN3.txt", 

1219 skip_first_line=True, 

1220 **corpusargs, 

1221 ) 

1222 

1223 

1224class NER_ENGLISH_RESTAURANT(ColumnCorpus): 

1225 def __init__( 

1226 self, 

1227 base_path: Union[str, Path] = None, 

1228 tag_to_bioes: str = "ner", 

1229 in_memory: bool = True, 

1230 **corpusargs, 

1231 ): 

1232 """ 

1233 Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. 

1234 The first time you call this constructor it will automatically download the dataset. 

1235 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1236 to point to a different folder but typically this should not be necessary. 

1237 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1238 POS tags instead 

1239 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1240 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1241 """ 

1242 if type(base_path) == str: 

1243 base_path: Path = Path(base_path) 

1244 

1245 # column format 

1246 columns = {0: "text", 1: "ner"} 

1247 

1248 # this dataset name 

1249 dataset_name = self.__class__.__name__.lower() 

1250 

1251 # default dataset folder is the cache root 

1252 if not base_path: 

1253 base_path = flair.cache_root / "datasets" 

1254 data_folder = base_path / dataset_name 

1255 

1256 # download data if necessary 

1257 mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" 

1258 cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) 

1259 cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) 

1260 

1261 super(NER_ENGLISH_RESTAURANT, self).__init__( 

1262 data_folder, 

1263 columns, 

1264 tag_to_bioes=tag_to_bioes, 

1265 encoding="latin-1", 

1266 in_memory=in_memory, 

1267 **corpusargs, 

1268 ) 

1269 

1270 

1271class NER_ENGLISH_STACKOVERFLOW(ColumnCorpus): 

1272 def __init__( 

1273 self, 

1274 base_path: Union[str, Path] = None, 

1275 tag_to_bioes: str = "ner", 

1276 in_memory: bool = True, 

1277 **corpusargs, 

1278 ): 

1279 """ 

1280 Initialize the STACKOVERFLOW_NER corpus. The first time you call this constructor it will automatically 

1281 download the dataset. 

1282 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1283 to point to a different folder but typically this should not be necessary. 

1284 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1285 POS tags instead 

1286 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1287 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1288 """ 

1289 if type(base_path) == str: 

1290 base_path: Path = Path(base_path) 

1291 

1292 """ 

1293 The Datasets are represented in the Conll format. 

1294 In this format each line of the Dataset is in the following format: 

1295 <word>+"\t"+<NE>"\t"+<word>+"\t"<markdown> 

1296 The end of sentence is marked with an empty line. 

1297 In each line NE represented the human annotated named entity  

1298 and <markdown> represented the code tags provided by the users who wrote the posts. 

1299 """ 

1300 # column format 

1301 columns = {0: "word", 1: "ner", 3: "markdown"} 

1302 

1303 # entity_mapping 

1304 entity_mapping = {"Library_Function": "Function", 

1305 "Function_Name": "Function", 

1306 "Class_Name": "Class", 

1307 "Library_Class": "Class", 

1308 "Organization": "Website", 

1309 "Library_Variable": "Variable", 

1310 "Variable_Name": "Variable", 

1311 "Error_Name": "O", 

1312 "Keyboard_IP": "O", 

1313 "Value": "O", 

1314 "Output_Block": "O" 

1315 } 

1316 

1317 # this dataset name 

1318 dataset_name = self.__class__.__name__.lower() 

1319 

1320 # default dataset folder is the cache root 

1321 if not base_path: 

1322 base_path = flair.cache_root / "datasets" 

1323 data_folder = base_path / dataset_name 

1324 

1325 # download data if necessary 

1326 STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/" 

1327 

1328 # data validation 

1329 banned_sentences = ["code omitted for annotation", 

1330 "omitted for annotation", 

1331 "CODE_BLOCK :", 

1332 "OP_BLOCK :", 

1333 "Question_URL :", 

1334 "Question_ID :" 

1335 ] 

1336 

1337 files = ["train", "test", "dev"] 

1338 

1339 for file in files: 

1340 questions = 0 

1341 answers = 0 

1342 

1343 cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name) 

1344 for line in open(data_folder / (file + ".txt"), mode="r", encoding="utf-8"): 

1345 if line.startswith("Question_ID"): 

1346 questions += 1 

1347 

1348 if line.startswith("Answer_to_Question_ID"): 

1349 answers += 1 

1350 log.info(f"File {file} has {questions} questions and {answers} answers.") 

1351 

1352 super(NER_ENGLISH_STACKOVERFLOW, self).__init__( 

1353 data_folder, 

1354 columns, 

1355 train_file="train.txt", 

1356 test_file="test.txt", 

1357 dev_file="dev.txt", 

1358 tag_to_bioes=tag_to_bioes, 

1359 encoding="utf-8", 

1360 banned_sentences=banned_sentences, 

1361 in_memory=in_memory, 

1362 label_name_map=entity_mapping, 

1363 **corpusargs 

1364 ) 

1365 

1366 

1367class NER_ENGLISH_TWITTER(ColumnCorpus): 

1368 def __init__( 

1369 self, 

1370 base_path: Union[str, Path] = None, 

1371 tag_to_bioes: str = "ner", 

1372 in_memory: bool = True, 

1373 **corpusargs, 

1374 ): 

1375 """ 

1376 Initialize a dataset called twitter_ner which can be found on the following page: 

1377 https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. 

1378 

1379 The first time you call this constructor it will automatically 

1380 download the dataset. 

1381 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1382 to point to a different folder but typically this should not be necessary. 

1383 :param tag_to_bioes: NER by default, need not be changed 

1384 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1385 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1386 """ 

1387 if type(base_path) == str: 

1388 base_path: Path = Path(base_path) 

1389 

1390 # column format 

1391 columns = {0: 'text', 1: 'ner'} 

1392 

1393 # this dataset name 

1394 dataset_name = self.__class__.__name__.lower() 

1395 

1396 # default dataset folder is the cache root 

1397 if not base_path: 

1398 base_path = flair.cache_root / "datasets" 

1399 data_folder = base_path / dataset_name 

1400 

1401 # download data if necessary 

1402 twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" 

1403 cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) 

1404 

1405 super(NER_ENGLISH_TWITTER, self).__init__( 

1406 data_folder, 

1407 columns, 

1408 tag_to_bioes=tag_to_bioes, 

1409 encoding="latin-1", 

1410 train_file="ner.txt", 

1411 in_memory=in_memory, 

1412 **corpusargs, 

1413 ) 

1414 

1415 

1416class NER_ENGLISH_PERSON(ColumnCorpus): 

1417 def __init__( 

1418 self, 

1419 base_path: Union[str, Path] = None, 

1420 in_memory: bool = True, 

1421 ): 

1422 """ 

1423 Initialize the PERSON_NER corpus for person names. The first time you call this constructor it will automatically 

1424 download the dataset. 

1425 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1426 to point to a different folder but typically this should not be necessary. 

1427 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1428 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1429 """ 

1430 

1431 if type(base_path) == str: 

1432 base_path: Path = Path(base_path) 

1433 

1434 # column format 

1435 columns = {0: "text", 1: "ner"} 

1436 

1437 # this dataset name 

1438 dataset_name = self.__class__.__name__.lower() 

1439 

1440 # default dataset folder is the cache root 

1441 if not base_path: 

1442 base_path = flair.cache_root / "datasets" 

1443 data_folder = base_path / dataset_name 

1444 

1445 # download data if necessary 

1446 conll_path = "https://raw.githubusercontent.com/das-sudeshna/genid/master/" 

1447 

1448 # download files if not present locallys 

1449 cached_path(f"{conll_path}conll-g.conll", data_folder / 'raw') 

1450 cached_path(f"{conll_path}ieer-g.conll", data_folder / 'raw') 

1451 cached_path(f"{conll_path}textbook-g.conll", data_folder / 'raw') 

1452 cached_path(f"{conll_path}wiki-g.conll", data_folder / 'raw') 

1453 

1454 self.__concatAllFiles(data_folder) 

1455 

1456 super(NER_ENGLISH_PERSON, self).__init__( 

1457 data_folder, 

1458 columns, 

1459 in_memory=in_memory, 

1460 train_file='bigFile.conll' 

1461 ) 

1462 

1463 @staticmethod 

1464 def __concatAllFiles(data_folder): 

1465 arr = os.listdir(data_folder / 'raw') 

1466 

1467 with open(data_folder / 'bigFile.conll', 'w') as outfile: 

1468 for fname in arr: 

1469 with open(data_folder / 'raw' / fname) as infile: 

1470 outfile.write(infile.read()) 

1471 

1472 

1473class NER_ENGLISH_WEBPAGES(ColumnCorpus): 

1474 def __init__( 

1475 self, 

1476 base_path: Union[str, Path] = None, 

1477 tag_to_bioes: str = "ner", 

1478 in_memory: bool = True, 

1479 **corpusargs, 

1480 ): 

1481 """ 

1482 Initialize the WEBPAGES_NER corpus introduced in the paper "Design Challenges and Misconceptions in Named Entity 

1483 Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/. 

1484 The first time you call this constructor it will automatically download the dataset. 

1485 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1486 to point to a different folder but typically this should not be necessary. 

1487 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1488 POS tags instead 

1489 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1490 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1491 """ 

1492 if type(base_path) == str: 

1493 base_path: Path = Path(base_path) 

1494 

1495 # column format 

1496 columns = {0: "ner", 5: "text"} 

1497 

1498 # this dataset name 

1499 dataset_name = self.__class__.__name__.lower() 

1500 

1501 # default dataset folder is the cache root 

1502 if not base_path: 

1503 base_path = Path(flair.cache_root) / "datasets" 

1504 data_folder = base_path / dataset_name 

1505 import tarfile 

1506 if not os.path.isfile(data_folder / 'webpages_ner.txt'): 

1507 # # download zip 

1508 tar_file = "https://cogcomp.seas.upenn.edu/Data/NERWebpagesColumns.tgz" 

1509 webpages_ner_path = cached_path(tar_file, Path("datasets") / dataset_name) 

1510 tf = tarfile.open(webpages_ner_path) 

1511 tf.extractall(data_folder) 

1512 tf.close() 

1513 outputfile = os.path.abspath(data_folder) 

1514 

1515 # merge the files in one as the zip is containing multiples files 

1516 

1517 with open(outputfile / data_folder / "webpages_ner.txt", "w+") as outfile: 

1518 for files in os.walk(outputfile): 

1519 f = files[1] 

1520 ff = os.listdir(outputfile / data_folder / f[-1]) 

1521 for i, file in enumerate(ff): 

1522 if file.endswith('.gold'): 

1523 with open(outputfile / data_folder / f[-1] / file, 'r+', errors='replace') as infile: 

1524 content = infile.read() 

1525 outfile.write(content) 

1526 break 

1527 

1528 super(NER_ENGLISH_WEBPAGES, self).__init__( 

1529 data_folder, 

1530 columns, 

1531 train_file='webpages_ner.txt', 

1532 tag_to_bioes=tag_to_bioes, 

1533 in_memory=in_memory, 

1534 **corpusargs, 

1535 ) 

1536 

1537 

1538class NER_ENGLISH_WNUT_2020(ColumnCorpus): 

1539 def __init__( 

1540 self, 

1541 base_path: Union[str, Path] = None, 

1542 tag_to_bioes: str = "ner", 

1543 in_memory: bool = True, 

1544 document_as_sequence: bool = False, 

1545 **corpusargs, 

1546 ): 

1547 """ 

1548 Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically 

1549 download the dataset. 

1550 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1551 to point to a different folder but typically this should not be necessary. 

1552 :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 

1553 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1554 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1555 """ 

1556 if type(base_path) == str: 

1557 base_path: Path = Path(base_path) 

1558 

1559 # column format 

1560 columns = {0: "text", 1: "ner"} 

1561 

1562 # this dataset name 

1563 dataset_name = self.__class__.__name__.lower() 

1564 

1565 # default dataset folder is the cache root 

1566 if not base_path: 

1567 base_path = flair.cache_root / "datasets" 

1568 data_folder = base_path / dataset_name 

1569 

1570 # download data if necessary 

1571 github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" 

1572 

1573 for sample in ["train", "test", "dev"]: 

1574 

1575 sample_file = data_folder / (sample + ".txt") 

1576 if not sample_file.is_file(): 

1577 

1578 zip_path = cached_path( 

1579 f"{github_url}", Path("datasets") / dataset_name 

1580 ) 

1581 

1582 # unzip the downloaded repo and merge the train, dev and test datasets 

1583 unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master 

1584 

1585 if sample == "test": 

1586 file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") 

1587 else: 

1588 file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") 

1589 filenames = os.listdir(file_path) 

1590 with open(data_folder / (sample + '.txt'), 'w') as outfile: 

1591 for fname in filenames: 

1592 with open(file_path / fname) as infile: 

1593 lines = infile.read() 

1594 outfile.write(lines) 

1595 

1596 shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done 

1597 

1598 super(NER_ENGLISH_WNUT_2020, self).__init__( 

1599 data_folder, 

1600 columns, 

1601 tag_to_bioes=tag_to_bioes, 

1602 encoding="utf-8", 

1603 in_memory=in_memory, 

1604 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

1605 **corpusargs, 

1606 ) 

1607 

1608 

1609class NER_ENGLISH_WIKIGOLD(ColumnCorpus): 

1610 def __init__( 

1611 self, 

1612 base_path: Union[str, Path] = None, 

1613 tag_to_bioes: str = "ner", 

1614 in_memory: bool = True, 

1615 document_as_sequence: bool = False, 

1616 **corpusargs, 

1617 ): 

1618 """ 

1619 Initialize the wikigold corpus. The first time you call this constructor it will automatically 

1620 download the dataset. 

1621 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1622 to point to a different folder but typically this should not be necessary. 

1623 :param tag_to_bioes: NER by default, should not be changed 

1624 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1625 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1626 """ 

1627 if type(base_path) == str: 

1628 base_path: Path = Path(base_path) 

1629 

1630 # column format 

1631 columns = {0: "text", 1: "ner"} 

1632 

1633 # this dataset name 

1634 dataset_name = self.__class__.__name__.lower() 

1635 

1636 # default dataset folder is the cache root 

1637 if not base_path: 

1638 base_path = flair.cache_root / "datasets" 

1639 data_folder = base_path / dataset_name 

1640 

1641 # download data if necessary 

1642 wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" 

1643 cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) 

1644 

1645 super(NER_ENGLISH_WIKIGOLD, self).__init__( 

1646 data_folder, 

1647 columns, 

1648 tag_to_bioes=tag_to_bioes, 

1649 encoding="utf-8", 

1650 in_memory=in_memory, 

1651 train_file='wikigold.conll.txt', 

1652 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

1653 **corpusargs, 

1654 ) 

1655 

1656 

1657class NER_FINNISH(ColumnCorpus): 

1658 def __init__( 

1659 self, 

1660 base_path: Union[str, Path] = None, 

1661 tag_to_bioes: str = "ner", 

1662 in_memory: bool = True, 

1663 **corpusargs, 

1664 ): 

1665 if type(base_path) == str: 

1666 base_path: Path = Path(base_path) 

1667 

1668 # column format 

1669 columns = {0: "text", 1: "ner"} 

1670 

1671 # this dataset name 

1672 dataset_name = self.__class__.__name__.lower() 

1673 

1674 # default dataset folder is the cache root 

1675 if not base_path: 

1676 base_path = flair.cache_root / "datasets" 

1677 data_folder = base_path / dataset_name 

1678 

1679 # download data if necessary 

1680 ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 

1681 cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) 

1682 cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) 

1683 cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) 

1684 

1685 self._remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) 

1686 

1687 super(NER_FINNISH, self).__init__( 

1688 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, 

1689 ) 

1690 

1691 def _remove_lines_without_annotations(self, data_file: Union[str, Path] = None): 

1692 with open(data_file, 'r') as f: 

1693 lines = f.readlines() 

1694 with open(data_file, 'w') as f: 

1695 for line in lines: 

1696 if len(line.split()) != 1: 

1697 f.write(line) 

1698 

1699 

1700class NER_GERMAN_BIOFID(ColumnCorpus): 

1701 def __init__( 

1702 self, 

1703 base_path: Union[str, Path] = None, 

1704 tag_to_bioes: str = "ner", 

1705 in_memory: bool = True, 

1706 **corpusargs, 

1707 ): 

1708 if type(base_path) == str: 

1709 base_path: Path = Path(base_path) 

1710 

1711 # column format 

1712 columns = {0: "text", 1: "lemma", 2: "pos", 3: "ner"} 

1713 

1714 # this dataset name 

1715 dataset_name = self.__class__.__name__.lower() 

1716 

1717 # default dataset folder is the cache root 

1718 if not base_path: 

1719 base_path = flair.cache_root / "datasets" 

1720 data_folder = base_path / dataset_name 

1721 

1722 # download data if necessary 

1723 biofid_path = "https://raw.githubusercontent.com/texttechnologylab/BIOfid/master/BIOfid-Dataset-NER/" 

1724 cached_path(f"{biofid_path}train.conll", Path("datasets") / dataset_name) 

1725 cached_path(f"{biofid_path}dev.conll", Path("datasets") / dataset_name) 

1726 cached_path(f"{biofid_path}test.conll", Path("datasets") / dataset_name) 

1727 

1728 super(NER_GERMAN_BIOFID, self).__init__( 

1729 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

1730 ) 

1731 

1732 

1733class NER_GERMAN_EUROPARL(ColumnCorpus): 

1734 def __init__( 

1735 self, 

1736 base_path: Union[str, Path] = None, 

1737 tag_to_bioes: str = "ner", 

1738 in_memory: bool = True, 

1739 **corpusargs, 

1740 ): 

1741 """ 

1742 Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically 

1743 download the dataset. 

1744 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1745 to point to a different folder but typically this should not be necessary. 

1746 :param tag_to_bioes: 'ner' by default, should not be changed. 

1747 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 

1748 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1749 """ 

1750 

1751 if type(base_path) == str: 

1752 base_path: Path = Path(base_path) 

1753 

1754 # column format 

1755 columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} 

1756 

1757 # this dataset name 

1758 dataset_name = self.__class__.__name__.lower() 

1759 

1760 # default dataset folder is the cache root 

1761 if not base_path: 

1762 base_path = flair.cache_root / "datasets" 

1763 data_folder = base_path / dataset_name 

1764 

1765 # download data if necessary 

1766 europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" 

1767 cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) 

1768 cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) 

1769 

1770 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) 

1771 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) 

1772 

1773 super(NER_GERMAN_EUROPARL, self).__init__( 

1774 data_folder, 

1775 columns, 

1776 tag_to_bioes=tag_to_bioes, 

1777 encoding="latin-1", 

1778 in_memory=in_memory, 

1779 train_file='ep-96-04-16.conll', 

1780 test_file='ep-96-04-15.conll', 

1781 **corpusargs, 

1782 ) 

1783 

1784 def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): 

1785 """ 

1786 Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead 

1787 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects 

1788 the letter 'O'. Additionally it removes lines with no tags in the data file and can also 

1789 be used if the data is only partially IOB tagged. 

1790 Parameters 

1791 ---------- 

1792 data_file : Union[str, Path] 

1793 Path to the data file. 

1794 encoding : str, optional 

1795 Encoding used in open function. The default is "utf8". 

1796 ner_column : int, optional 

1797 Specifies the ner-tagged column. The default is 1 (the second column). 

1798 

1799 """ 

1800 

1801 def add_I_prefix(current_line: List[str], ner: int, tag: str): 

1802 for i in range(0, len(current_line)): 

1803 if i == 0: 

1804 f.write(line_list[i]) 

1805 elif i == ner: 

1806 f.write(' I-' + tag) 

1807 else: 

1808 f.write(' ' + current_line[i]) 

1809 f.write('\n') 

1810 

1811 with open(file=data_file, mode='r', encoding=encoding) as f: 

1812 lines = f.readlines() 

1813 with open(file=data_file, mode='w', encoding=encoding) as f: 

1814 pred = 'O' # remembers ner tag of predecessing line 

1815 for line in lines: 

1816 line_list = line.split() 

1817 if len(line_list) > 2: # word with tags 

1818 ner_tag = line_list[ner_column] 

1819 if ner_tag in ['0', 'O']: # no chunk 

1820 for i in range(0, len(line_list)): 

1821 if i == 0: 

1822 f.write(line_list[i]) 

1823 elif i == ner_column: 

1824 f.write(' O') 

1825 else: 

1826 f.write(' ' + line_list[i]) 

1827 f.write('\n') 

1828 pred = 'O' 

1829 elif '-' not in ner_tag: # no IOB tags 

1830 if pred == 'O': # found a new chunk 

1831 add_I_prefix(line_list, ner_column, ner_tag) 

1832 pred = ner_tag 

1833 else: # found further part of chunk or new chunk directly after old chunk 

1834 add_I_prefix(line_list, ner_column, ner_tag) 

1835 pred = ner_tag 

1836 else: # line already has IOB tag (tag contains '-') 

1837 f.write(line) 

1838 pred = ner_tag.split('-')[1] 

1839 elif len(line_list) == 0: # empty line 

1840 f.write('\n') 

1841 pred = 'O' 

1842 

1843 

1844class NER_GERMAN_LEGAL(ColumnCorpus): 

1845 def __init__( 

1846 self, 

1847 base_path: Union[str, Path] = None, 

1848 tag_to_bioes: str = "ner", 

1849 in_memory: bool = True, 

1850 **corpusargs, 

1851 ): 

1852 """ 

1853 Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically 

1854 download the dataset. 

1855 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1856 to point to a different folder but typically this should not be necessary. 

1857 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 

1858 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1859 """ 

1860 

1861 if type(base_path) == str: 

1862 base_path: Path = Path(base_path) 

1863 

1864 # column format 

1865 columns = {0: "text", 1: "ner"} 

1866 

1867 # this dataset name 

1868 dataset_name = self.__class__.__name__.lower() 

1869 

1870 # default dataset folder is the cache root 

1871 if not base_path: 

1872 base_path = flair.cache_root / "datasets" 

1873 data_folder = base_path / dataset_name 

1874 

1875 # download data if necessary 

1876 ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" 

1877 cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) 

1878 

1879 super(NER_GERMAN_LEGAL, self).__init__( 

1880 data_folder, 

1881 columns, 

1882 tag_to_bioes=tag_to_bioes, 

1883 in_memory=in_memory, 

1884 train_file='ler.conll', 

1885 **corpusargs, 

1886 ) 

1887 

1888 

1889class NER_GERMAN_GERMEVAL(ColumnCorpus): 

1890 def __init__( 

1891 self, 

1892 base_path: Union[str, Path] = None, 

1893 tag_to_bioes: str = "ner", 

1894 in_memory: bool = True, 

1895 **corpusargs, 

1896 ): 

1897 """ 

1898 Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your 

1899 machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. 

1900 Then point the base_path parameter in the constructor to this folder 

1901 :param base_path: Path to the GermEval corpus on your machine 

1902 :param tag_to_bioes: 'ner' by default, should not be changed. 

1903 :param in_memory:If True, keeps dataset in memory giving speedups in training. 

1904 """ 

1905 if type(base_path) == str: 

1906 base_path: Path = Path(base_path) 

1907 

1908 # column format 

1909 columns = {1: "text", 2: "ner"} 

1910 

1911 # this dataset name 

1912 dataset_name = self.__class__.__name__.lower() 

1913 

1914 # default dataset folder is the cache root 

1915 if not base_path: 

1916 base_path = flair.cache_root / "datasets" 

1917 data_folder = base_path / dataset_name 

1918 

1919 # check if data there 

1920 if not data_folder.exists(): 

1921 # create folder 

1922 os.makedirs(data_folder) 

1923 

1924 # download dataset 

1925 import gdown 

1926 gdown.download(url="https://drive.google.com/uc?id={}".format("1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P"), 

1927 output=str(data_folder / 'train.tsv')) 

1928 gdown.download(url="https://drive.google.com/uc?id={}".format("1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH"), 

1929 output=str(data_folder / 'test.tsv')) 

1930 gdown.download(url="https://drive.google.com/uc?id={}".format("1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm"), 

1931 output=str(data_folder / 'dev.tsv')) 

1932 

1933 super(NER_GERMAN_GERMEVAL, self).__init__( 

1934 data_folder, 

1935 columns, 

1936 tag_to_bioes=tag_to_bioes, 

1937 comment_symbol="#", 

1938 in_memory=in_memory, 

1939 **corpusargs, 

1940 ) 

1941 

1942 

1943class NER_GERMAN_POLITICS(ColumnCorpus): 

1944 def __init__( 

1945 self, 

1946 base_path: Union[str, Path] = None, 

1947 tag_to_bioes: str = "ner", 

1948 column_delimiter: str = r"\s+", 

1949 in_memory: bool = True, 

1950 **corpusargs, 

1951 ): 

1952 """ 

1953 Initialize corpus with Named Entity Model for German, Politics (NEMGP) data from 

1954 https://www.thomas-zastrow.de/nlp/. The first time you call this constructor it will automatically download the 

1955 dataset. 

1956 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1957 to point to a different folder but typically this should not be necessary. 

1958 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

1959 POS tags instead 

1960 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1961 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1962 """ 

1963 if type(base_path) == str: 

1964 base_path: Path = Path(base_path) 

1965 

1966 # column format 

1967 columns = {0: "text", 1: "ner"} 

1968 

1969 # this dataset name 

1970 dataset_name = self.__class__.__name__.lower() 

1971 

1972 # default dataset folder is the cache root 

1973 if not base_path: 

1974 base_path = flair.cache_root / "datasets" 

1975 data_folder = base_path / dataset_name 

1976 

1977 # download and parse data if necessary 

1978 german_politics_path = "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" 

1979 corpus_file_name = "nemgp_trainingdata_01.txt" 

1980 parsed_dataset = data_folder / "raw" / corpus_file_name 

1981 

1982 if not parsed_dataset.exists(): 

1983 german_politics_zip = cached_path(f"{german_politics_path}", Path("datasets") / dataset_name / "raw") 

1984 unpack_file(german_politics_zip, data_folder / "raw", "zip", False) 

1985 self._convert_to_column_corpus(parsed_dataset) 

1986 

1987 # create train test dev if not exist 

1988 train_dataset = data_folder / "train.txt" 

1989 if not train_dataset.exists(): 

1990 self._create_datasets(parsed_dataset, data_folder) 

1991 

1992 super(NER_GERMAN_POLITICS, self).__init__( 

1993 data_folder, 

1994 columns, 

1995 column_delimiter=column_delimiter, 

1996 train_file='train.txt', 

1997 dev_file='dev.txt', 

1998 test_file='test.txt', 

1999 tag_to_bioes=tag_to_bioes, 

2000 encoding="utf-8", 

2001 in_memory=in_memory, 

2002 **corpusargs, 

2003 ) 

2004 

2005 def _convert_to_column_corpus(self, data_file: Union[str, Path]): 

2006 with open(data_file, 'r', encoding='utf-8') as f: 

2007 lines = f.readlines() 

2008 with open(data_file, 'w', encoding='utf-8') as f: 

2009 tag_bool = False 

2010 new_sentence = True 

2011 for line in lines: 

2012 line = re.sub('\s{2,}', ' ', line).strip().split(' ') 

2013 for substr in line: 

2014 if substr == '.': 

2015 f.write("\n") 

2016 new_sentence = True 

2017 elif "<START:" in substr: 

2018 tag_bool = True 

2019 tag = substr.strip('<START:').strip('>') 

2020 if 'loc' in tag: 

2021 tag_IOB = '-LOC' 

2022 elif 'per' in tag: 

2023 tag_IOB = '-PER' 

2024 elif 'org' in tag: 

2025 tag_IOB = '-ORG' 

2026 elif 'misc' in tag: 

2027 tag_IOB = '-MISC' 

2028 elif "<END>" in substr: 

2029 tag_bool = False 

2030 new_sentence = True 

2031 else: 

2032 if tag_bool: 

2033 if new_sentence is True: 

2034 start = 'B' 

2035 new_sentence = False 

2036 else: 

2037 start = 'I' 

2038 f.write(substr.strip(' ') + " " + start + tag_IOB + "\n") 

2039 else: 

2040 f.write(substr.strip(' ') + " " + 'O' + "\n") 

2041 

2042 def _create_datasets(self, data_file: Union[str, Path], data_folder: Union[str, Path]): 

2043 with open(data_file, 'r') as file: 

2044 num_lines = len(file.readlines()) 

2045 file.seek(0) 

2046 

2047 train_len = round(num_lines * 0.8) 

2048 test_len = round(num_lines * 0.1) 

2049 dev_len = num_lines - train_len - test_len 

2050 

2051 train = open(data_folder / "train.txt", "w") 

2052 test = open(data_folder / "test.txt", "w") 

2053 dev = open(data_folder / "dev.txt", "w") 

2054 

2055 k = 0 

2056 for line in file.readlines(): 

2057 k += 1 

2058 if k <= train_len: 

2059 train.write(line) 

2060 elif k > train_len and k <= (train_len + test_len): 

2061 test.write(line) 

2062 elif k > (train_len + test_len) and k <= num_lines: 

2063 dev.write(line) 

2064 

2065 

2066class NER_HUNGARIAN(ColumnCorpus): 

2067 def __init__( 

2068 self, 

2069 base_path: Union[str, Path] = None, 

2070 tag_to_bioes: str = "ner", 

2071 in_memory: bool = True, 

2072 document_as_sequence: bool = False, 

2073 **corpusargs, 

2074 ): 

2075 """ 

2076 Initialize the NER Business corpus for Hungarian. The first time you call this constructor it will automatically 

2077 download the dataset. 

2078 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2079 to point to a different folder but typically this should not be necessary. 

2080 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

2081 POS tags instead 

2082 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2083 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

2084 """ 

2085 if type(base_path) == str: 

2086 base_path: Path = Path(base_path) 

2087 

2088 # column format 

2089 columns = {0: "text", 1: "ner"} 

2090 

2091 # this dataset name 

2092 dataset_name = self.__class__.__name__.lower() 

2093 

2094 # default dataset folder is the cache root 

2095 if not base_path: 

2096 base_path = flair.cache_root / "datasets" 

2097 data_folder = base_path / dataset_name 

2098 

2099 # If the extracted corpus file is not yet present in dir 

2100 if not os.path.isfile(data_folder / 'hun_ner_corpus.txt'): 

2101 # download zip if necessary 

2102 hun_ner_path = "https://rgai.sed.hu/sites/rgai.sed.hu/files/business_NER.zip" 

2103 path_to_zipped_corpus = cached_path(hun_ner_path, Path("datasets") / dataset_name) 

2104 # extracted corpus is not present , so unpacking it. 

2105 unpack_file( 

2106 path_to_zipped_corpus, 

2107 data_folder, 

2108 mode="zip", 

2109 keep=True 

2110 ) 

2111 

2112 super(NER_HUNGARIAN, self).__init__( 

2113 data_folder, 

2114 columns, 

2115 train_file='hun_ner_corpus.txt', 

2116 column_delimiter='\t', 

2117 tag_to_bioes=tag_to_bioes, 

2118 encoding="latin-1", 

2119 in_memory=in_memory, 

2120 label_name_map={'0': 'O'}, 

2121 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

2122 **corpusargs, 

2123 ) 

2124 

2125 

2126class NER_ICELANDIC(ColumnCorpus): 

2127 def __init__( 

2128 self, 

2129 base_path: Union[str, Path] = None, 

2130 tag_to_bioes: str = "ner", 

2131 in_memory: bool = True, 

2132 **corpusargs, 

2133 ): 

2134 """ 

2135 Initialize the ICELANDIC_NER corpus. The first time you call this constructor it will automatically 

2136 download the dataset. 

2137 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2138 to point to a different folder but typically this should not be necessary. 

2139 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

2140 POS tags instead 

2141 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2142 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

2143 """ 

2144 if type(base_path) == str: 

2145 base_path: Path = Path(base_path) 

2146 

2147 # column format 

2148 columns = {0: "text", 1: "ner"} 

2149 

2150 # this dataset name 

2151 dataset_name = self.__class__.__name__.lower() 

2152 

2153 # default dataset folder is the cache root 

2154 if not base_path: 

2155 base_path = flair.cache_root / "datasets" 

2156 data_folder = base_path / dataset_name 

2157 

2158 if not os.path.isfile(data_folder / 'icelandic_ner.txt'): 

2159 # download zip 

2160 icelandic_ner = "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip" 

2161 icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name) 

2162 

2163 # unpacking the zip 

2164 unpack_file( 

2165 icelandic_ner_path, 

2166 data_folder, 

2167 mode="zip", 

2168 keep=True 

2169 ) 

2170 outputfile = os.path.abspath(data_folder) 

2171 

2172 # merge the files in one as the zip is containing multiples files 

2173 

2174 with open(outputfile / data_folder / "icelandic_ner.txt", "wb") as outfile: 

2175 for files in os.walk(outputfile / data_folder): 

2176 f = files[2] 

2177 

2178 for i in range(len(f)): 

2179 if f[i].endswith('.txt'): 

2180 with open(outputfile / data_folder / f[i], 'rb') as infile: 

2181 contents = infile.read() 

2182 outfile.write(contents) 

2183 

2184 super(NER_ICELANDIC, self).__init__( 

2185 data_folder, 

2186 columns, 

2187 train_file='icelandic_ner.txt', 

2188 tag_to_bioes=tag_to_bioes, 

2189 in_memory=in_memory, 

2190 **corpusargs, 

2191 ) 

2192 

2193 

2194class NER_JAPANESE(ColumnCorpus): 

2195 def __init__( 

2196 self, 

2197 base_path: Union[str, Path] = None, 

2198 tag_to_bioes: str = "ner", 

2199 in_memory: bool = True, 

2200 **corpusargs, 

2201 ): 

2202 """ 

2203 Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically 

2204 download the dataset. 

2205 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2206 to point to a different folder but typically this should not be necessary. 

2207 :param tag_to_bioes: NER by default. 

2208 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2209 """ 

2210 if type(base_path) == str: 

2211 base_path: Path = Path(base_path) 

2212 

2213 # column format 

2214 columns = {0: 'text', 1: 'ner'} 

2215 

2216 # this dataset name 

2217 dataset_name = self.__class__.__name__.lower() 

2218 

2219 # default dataset folder is the cache root 

2220 if not base_path: 

2221 base_path = flair.cache_root / "datasets" 

2222 data_folder = base_path / dataset_name 

2223 

2224 # download data from github if necessary (hironsan.txt, ja.wikipedia.conll) 

2225 IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/" 

2226 

2227 # download files if not present locally 

2228 cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw') 

2229 cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw') 

2230 

2231 # we need to modify the original files by adding new lines after after the end of each sentence 

2232 train_data_file = data_folder / 'train.txt' 

2233 if not train_data_file.is_file(): 

2234 self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt') 

2235 self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt') 

2236 

2237 super(NER_JAPANESE, self).__init__( 

2238 data_folder, 

2239 columns, 

2240 train_file='train.txt', 

2241 tag_to_bioes=tag_to_bioes, 

2242 in_memory=in_memory, 

2243 **corpusargs, 

2244 ) 

2245 

2246 @staticmethod 

2247 def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): 

2248 with open(file_in, 'r') as f: 

2249 lines = f.readlines() 

2250 with open(file_out, 'a') as f: 

2251 for line in lines: 

2252 if (line[0] == "。"): 

2253 f.write(line) 

2254 f.write("\n") 

2255 elif (line[0] == "\n"): 

2256 continue 

2257 else: 

2258 f.write(line) 

2259 

2260 @staticmethod 

2261 def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): 

2262 with open(file_in, 'r') as f: 

2263 lines = f.readlines() 

2264 with open(file_out, 'a') as f: 

2265 for line in lines: 

2266 sp_line = line.split("\t") 

2267 if (sp_line[0] == "\n"): 

2268 f.write("\n") 

2269 else: 

2270 f.write(sp_line[0] + "\t" + sp_line[len(sp_line) - 1]) 

2271 

2272 

2273class NER_MASAKHANE(MultiCorpus): 

2274 def __init__( 

2275 self, 

2276 languages: Union[str, List[str]] = "luo", 

2277 base_path: Union[str, Path] = None, 

2278 tag_to_bioes: str = "ner", 

2279 in_memory: bool = True, 

2280 **corpusargs, 

2281 ): 

2282 """ 

2283 Initialize the Masakhane corpus available on https://github.com/masakhane-io/masakhane-ner/tree/main/data. 

2284 It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus 

2285 with the languages you require. If you pass "all", all languages will be initialized. 

2286 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2287 to point to a different folder but typically this should not be necessary. 

2288 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

2289 POS tags instead 

2290 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2291 """ 

2292 if type(base_path) == str: 

2293 base_path: Path = Path(base_path) 

2294 

2295 # if only one language is given 

2296 if type(languages) == str: 

2297 languages = [languages] 

2298 

2299 # column format 

2300 columns = {0: "text", 1: "ner"} 

2301 

2302 # this dataset name 

2303 dataset_name = self.__class__.__name__.lower() 

2304 

2305 # default dataset folder is the cache root 

2306 if not base_path: 

2307 base_path = flair.cache_root / "datasets" 

2308 data_folder = base_path / dataset_name 

2309 

2310 language_to_code = {"amharic": "amh", 

2311 "hausa": "hau", 

2312 "igbo": "ibo", 

2313 "kinyarwanda": "kin", 

2314 "luganda": "lug", 

2315 "luo": "luo", 

2316 "naija": "pcm", 

2317 "swahili": "swa", 

2318 "yoruba": "yor", 

2319 "wolof": "wol", 

2320 } 

2321 

2322 # use all languages if explicitly set to "all" 

2323 if languages == ["all"]: languages = language_to_code.values() 

2324 

2325 corpora = [] 

2326 for language in languages: 

2327 

2328 if language in language_to_code.keys(): 

2329 language = language_to_code[language] 

2330 

2331 if language not in language_to_code.values(): 

2332 log.error(f"Language '{language}' is not in list of supported languages!") 

2333 log.error(f"Supported are '{language_to_code.values()}'!") 

2334 log.error(f"Instantiate this Corpus for instance like so 'corpus = NER_MASAKHANE(languages='luo')'") 

2335 raise Exception() 

2336 

2337 language_folder = data_folder / language 

2338 

2339 # download data if necessary 

2340 data_path = f"https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/{language}/" 

2341 cached_path(f"{data_path}dev.txt", language_folder) 

2342 cached_path(f"{data_path}test.txt", language_folder) 

2343 cached_path(f"{data_path}train.txt", language_folder) 

2344 

2345 # initialize comlumncorpus and add it to list 

2346 log.info(f"Reading data for language {language}") 

2347 corp = ColumnCorpus(data_folder=language_folder, 

2348 column_format=columns, 

2349 tag_to_bioes=tag_to_bioes, 

2350 encoding="utf-8", 

2351 in_memory=in_memory, 

2352 name=language, 

2353 **corpusargs, 

2354 ) 

2355 corpora.append(corp) 

2356 

2357 super(NER_MASAKHANE, self).__init__( 

2358 corpora, 

2359 name='masakhane-' + '-'.join(languages), 

2360 ) 

2361 

2362 

2363class NER_MULTI_WIKIANN(MultiCorpus): 

2364 def __init__( 

2365 self, 

2366 languages: Union[str, List[str]] = "en", 

2367 base_path: Union[str, Path] = None, 

2368 tag_to_bioes: str = "ner", 

2369 in_memory: bool = False, 

2370 **corpusargs, 

2371 ): 

2372 """ 

2373 WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist 

2374 in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their 

2375 respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) 

2376 Parameters 

2377 ---------- 

2378 languages : Union[str, List[str]] 

2379 Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. 

2380 The datasets of all passed languages will be saved in one MultiCorpus. 

2381 (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. 

2382 This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) 

2383 base_path : Union[str, Path], optional 

2384 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2385 to point to a different folder but typically this should not be necessary. 

2386 tag_to_bioes : str, optional 

2387 The data is in bio-format. It will by default (with the string "ner" as value) be transformed 

2388 into the bioes format. If you dont want that set it to None. 

2389 

2390 """ 

2391 if type(languages) == str: 

2392 languages = [languages] 

2393 

2394 if type(base_path) == str: 

2395 base_path: Path = Path(base_path) 

2396 

2397 # column format 

2398 columns = {0: "text", 1: "ner"} 

2399 

2400 # this dataset name 

2401 dataset_name = self.__class__.__name__.lower() 

2402 

2403 # default dataset folder is the cache root 

2404 if not base_path: 

2405 base_path = flair.cache_root / "datasets" 

2406 data_folder = base_path / dataset_name 

2407 

2408 # For each language in languages, the file is downloaded if not existent 

2409 # Then a comlumncorpus of that data is created and saved in a list 

2410 # this list is handed to the multicorpus 

2411 

2412 # list that contains the columncopora 

2413 corpora = [] 

2414 

2415 google_drive_path = 'https://drive.google.com/uc?id=' 

2416 # download data if necessary 

2417 first = True 

2418 for language in languages: 

2419 

2420 language_folder = data_folder / language 

2421 file_name = 'wikiann-' + language + '.bio' 

2422 

2423 # if language not downloaded yet, download it 

2424 if not language_folder.exists(): 

2425 if first: 

2426 import gdown 

2427 import tarfile 

2428 first = False 

2429 # create folder 

2430 os.makedirs(language_folder) 

2431 # get google drive id from list 

2432 google_id = self._google_drive_id_from_language_name(language) 

2433 url = google_drive_path + google_id 

2434 

2435 # download from google drive 

2436 gdown.download(url, str(language_folder / language) + '.tar.gz') 

2437 

2438 # unzip 

2439 log.info("Extracting data...") 

2440 tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") 

2441 # tar.extractall(language_folder,members=[tar.getmember(file_name)]) 

2442 tar.extract(file_name, str(language_folder)) 

2443 tar.close() 

2444 log.info('...done.') 

2445 

2446 # transform data into required format 

2447 # the processed dataset has the additional ending "_new" 

2448 log.info("Processing dataset...") 

2449 self._silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) 

2450 # remove the unprocessed dataset 

2451 os.remove(str(language_folder / file_name)) 

2452 log.info('...done.') 

2453 

2454 # initialize comlumncorpus and add it to list 

2455 log.info(f"Reading data for language {language}") 

2456 corp = ColumnCorpus(data_folder=language_folder, 

2457 column_format=columns, 

2458 train_file=file_name + '_new', 

2459 tag_to_bioes=tag_to_bioes, 

2460 in_memory=in_memory, 

2461 **corpusargs, 

2462 ) 

2463 corpora.append(corp) 

2464 log.info("...done.") 

2465 

2466 super(NER_MULTI_WIKIANN, self).__init__( 

2467 corpora, name='wikiann', 

2468 ) 

2469 

2470 def _silver_standard_to_simple_ner_annotation(self, data_file: Union[str, Path]): 

2471 f_read = open(data_file, 'r', encoding='utf-8') 

2472 f_write = open(data_file + '_new', 'w+', encoding='utf-8') 

2473 while True: 

2474 line = f_read.readline() 

2475 if line: 

2476 if line == '\n': 

2477 f_write.write(line) 

2478 else: 

2479 liste = line.split() 

2480 f_write.write(liste[0] + ' ' + liste[-1] + '\n') 

2481 else: 

2482 break 

2483 f_read.close() 

2484 f_write.close() 

2485 

2486 def _google_drive_id_from_language_name(self, language): 

2487 languages_ids = { 

2488 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer 

2489 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', 

2490 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', 

2491 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', 

2492 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', 

2493 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', 

2494 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', 

2495 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', 

2496 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', 

2497 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', 

2498 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', 

2499 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', 

2500 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', 

2501 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', 

2502 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', 

2503 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', 

2504 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', 

2505 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', 

2506 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', 

2507 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', 

2508 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', 

2509 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', 

2510 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', 

2511 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', 

2512 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', 

2513 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', 

2514 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', 

2515 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', 

2516 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', 

2517 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', 

2518 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', 

2519 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', 

2520 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', 

2521 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', 

2522 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', 

2523 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', 

2524 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', 

2525 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', 

2526 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', 

2527 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', 

2528 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', 

2529 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', 

2530 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', 

2531 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer 

2532 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', 

2533 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', 

2534 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', 

2535 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', 

2536 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', 

2537 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', 

2538 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', 

2539 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', 

2540 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', 

2541 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', 

2542 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', 

2543 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', 

2544 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', 

2545 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', 

2546 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', 

2547 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', 

2548 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', 

2549 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', 

2550 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', 

2551 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', 

2552 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', 

2553 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', 

2554 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', 

2555 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', 

2556 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', 

2557 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', 

2558 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', 

2559 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', 

2560 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', 

2561 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', 

2562 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', 

2563 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', 

2564 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', 

2565 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', 

2566 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', 

2567 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', 

2568 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', 

2569 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', 

2570 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', 

2571 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', 

2572 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', 

2573 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', 

2574 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', 

2575 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', 

2576 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', 

2577 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', 

2578 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', 

2579 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', 

2580 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', 

2581 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', 

2582 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', 

2583 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', 

2584 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', 

2585 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', 

2586 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer 

2587 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', 

2588 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', 

2589 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 

2590 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', 

2591 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', 

2592 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer 

2593 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', 

2594 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', 

2595 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', 

2596 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', 

2597 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer 

2598 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', 

2599 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', 

2600 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', 

2601 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', 

2602 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', 

2603 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', 

2604 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', 

2605 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer 

2606 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', 

2607 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', 

2608 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', 

2609 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', 

2610 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', 

2611 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', 

2612 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', 

2613 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', 

2614 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer 

2615 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', 

2616 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', 

2617 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', 

2618 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', 

2619 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', 

2620 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', 

2621 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer 

2622 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', 

2623 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', 

2624 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', 

2625 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', 

2626 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', 

2627 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', 

2628 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', 

2629 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', 

2630 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', 

2631 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', 

2632 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', 

2633 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', 

2634 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', 

2635 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', 

2636 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', 

2637 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', 

2638 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', 

2639 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', 

2640 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', 

2641 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', 

2642 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', 

2643 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', 

2644 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', 

2645 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', 

2646 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', 

2647 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', 

2648 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', 

2649 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', 

2650 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', 

2651 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', 

2652 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', 

2653 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', 

2654 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', 

2655 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', 

2656 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', 

2657 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', 

2658 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', 

2659 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', 

2660 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer 

2661 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', 

2662 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', 

2663 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', 

2664 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', 

2665 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', 

2666 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', 

2667 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', 

2668 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', 

2669 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', 

2670 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', 

2671 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', 

2672 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', 

2673 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', 

2674 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', 

2675 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', 

2676 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', 

2677 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', 

2678 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', 

2679 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', 

2680 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', 

2681 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', 

2682 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer 

2683 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', 

2684 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', 

2685 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', 

2686 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', 

2687 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', 

2688 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', 

2689 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', 

2690 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', 

2691 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', 

2692 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', 

2693 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', 

2694 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', 

2695 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', 

2696 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', 

2697 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', 

2698 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', 

2699 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', 

2700 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', 

2701 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', 

2702 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', 

2703 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', 

2704 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', 

2705 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', 

2706 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', 

2707 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', 

2708 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', 

2709 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', 

2710 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', 

2711 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', 

2712 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', 

2713 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', 

2714 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', 

2715 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', 

2716 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', 

2717 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', 

2718 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', 

2719 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', 

2720 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', 

2721 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', 

2722 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', 

2723 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', 

2724 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', 

2725 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', 

2726 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', 

2727 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', 

2728 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', 

2729 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', 

2730 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', 

2731 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', 

2732 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', 

2733 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', 

2734 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', 

2735 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', 

2736 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', 

2737 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', 

2738 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer 

2739 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', 

2740 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', 

2741 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', 

2742 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', 

2743 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', 

2744 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', 

2745 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', 

2746 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', 

2747 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', 

2748 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', 

2749 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', 

2750 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', 

2751 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', 

2752 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', 

2753 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', 

2754 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', 

2755 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', 

2756 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', 

2757 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', 

2758 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', 

2759 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', 

2760 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', 

2761 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', 

2762 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', 

2763 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', 

2764 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', 

2765 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', 

2766 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', 

2767 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', 

2768 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', 

2769 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', 

2770 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', 

2771 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', 

2772 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', 

2773 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', 

2774 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', 

2775 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', 

2776 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', 

2777 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', 

2778 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', 

2779 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', 

2780 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', 

2781 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', 

2782 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' 

2783 } 

2784 return languages_ids[language] 

2785 

2786 

2787class NER_MULTI_XTREME(MultiCorpus): 

2788 def __init__( 

2789 self, 

2790 languages: Union[str, List[str]] = "en", 

2791 base_path: Union[str, Path] = None, 

2792 tag_to_bioes: str = "ner", 

2793 in_memory: bool = False, 

2794 **corpusargs, 

2795 ): 

2796 """ 

2797 Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google 

2798 research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 

2799 "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) 

2800 The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) 

2801 

2802 Parameters 

2803 ---------- 

2804 languages : Union[str, List[str]], optional 

2805 Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings 

2806 consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. 

2807 base_path : Union[str, Path], optional 

2808 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2809 to point to a different folder but typically this should not be necessary. 

2810 tag_to_bioes : str, optional 

2811 The data is in bio-format. It will by default (with the string "ner" as value) be transformed 

2812 into the bioes format. If you dont want that set it to None. 

2813 

2814 """ 

2815 # if no languages are given as argument all languages used in XTREME will be loaded 

2816 if not languages: 

2817 languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", 

2818 "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", 

2819 "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] 

2820 

2821 # if only one language is given 

2822 if type(languages) == str: 

2823 languages = [languages] 

2824 

2825 if type(base_path) == str: 

2826 base_path: Path = Path(base_path) 

2827 

2828 # column format 

2829 columns = {0: "text", 1: "ner"} 

2830 

2831 # this dataset name 

2832 dataset_name = self.__class__.__name__.lower() 

2833 

2834 # default dataset folder is the cache root 

2835 if not base_path: 

2836 base_path = flair.cache_root / "datasets" 

2837 data_folder = base_path / dataset_name 

2838 

2839 # For each language in languages, the file is downloaded if not existent 

2840 # Then a comlumncorpus of that data is created and saved in a list 

2841 # This list is handed to the multicorpus 

2842 

2843 # list that contains the columncopora 

2844 corpora = [] 

2845 

2846 hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" 

2847 

2848 # download data if necessary 

2849 for language in languages: 

2850 

2851 language_folder = data_folder / language 

2852 

2853 # if language not downloaded yet, download it 

2854 if not language_folder.exists(): 

2855 

2856 file_name = language + '.tar.gz' 

2857 # create folder 

2858 os.makedirs(language_folder) 

2859 

2860 # download from HU Server 

2861 temp_file = cached_path( 

2862 hu_path + "/" + file_name, 

2863 Path("datasets") / dataset_name / language 

2864 ) 

2865 

2866 # unzip 

2867 log.info("Extracting data...") 

2868 import tarfile 

2869 tar = tarfile.open(str(temp_file), "r:gz") 

2870 for part in ["train", "test", "dev"]: 

2871 tar.extract(part, str(language_folder)) 

2872 tar.close() 

2873 log.info('...done.') 

2874 

2875 # transform data into required format 

2876 log.info("Processing dataset...") 

2877 for part in ["train", "test", "dev"]: 

2878 self._xtreme_to_simple_ner_annotation(str(language_folder / part)) 

2879 log.info('...done.') 

2880 

2881 # initialize comlumncorpus and add it to list 

2882 log.info(f"Reading data for language {language}") 

2883 corp = ColumnCorpus(data_folder=language_folder, 

2884 column_format=columns, 

2885 tag_to_bioes=tag_to_bioes, 

2886 in_memory=in_memory, 

2887 **corpusargs, 

2888 ) 

2889 corpora.append(corp) 

2890 

2891 super(NER_MULTI_XTREME, self).__init__( 

2892 corpora, name='xtreme', 

2893 ) 

2894 

2895 def _xtreme_to_simple_ner_annotation(self, data_file: Union[str, Path]): 

2896 with open(data_file, 'r', encoding='utf-8') as f: 

2897 lines = f.readlines() 

2898 with open(data_file, 'w', encoding='utf-8') as f: 

2899 for line in lines: 

2900 if line == '\n': 

2901 f.write(line) 

2902 else: 

2903 liste = line.split() 

2904 f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') 

2905 

2906 

2907class NER_MULTI_WIKINER(MultiCorpus): 

2908 def __init__( 

2909 self, 

2910 languages: Union[str, List[str]] = "en", 

2911 base_path: Union[str, Path] = None, 

2912 tag_to_bioes: str = "ner", 

2913 in_memory: bool = False, 

2914 **corpusargs, 

2915 ): 

2916 if type(base_path) == str: 

2917 base_path: Path = Path(base_path) 

2918 

2919 # if only one language is given 

2920 if type(languages) == str: 

2921 languages = [languages] 

2922 

2923 # column format 

2924 columns = {0: "text", 1: "pos", 2: "ner"} 

2925 

2926 # this dataset name 

2927 dataset_name = self.__class__.__name__.lower() 

2928 

2929 # default dataset folder is the cache root 

2930 if not base_path: 

2931 base_path = flair.cache_root / "datasets" 

2932 data_folder = base_path / dataset_name 

2933 

2934 corpora = [] 

2935 for language in languages: 

2936 language_folder = data_folder / language 

2937 

2938 # download data if necessary 

2939 self._download_wikiner(language, language_folder) 

2940 

2941 # initialize comlumncorpus and add it to list 

2942 log.info(f"Read data for language {language}") 

2943 corp = ColumnCorpus(data_folder=language_folder, 

2944 column_format=columns, 

2945 tag_to_bioes=tag_to_bioes, 

2946 in_memory=in_memory, 

2947 **corpusargs, 

2948 ) 

2949 corpora.append(corp) 

2950 

2951 super(NER_MULTI_WIKINER, self).__init__( 

2952 corpora, name='wikiner', 

2953 ) 

2954 

2955 def _download_wikiner(self, language_code: str, dataset_name: str): 

2956 # download data if necessary 

2957 wikiner_path = ( 

2958 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" 

2959 ) 

2960 lc = language_code 

2961 

2962 data_file = ( 

2963 flair.cache_root 

2964 / "datasets" 

2965 / dataset_name 

2966 / f"aij-wikiner-{lc}-wp3.train" 

2967 ) 

2968 if not data_file.is_file(): 

2969 

2970 cached_path( 

2971 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name 

2972 ) 

2973 import bz2, shutil 

2974 

2975 # unpack and write out in CoNLL column-like format 

2976 bz_file = bz2.BZ2File( 

2977 flair.cache_root 

2978 / "datasets" 

2979 / dataset_name 

2980 / f"aij-wikiner-{lc}-wp3.bz2", 

2981 "rb", 

2982 ) 

2983 with bz_file as f, open( 

2984 flair.cache_root 

2985 / "datasets" 

2986 / dataset_name 

2987 / f"aij-wikiner-{lc}-wp3.train", 

2988 "w", 

2989 encoding="utf-8" 

2990 ) as out: 

2991 for line in f: 

2992 line = line.decode("utf-8") 

2993 words = line.split(" ") 

2994 for word in words: 

2995 out.write("\t".join(word.split("|")) + "\n") 

2996 

2997 

2998class NER_SWEDISH(ColumnCorpus): 

2999 def __init__( 

3000 self, 

3001 base_path: Union[str, Path] = None, 

3002 tag_to_bioes: str = "ner", 

3003 in_memory: bool = True, 

3004 **corpusargs, 

3005 ): 

3006 """ 

3007 Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically 

3008 download the dataset. 

3009 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3010 to point to a different folder but typically this should not be necessary. 

3011 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3012 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3013 """ 

3014 

3015 if type(base_path) == str: 

3016 base_path: Path = Path(base_path) 

3017 

3018 # column format 

3019 columns = {0: "text", 1: "ner"} 

3020 

3021 # this dataset name 

3022 dataset_name = self.__class__.__name__.lower() 

3023 

3024 # default dataset folder is the cache root 

3025 if not base_path: 

3026 base_path = flair.cache_root / "datasets" 

3027 data_folder = base_path / dataset_name 

3028 

3029 # download data if necessary 

3030 ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" 

3031 cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) 

3032 cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) 

3033 

3034 # data is not in IOB2 format. Thus we transform it to IOB2 

3035 self._add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) 

3036 self._add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) 

3037 

3038 super(NER_SWEDISH, self).__init__( 

3039 data_folder, 

3040 columns, 

3041 tag_to_bioes=tag_to_bioes, 

3042 in_memory=in_memory, 

3043 **corpusargs, 

3044 ) 

3045 

3046 def _add_IOB2_tags(self, data_file: Union[str, Path], encoding: str = "utf8"): 

3047 """ 

3048 Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead 

3049 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects 

3050 the letter 'O'. Additionally it removes lines with no tags in the data file and can also 

3051 be used if the data is only partially IOB tagged. 

3052 Parameters 

3053 ---------- 

3054 data_file : Union[str, Path] 

3055 Path to the data file. 

3056 encoding : str, optional 

3057 Encoding used in open function. The default is "utf8". 

3058 

3059 """ 

3060 with open(file=data_file, mode='r', encoding=encoding) as f: 

3061 lines = f.readlines() 

3062 with open(file=data_file, mode='w', encoding=encoding) as f: 

3063 pred = 'O' # remembers tag of predecessing line 

3064 for line in lines: 

3065 line_list = line.split() 

3066 if len(line_list) == 2: # word with tag 

3067 word = line_list[0] 

3068 tag = line_list[1] 

3069 if tag in ['0', 'O']: # no chunk 

3070 f.write(word + ' O\n') 

3071 pred = 'O' 

3072 elif '-' not in tag: # no IOB tags 

3073 if pred == 'O': # found a new chunk 

3074 f.write(word + ' B-' + tag + '\n') 

3075 pred = tag 

3076 else: # found further part of chunk or new chunk directly after old chunk 

3077 if pred == tag: 

3078 f.write(word + ' I-' + tag + '\n') 

3079 else: 

3080 f.write(word + ' B-' + tag + '\n') 

3081 pred = tag 

3082 else: # line already has IOB tag (tag contains '-') 

3083 f.write(line) 

3084 pred = tag.split('-')[1] 

3085 elif len(line_list) == 0: # empty line 

3086 f.write('\n') 

3087 pred = 'O' 

3088 

3089 

3090class NER_TURKU(ColumnCorpus): 

3091 def __init__( 

3092 self, 

3093 base_path: Union[str, Path] = None, 

3094 tag_to_bioes: str = "ner", 

3095 in_memory: bool = True, 

3096 **corpusargs, 

3097 ): 

3098 """ 

3099 Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically 

3100 download the dataset. 

3101 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3102 to point to a different folder but typically this should not be necessary. 

3103 :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict 

3104 POS tags instead 

3105 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3106 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3107 """ 

3108 if type(base_path) == str: 

3109 base_path: Path = Path(base_path) 

3110 

3111 # column format 

3112 columns = {0: "text", 1: "ner"} 

3113 

3114 # this dataset name 

3115 dataset_name = self.__class__.__name__.lower() 

3116 

3117 # default dataset folder is the cache root 

3118 if not base_path: 

3119 base_path = flair.cache_root / "datasets" 

3120 data_folder = base_path / dataset_name 

3121 

3122 # download data if necessary 

3123 conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" 

3124 dev_file = "dev.tsv" 

3125 test_file = "test.tsv" 

3126 train_file = "train.tsv" 

3127 cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) 

3128 cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) 

3129 cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) 

3130 

3131 super(NER_TURKU, self).__init__( 

3132 data_folder, 

3133 columns, 

3134 dev_file=dev_file, 

3135 test_file=test_file, 

3136 train_file=train_file, 

3137 column_delimiter="\t", 

3138 tag_to_bioes=tag_to_bioes, 

3139 encoding="latin-1", 

3140 in_memory=in_memory, 

3141 document_separator_token="-DOCSTART-", 

3142 **corpusargs, 

3143 ) 

3144 

3145 

3146class KEYPHRASE_SEMEVAL2017(ColumnCorpus): 

3147 def __init__( 

3148 self, 

3149 base_path: Union[str, Path] = None, 

3150 tag_to_bioes: str = "keyword", 

3151 in_memory: bool = True, 

3152 **corpusargs, 

3153 ): 

3154 

3155 if type(base_path) == str: 

3156 base_path: Path = Path(base_path) 

3157 

3158 # column format 

3159 columns = {0: "text", 1: "keyword"} 

3160 

3161 # this dataset name 

3162 dataset_name = self.__class__.__name__.lower() 

3163 

3164 # default dataset folder is the cache root 

3165 if not base_path: 

3166 base_path = flair.cache_root / "datasets" 

3167 data_folder = base_path / dataset_name 

3168 

3169 semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" 

3170 cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) 

3171 cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) 

3172 cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) 

3173 

3174 super(KEYPHRASE_SEMEVAL2017, self).__init__( 

3175 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3176 ) 

3177 

3178 

3179class KEYPHRASE_INSPEC(ColumnCorpus): 

3180 def __init__( 

3181 self, 

3182 base_path: Union[str, Path] = None, 

3183 tag_to_bioes: str = "keyword", 

3184 in_memory: bool = True, 

3185 **corpusargs, 

3186 ): 

3187 

3188 if type(base_path) == str: 

3189 base_path: Path = Path(base_path) 

3190 

3191 # column format 

3192 columns = {0: "text", 1: "keyword"} 

3193 

3194 # this dataset name 

3195 dataset_name = self.__class__.__name__.lower() 

3196 

3197 # default dataset folder is the cache root 

3198 if not base_path: 

3199 base_path = flair.cache_root / "datasets" 

3200 data_folder = base_path / dataset_name 

3201 

3202 inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" 

3203 cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) 

3204 cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) 

3205 if not "dev.txt" in os.listdir(data_folder): 

3206 cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) 

3207 # rename according to train - test - dev - convention 

3208 os.rename(data_folder / "valid.txt", data_folder / "dev.txt") 

3209 

3210 super(KEYPHRASE_INSPEC, self).__init__( 

3211 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3212 ) 

3213 

3214 

3215class KEYPHRASE_SEMEVAL2010(ColumnCorpus): 

3216 def __init__( 

3217 self, 

3218 base_path: Union[str, Path] = None, 

3219 tag_to_bioes: str = "keyword", 

3220 in_memory: bool = True, 

3221 **corpusargs, 

3222 ): 

3223 

3224 if type(base_path) == str: 

3225 base_path: Path = Path(base_path) 

3226 

3227 # column format 

3228 columns = {0: "text", 1: "keyword"} 

3229 

3230 # this dataset name 

3231 dataset_name = self.__class__.__name__.lower() 

3232 

3233 # default dataset folder is the cache root 

3234 if not base_path: 

3235 base_path = flair.cache_root / "datasets" 

3236 data_folder = base_path / dataset_name 

3237 

3238 semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" 

3239 cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) 

3240 cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) 

3241 

3242 super(KEYPHRASE_SEMEVAL2010, self).__init__( 

3243 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3244 ) 

3245 

3246 

3247class UP_CHINESE(ColumnCorpus): 

3248 def __init__( 

3249 self, 

3250 base_path: Union[str, Path] = None, 

3251 in_memory: bool = True, 

3252 document_as_sequence: bool = False, 

3253 **corpusargs, 

3254 ): 

3255 """ 

3256 Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: 

3257 https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese 

3258 

3259 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3260 to point to a different folder but typically this should not be necessary. 

3261 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3262 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3263 """ 

3264 if type(base_path) == str: 

3265 base_path: Path = Path(base_path) 

3266 

3267 # column format 

3268 columns = {1: "text", 9: "frame"} 

3269 

3270 # this dataset name 

3271 dataset_name = self.__class__.__name__.lower() 

3272 

3273 # default dataset folder is the cache root 

3274 if not base_path: 

3275 base_path = flair.cache_root / "datasets" 

3276 data_folder = base_path / dataset_name 

3277 

3278 # download data if necessary 

3279 up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" 

3280 cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) 

3281 cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) 

3282 cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) 

3283 

3284 super(UP_CHINESE, self).__init__( 

3285 data_folder, 

3286 columns, 

3287 encoding="utf-8", 

3288 train_file="zh-up-train.conllu", 

3289 test_file="zh-up-test.conllu", 

3290 dev_file="zh-up-dev.conllu", 

3291 in_memory=in_memory, 

3292 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3293 comment_symbol="#", 

3294 **corpusargs, 

3295 ) 

3296 

3297 

3298class UP_ENGLISH(ColumnCorpus): 

3299 def __init__( 

3300 self, 

3301 base_path: Union[str, Path] = None, 

3302 in_memory: bool = True, 

3303 document_as_sequence: bool = False, 

3304 **corpusargs, 

3305 ): 

3306 """ 

3307 Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: 

3308 https://github.com/System-T/UniversalPropositions. 

3309 

3310 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3311 to point to a different folder but typically this should not be necessary. 

3312 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3313 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3314 """ 

3315 if type(base_path) == str: 

3316 base_path: Path = Path(base_path) 

3317 

3318 # column format 

3319 columns = {1: "text", 10: "frame"} 

3320 

3321 # this dataset name 

3322 dataset_name = self.__class__.__name__.lower() 

3323 

3324 # default dataset folder is the cache root 

3325 if not base_path: 

3326 base_path = flair.cache_root / "datasets" 

3327 data_folder = base_path / dataset_name 

3328 

3329 # download data if necessary 

3330 up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" 

3331 cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) 

3332 cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) 

3333 cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) 

3334 

3335 super(UP_ENGLISH, self).__init__( 

3336 data_folder, 

3337 columns, 

3338 encoding="utf-8", 

3339 train_file="en_ewt-up-train.conllu", 

3340 test_file="en_ewt-up-test.conllu", 

3341 dev_file="en_ewt-up-dev.conllu", 

3342 in_memory=in_memory, 

3343 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3344 comment_symbol="#", 

3345 label_name_map={"_": "O"}, 

3346 **corpusargs, 

3347 ) 

3348 

3349 

3350class UP_FRENCH(ColumnCorpus): 

3351 def __init__( 

3352 self, 

3353 base_path: Union[str, Path] = None, 

3354 in_memory: bool = True, 

3355 document_as_sequence: bool = False, 

3356 **corpusargs, 

3357 ): 

3358 """ 

3359 Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: 

3360 https://github.com/System-T/UniversalPropositions. 

3361 

3362 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3363 to point to a different folder but typically this should not be necessary. 

3364 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3365 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3366 """ 

3367 if type(base_path) == str: 

3368 base_path: Path = Path(base_path) 

3369 

3370 # column format 

3371 columns = {1: "text", 9: "frame"} 

3372 

3373 # this dataset name 

3374 dataset_name = self.__class__.__name__.lower() 

3375 

3376 # default dataset folder is the cache root 

3377 if not base_path: 

3378 base_path = flair.cache_root / "datasets" 

3379 data_folder = base_path / dataset_name 

3380 

3381 # download data if necessary 

3382 up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" 

3383 cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) 

3384 cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) 

3385 cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) 

3386 

3387 super(UP_FRENCH, self).__init__( 

3388 data_folder, 

3389 columns, 

3390 encoding="utf-8", 

3391 train_file="fr-up-train.conllu", 

3392 test_file="fr-up-test.conllu", 

3393 dev_file="fr-up-dev.conllu", 

3394 in_memory=in_memory, 

3395 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3396 comment_symbol="#", 

3397 **corpusargs, 

3398 ) 

3399 

3400 

3401class UP_FINNISH(ColumnCorpus): 

3402 def __init__( 

3403 self, 

3404 base_path: Union[str, Path] = None, 

3405 in_memory: bool = True, 

3406 document_as_sequence: bool = False, 

3407 **corpusargs, 

3408 ): 

3409 """ 

3410 Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: 

3411 https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish 

3412 

3413 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3414 to point to a different folder but typically this should not be necessary. 

3415 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3416 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3417 """ 

3418 if type(base_path) == str: 

3419 base_path: Path = Path(base_path) 

3420 

3421 # column format 

3422 columns = {1: "text", 9: "frame"} 

3423 

3424 # this dataset name 

3425 dataset_name = self.__class__.__name__.lower() 

3426 

3427 # default dataset folder is the cache root 

3428 if not base_path: 

3429 base_path = flair.cache_root / "datasets" 

3430 data_folder = base_path / dataset_name 

3431 

3432 # download data if necessary 

3433 up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" 

3434 cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) 

3435 cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) 

3436 cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) 

3437 

3438 super(UP_FINNISH, self).__init__( 

3439 data_folder, 

3440 columns, 

3441 encoding="utf-8", 

3442 train_file="fi-up-train.conllu", 

3443 test_file="fi-up-test.conllu", 

3444 dev_file="fi-up-dev.conllu", 

3445 in_memory=in_memory, 

3446 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3447 comment_symbol="#", 

3448 **corpusargs, 

3449 ) 

3450 

3451 

3452class UP_GERMAN(ColumnCorpus): 

3453 def __init__( 

3454 self, 

3455 base_path: Union[str, Path] = None, 

3456 in_memory: bool = True, 

3457 document_as_sequence: bool = False, 

3458 **corpusargs, 

3459 ): 

3460 """ 

3461 Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: 

3462 https://github.com/System-T/UniversalPropositions. 

3463 

3464 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3465 to point to a different folder but typically this should not be necessary. 

3466 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3467 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3468 """ 

3469 if type(base_path) == str: 

3470 base_path: Path = Path(base_path) 

3471 

3472 # column format 

3473 columns = {1: "text", 9: "frame"} 

3474 

3475 # this dataset name 

3476 dataset_name = self.__class__.__name__.lower() 

3477 

3478 # default dataset folder is the cache root 

3479 if not base_path: 

3480 base_path = flair.cache_root / "datasets" 

3481 data_folder = base_path / dataset_name 

3482 

3483 # download data if necessary 

3484 up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" 

3485 cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) 

3486 cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) 

3487 cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) 

3488 

3489 super(UP_GERMAN, self).__init__( 

3490 data_folder, 

3491 columns, 

3492 encoding="utf-8", 

3493 train_file="de-up-train.conllu", 

3494 test_file="de-up-test.conllu", 

3495 dev_file="de-up-dev.conllu", 

3496 in_memory=in_memory, 

3497 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3498 comment_symbol="#", 

3499 **corpusargs, 

3500 ) 

3501 

3502 

3503class UP_ITALIAN(ColumnCorpus): 

3504 def __init__( 

3505 self, 

3506 base_path: Union[str, Path] = None, 

3507 in_memory: bool = True, 

3508 document_as_sequence: bool = False, 

3509 **corpusargs, 

3510 ): 

3511 """ 

3512 Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: 

3513 https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian 

3514 

3515 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3516 to point to a different folder but typically this should not be necessary. 

3517 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3518 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3519 """ 

3520 if type(base_path) == str: 

3521 base_path: Path = Path(base_path) 

3522 

3523 # column format 

3524 columns = {1: "text", 9: "frame"} 

3525 

3526 # this dataset name 

3527 dataset_name = self.__class__.__name__.lower() 

3528 

3529 # default dataset folder is the cache root 

3530 if not base_path: 

3531 base_path = flair.cache_root / "datasets" 

3532 data_folder = base_path / dataset_name 

3533 

3534 # download data if necessary 

3535 up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" 

3536 cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) 

3537 cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) 

3538 cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) 

3539 

3540 super(UP_ITALIAN, self).__init__( 

3541 data_folder, 

3542 columns, 

3543 encoding="utf-8", 

3544 train_file="it-up-train.conllu", 

3545 test_file="it-up-test.conllu", 

3546 dev_file="it-up-dev.conllu", 

3547 in_memory=in_memory, 

3548 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3549 comment_symbol="#", 

3550 **corpusargs, 

3551 ) 

3552 

3553 

3554class UP_SPANISH(ColumnCorpus): 

3555 def __init__( 

3556 self, 

3557 base_path: Union[str, Path] = None, 

3558 in_memory: bool = True, 

3559 document_as_sequence: bool = False, 

3560 **corpusargs, 

3561 ): 

3562 """ 

3563 Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: 

3564 https://github.com/System-T/UniversalPropositions 

3565 

3566 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3567 to point to a different folder but typically this should not be necessary. 

3568 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3569 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3570 """ 

3571 if type(base_path) == str: 

3572 base_path: Path = Path(base_path) 

3573 

3574 # column format 

3575 columns = {1: "text", 9: "frame"} 

3576 

3577 # this dataset name 

3578 dataset_name = self.__class__.__name__.lower() 

3579 

3580 # default dataset folder is the cache root 

3581 if not base_path: 

3582 base_path = flair.cache_root / "datasets" 

3583 data_folder = base_path / dataset_name 

3584 

3585 # download data if necessary 

3586 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" 

3587 cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) 

3588 cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) 

3589 cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) 

3590 

3591 super(UP_SPANISH, self).__init__( 

3592 data_folder, 

3593 columns, 

3594 encoding="utf-8", 

3595 train_file="es-up-train.conllu", 

3596 test_file="es-up-test.conllu", 

3597 dev_file="es-up-dev.conllu", 

3598 in_memory=in_memory, 

3599 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3600 comment_symbol="#", 

3601 **corpusargs, 

3602 ) 

3603 

3604 

3605class UP_SPANISH_ANCORA(ColumnCorpus): 

3606 def __init__( 

3607 self, 

3608 base_path: Union[str, Path] = None, 

3609 in_memory: bool = True, 

3610 document_as_sequence: bool = False, 

3611 **corpusargs, 

3612 ): 

3613 """ 

3614 Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: 

3615 https://github.com/System-T/UniversalPropositions 

3616 

3617 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3618 to point to a different folder but typically this should not be necessary. 

3619 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3620 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3621 """ 

3622 if type(base_path) == str: 

3623 base_path: Path = Path(base_path) 

3624 

3625 # column format 

3626 columns = {1: "text", 9: "frame"} 

3627 

3628 # this dataset name 

3629 dataset_name = self.__class__.__name__.lower() 

3630 

3631 # default dataset folder is the cache root 

3632 if not base_path: 

3633 base_path = flair.cache_root / "datasets" 

3634 data_folder = base_path / dataset_name 

3635 

3636 # download data if necessary 

3637 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" 

3638 cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) 

3639 cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) 

3640 cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) 

3641 

3642 super(UP_SPANISH_ANCORA, self).__init__( 

3643 data_folder, 

3644 columns, 

3645 encoding="utf-8", 

3646 train_file="es_ancora-up-train.conllu", 

3647 test_file="es_ancora-up-test.conllu", 

3648 dev_file="es_ancora-up-dev.conllu", 

3649 in_memory=in_memory, 

3650 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3651 comment_symbol="#", 

3652 **corpusargs, 

3653 )