import logging
import os
import re
import shutil
from pathlib import Path
from typing import Union, Dict, List, Optional

import flair
from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token
from flair.datasets.base import find_train_dev_test_files
from flair.file_utils import cached_path, unpack_file

log = logging.getLogger("flair")

class ColumnCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        encoding: str = "utf-8",
        document_separator_token: str = None,
        skip_first_line: bool = False,
        in_memory: bool = True,
        label_name_map: Dict[str, str] = None,
        banned_sentences: List[str] = None,
        autofind_splits: bool = True,
        name: Optional[str] = None,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL-03 or CoNLL-2000.
        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; if None, dev data is sampled from train
        :param tag_to_bioes: whether to convert to the BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite this, for instance
        with "\t" to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param document_separator_token: if provided, sentences that function as document boundaries are marked as such
        :param skip_first_line: set to True if your dataset has a header line
        :param in_memory: if set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param label_name_map: optionally map tag names to a different schema
        :param banned_sentences: optionally remove sentences from the corpus; works only if `in_memory` is True
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits)

        # get train data
        train = ColumnDataset(
            train_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if train_file is not None else None

        # read in test file if it exists
        test = ColumnDataset(
            test_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if test_file is not None else None

        # read in dev file if it exists
        dev = ColumnDataset(
            dev_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            banned_sentences=banned_sentences,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
            label_name_map=label_name_map,
        ) if dev_file is not None else None

        corpus_name = str(data_folder) if not name else name
        super(ColumnCorpus, self).__init__(train, dev, test, name=corpus_name, **corpusargs)
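
# Usage sketch (illustrative, not part of the original module): loading a custom
# CoNLL-style corpus with ColumnCorpus. The folder and file names below are
# hypothetical; any whitespace-separated two-column file with one token per line
# and blank lines between sentences would work the same way.
#
#     from flair.datasets import ColumnCorpus
#
#     corpus = ColumnCorpus(
#         "resources/tasks/my_ner",   # hypothetical data folder
#         {0: "text", 1: "ner"},      # column 0 holds tokens, column 1 NER tags
#         train_file="train.txt",
#         dev_file="dev.txt",
#         test_file="test.txt",
#         tag_to_bioes="ner",         # convert BIO tags to BIOES on load
#     )
#     print(corpus)                   # prints train/dev/test sentence counts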

class ColumnDataset(FlairDataset):
    # special key for space after
    SPACE_AFTER_KEY = "space-after"

    def __init__(
        self,
        path_to_column_file: Union[str, Path],
        column_name_map: Dict[int, str],
        tag_to_bioes: str = None,
        column_delimiter: str = r"\s+",
        comment_symbol: str = None,
        banned_sentences: List[str] = None,
        in_memory: bool = True,
        document_separator_token: str = None,
        encoding: str = "utf-8",
        skip_first_line: bool = False,
        label_name_map: Dict[str, str] = None,
    ):
        """
        Instantiates a column dataset (typically used for sequence labeling or word-level prediction).
        :param path_to_column_file: path to the file with the column-formatted data
        :param column_name_map: a map specifying the column format
        :param tag_to_bioes: whether to convert to the BIOES tagging scheme
        :param column_delimiter: default is to split on any separator, but you can overwrite this, for instance
        with "\t" to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param in_memory: if set to True, the dataset is kept in memory as Sentence objects; otherwise it does disk reads
        :param document_separator_token: if provided, sentences that function as document boundaries are marked as such
        :param skip_first_line: set to True if your dataset has a header line
        :param label_name_map: optionally map tag names to a different schema
        :param banned_sentences: optionally remove sentences from the corpus; works only if `in_memory` is True
        :return: a dataset with annotated data
        """
        if type(path_to_column_file) is str:
            path_to_column_file = Path(path_to_column_file)
        assert path_to_column_file.exists()
        self.path_to_column_file = path_to_column_file
        self.tag_to_bioes = tag_to_bioes
        self.column_name_map = column_name_map
        self.column_delimiter = column_delimiter
        self.comment_symbol = comment_symbol
        self.document_separator_token = document_separator_token
        self.label_name_map = label_name_map
        self.banned_sentences = banned_sentences

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory

        self.total_sentence_count: int = 0

        # most datasets have the token text in the first column; if not, pass 'text' as the respective column
        self.text_column: int = 0
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column

        # determine encoding of text file
        self.encoding = encoding

        with open(str(self.path_to_column_file), encoding=self.encoding) as file:

            # skip first line if so selected
            if skip_first_line:
                file.readline()

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                sentence_started = False
                while line:
                    if sentence_started and self.__line_completes_sentence(line):
                        self.indices.append(position)
                        position = file.tell()
                        sentence_started = False

                    elif not line.isspace():
                        sentence_started = True
                    line = file.readline()

                if sentence_started:
                    self.indices.append(position)

                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            if self.in_memory:
                self.sentences: List[Sentence] = []

                # pointer to previous sentence
                previous_sentence = None
                while True:
                    sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))
                    if not sentence:
                        break
                    if self.banned_sentences is not None and any(
                            [d in sentence.to_plain_string() for d in self.banned_sentences]):
                        continue
                    sentence._previous_sentence = previous_sentence
                    sentence._next_sentence = None

                    if previous_sentence:
                        previous_sentence._next_sentence = sentence

                    self.sentences.append(sentence)
                    previous_sentence = sentence

                self.total_sentence_count = len(self.sentences)

    def _read_next_sentence(self, file):
        lines = []
        line = file.readline()
        while line:
            if not line.isspace():
                lines.append(line)

            # if sentence ends, break
            if len(lines) > 0 and self.__line_completes_sentence(line):
                break

            line = file.readline()
        return lines

    def _convert_lines_to_sentence(self, lines):

        sentence: Sentence = Sentence()
        for line in lines:
            # skip comments
            if self.comment_symbol is not None and line.startswith(self.comment_symbol):
                continue

            # if sentence ends, convert and return
            if self.__line_completes_sentence(line):
                if len(sentence) > 0:
                    if self.tag_to_bioes is not None:
                        sentence.convert_tag_scheme(
                            tag_type=self.tag_to_bioes, target_scheme="iobes"
                        )
                    # check if this sentence is a document boundary
                    if sentence.to_original_text() == self.document_separator_token:
                        sentence.is_document_boundary = True
                    return sentence

            # otherwise, this line is a token. parse and add to sentence
            else:
                token = self._parse_token(line)
                sentence.add_token(token)

        # check if this sentence is a document boundary
        if sentence.to_original_text() == self.document_separator_token:
            sentence.is_document_boundary = True

        if self.tag_to_bioes is not None:
            sentence.convert_tag_scheme(
                tag_type=self.tag_to_bioes, target_scheme="iobes"
            )

        if len(sentence) > 0:
            return sentence

    def _parse_token(self, line: str) -> Token:
        fields: List[str] = re.split(self.column_delimiter, line.rstrip())
        token = Token(fields[self.text_column])
        for column in self.column_name_map:
            if len(fields) > column:
                if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY:
                    task = self.column_name_map[column]  # for example 'pos'
                    tag = fields[column]
                    if tag.count("-") >= 1:  # tag with prefix, for example tag='B-OBJ'
                        split_at_first_hyphen = tag.split("-", 1)
                        tagging_format_prefix = split_at_first_hyphen[0]
                        tag_without_tagging_format = split_at_first_hyphen[1]
                        if self.label_name_map and tag_without_tagging_format in self.label_name_map.keys():
                            tag = tagging_format_prefix + "-" + self.label_name_map[tag_without_tagging_format]
                            # for example, transforming 'B-OBJ' to 'B-part-of-speech-object'
                            if self.label_name_map[tag_without_tagging_format] == 'O':
                                tag = 'O'
                    else:  # tag without prefix, for example tag='PPER'
                        if self.label_name_map and tag in self.label_name_map.keys():
                            tag = self.label_name_map[tag]  # for example, transforming 'PPER' to 'person'

                    token.add_label(task, tag)
                if self.column_name_map[column] == self.SPACE_AFTER_KEY and fields[column] == '-':
                    token.whitespace_after = False
        return token

    def __line_completes_sentence(self, line: str) -> bool:
        sentence_completed = line.isspace() or line == ''
        return sentence_completed

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else skip to position in file where sentence begins
        else:
            with open(str(self.path_to_column_file), encoding=self.encoding) as file:
                file.seek(self.indices[index])
                sentence = self._convert_lines_to_sentence(self._read_next_sentence(file))

        # set sentence context
        sentence._position_in_dataset = (self, index)

        return sentence
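
# Illustrative input for ColumnDataset (hypothetical file contents, following the
# usual CoNLL convention of one token per line and a blank line between sentences).
# With column_name_map={0: "text", 1: "pos", 2: "ner"} each row is parsed into one
# Token carrying a 'pos' and an 'ner' label:
#
#     George      NNP  B-PER
#     Washington  NNP  I-PER
#     went        VBD  O
#     to          TO   O
#     Washington  NNP  B-LOC
#
#     (blank line: sentence boundary)
#
# If tag_to_bioes is set, BIO tags are converted to the richer BIOES scheme on
# load, e.g. (illustrative): B-PER I-PER -> B-PER E-PER, and a single-token span
# B-LOC -> S-LOC.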

class CONLL_03(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the eng.testa, eng.testb and
        eng.train files in a folder called 'conll_03'. Then set the base_path parameter in the constructor to the
        path of the parent directory where the conll_03 folder resides.
        If using entity linking, the CoNLL-03 dataset is reduced by about 20 documents, which are not part of the
        YAGO dataset.
        :param base_path: Path to the CoNLL-03 corpus (i.e. the 'conll_03' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data is there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )
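
# Usage sketch (illustrative): once the manually obtained files are in place,
# the corpus loads like any other. The base_path below is hypothetical.
#
#     from flair.datasets import CONLL_03
#
#     # expects <base_path>/conll_03/ containing eng.train, eng.testa, eng.testb
#     corpus = CONLL_03(base_path="resources/tasks")
#     print(corpus.train[0])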

class CONLL_03_GERMAN(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your
        machine. Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put the respective files
        in a folder called 'conll_03_german'. Then set the base_path parameter in the constructor to the path of the
        parent directory where the conll_03_german folder resides.
        :param base_path: Path to the CoNLL-03 corpus (i.e. the 'conll_03_german' folder) on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np'
        to predict word lemmas, POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # check if data is there
        if not data_folder.exists():
            log.warning("-" * 100)
            log.warning(f'WARNING: CoNLL-03 German dataset not found at "{data_folder}".')
            log.warning(
                'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/'
            )
            log.warning("-" * 100)

        super(CONLL_03_GERMAN, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )

class CONLL_03_DUTCH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
        POS tags instead
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"

        # download files if not present locally
        cached_path(f"{conll_02_path}ned.testa", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.testb", data_folder / 'raw')
        cached_path(f"{conll_02_path}ned.train", data_folder / 'raw')

        # we need to slightly modify the original files by adding new lines after document separators
        train_data_file = data_folder / 'train.txt'
        if not train_data_file.is_file():
            self.__offset_docstarts(data_folder / 'raw' / "ned.train", data_folder / 'train.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testa", data_folder / 'dev.txt')
            self.__offset_docstarts(data_folder / 'raw' / "ned.testb", data_folder / 'test.txt')

        super(CONLL_03_DUTCH, self).__init__(
            data_folder,
            columns,
            train_file='train.txt',
            dev_file='dev.txt',
            test_file='test.txt',
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )

    @staticmethod
    def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]):
        with open(file_in, 'r', encoding="latin-1") as f:
            lines = f.readlines()
        with open(file_out, 'w', encoding="latin-1") as f:
            for line in lines:
                f.write(line)
                if line.startswith('-DOCSTART-'):
                    f.write("\n")
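
# Illustrative effect of __offset_docstarts on a hypothetical file excerpt: every
# line is written through unchanged, and one blank line is appended after each
# '-DOCSTART-' line, so the document separator becomes a sentence of its own:
#
#     before:                      after:
#         -DOCSTART- -DOCSTART- O      -DOCSTART- -DOCSTART- O
#         Het Art O
#         water N O                    Het Art O
#                                      water N O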

class CONLL_03_SPANISH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, should not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
        cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)

        super(CONLL_03_SPANISH, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )

class CONLL_2000(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "np",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the CoNLL-2000 corpus for English chunking.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "np"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/"
        data_file = flair.cache_root / "datasets" / dataset_name / "train.txt"
        if not data_file.is_file():
            cached_path(
                f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name
            )
            cached_path(
                f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name
            )
            import gzip

            # decompress the downloaded train and test splits
            with gzip.open(
                    flair.cache_root / "datasets" / dataset_name / "train.txt.gz",
                    "rb",
            ) as f_in:
                with open(
                        flair.cache_root / "datasets" / dataset_name / "train.txt",
                        "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)
            with gzip.open(
                    flair.cache_root / "datasets" / dataset_name / "test.txt.gz", "rb"
            ) as f_in:
                with open(
                        flair.cache_root / "datasets" / dataset_name / "test.txt",
                        "wb",
                ) as f_out:
                    shutil.copyfileobj(f_in, f_out)

        super(CONLL_2000, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )

class WNUT_17(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        wnut_path = "https://noisy-text.github.io/2017/files/"
        cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name)
        cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name)
        cached_path(
            f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name
        )

        super(WNUT_17, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )

class BIOSCOPE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "tag"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/"
        cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name)

        super(BIOSCOPE, self).__init__(
            data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs,
        )

class NER_ARABIC_ANER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERcorp) dataset available
        from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar
        via http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp.
        Note that the column order is swapped in this preprocessed version.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/"
        # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_ANER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )

class NER_ARABIC_AQMAR(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize a preprocessed and modified version of the American and Qatari Modeling of Arabic (AQMAR) dataset
        available from http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
        via http://www.cs.cmu.edu/~ark/AQMAR/.

        Modifications from the original dataset:
        - Miscellaneous tags (MIS0, MIS1, MIS2, MIS3) are merged into one tag "MISC", as these categories deviate
          across the original dataset
        - The 28 original Wikipedia articles are merged into a single file containing the articles in alphabetical order

        The first time you call this constructor it will automatically download the dataset.

        This dataset is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
        Please cite: "Behrang Mohit, Nathan Schneider, Rishav Bhowmick, Kemal Oflazer, and Noah A. Smith (2012),
        Recall-Oriented Learning of Named Entities in Arabic Wikipedia. Proceedings of EACL."

        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        aqmar_path = "https://megantosh.s3.eu-central-1.amazonaws.com/AQMAR/"
        # cached_path(f"{aqmar_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{aqmar_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ARABIC_AQMAR, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )

class NER_BASQUE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ner_basque_path = "http://ixa2.si.ehu.eus/eiec/"
        data_path = flair.cache_root / "datasets" / dataset_name
        data_file = data_path / "named_ent_eu.train"
        if not data_file.is_file():
            cached_path(
                f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name
            )
            import tarfile

            # extract the train and test splits from the downloaded archive
            with tarfile.open(
                    flair.cache_root / "datasets" / dataset_name / "eiec_v1.0.tgz",
                    "r:gz",
            ) as f_in:
                corpus_files = (
                    "eiec_v1.0/named_ent_eu.train",
                    "eiec_v1.0/named_ent_eu.test",
                )
                for corpus_file in corpus_files:
                    f_in.extract(corpus_file, data_path)
                    shutil.move(f"{data_path}/{corpus_file}", data_path)

        super(NER_BASQUE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs,
        )

class NER_CHINESE_WEIBO(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WEIBO_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/"
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name)
        cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name)

        super(NER_CHINESE_WEIBO, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file="weiboNER_2nd_conll_format.train",
            test_file="weiboNER_2nd_conll_format.test",
            dev_file="weiboNER_2nd_conll_format.dev",
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )

class NER_DANISH_DANE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {1: 'text', 3: 'pos', 9: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        data_path = flair.cache_root / "datasets" / dataset_name
        train_data_file = data_path / "ddt.train.conllu"
        if not train_data_file.is_file():
            temp_file = cached_path(
                'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip',
                Path("datasets") / dataset_name
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, 'r') as zip_file:
                zip_file.extractall(path=data_path)

            # Remove CoNLL-U meta information in the last column
            for part in ['train', 'dev', 'test']:
                lines = []
                data_file = "ddt.{}.conllu".format(part)
                with open(data_path / data_file, 'r') as file:
                    for line in file:
                        # keep comment and blank lines as-is; strip the meta markup elsewhere
                        if line.startswith("#") or line == "\n":
                            lines.append(line)
                        else:
                            lines.append(line.replace("name=", "").replace("|SpaceAfter=No", ""))

                with open(data_path / data_file, 'w') as file:
                    file.writelines(lines)

                print(data_path / data_file)

        super(NER_DANISH_DANE, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes,
            in_memory=in_memory, comment_symbol="#",
            **corpusargs,
        )
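
# Illustrative effect of the CoNLL-U cleanup above (hypothetical MISC column
# values): 'name=B-PER|SpaceAfter=No' becomes 'B-PER', so column 9 can be read
# directly as the NER tag by the column reader.
#
#     before: 1  Hans  ...  name=B-PER|SpaceAfter=No
#     after:  1  Hans  ...  B-PER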

class NER_ENGLISH_MOVIE_SIMPLE(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus)
        in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "engtrain.bio"
        test_file = "engtest.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_SIMPLE, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )

class NER_ENGLISH_MOVIE_COMPLEX(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng
        corpus) in BIO format. The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        # column format
        columns = {0: "ner", 1: "text"}

        # dataset name
        dataset_name = self.__class__.__name__.lower()

        # data folder: default dataset folder is the cache root
        if type(base_path) == str:
            base_path: Path = Path(base_path)
        if not base_path:
            base_path: Path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/"
        train_file = "trivia10k13train.bio"
        test_file = "trivia10k13test.bio"
        cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name)
        cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name)

        super(NER_ENGLISH_MOVIE_COMPLEX, self).__init__(
            data_folder,
            columns,
            train_file=train_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )

class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus):
    """
    Initialize a corpus of SEC filings annotated with English NER tags. See the paper "Domain Adaption of Named
    Entity Recognition to Support Credit Risk Assessment" by Alvarado et al., 2015: https://aclanthology.org/U15-1010/
    :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
    to point to a different folder but typically this should not be necessary.
    :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
    POS tags instead
    :param in_memory: If True, keeps dataset in memory giving speedups in training.
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 3: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/"
        cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name)
        cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_SEC_FILLINGS, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file='FIN5.txt',
            test_file="FIN3.txt",
            skip_first_line=True,
            **corpusargs,
        )

class NER_ENGLISH_RESTAURANT(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the experimental MIT Restaurant corpus available at
        https://groups.csail.mit.edu/sls/downloads/restaurant/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/"
        cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name)
        cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_RESTAURANT, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            **corpusargs,
        )

class NER_ENGLISH_STACKOVERFLOW(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the STACKOVERFLOW_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # The datasets come in CoNLL-style column format. Each line looks like
        #     <word> "\t" <NE> "\t" <word> "\t" <markdown>
        # and the end of a sentence is marked with an empty line. <NE> is the
        # human-annotated named entity and <markdown> holds the code tags provided
        # by the users who wrote the posts.
        # column format
        columns = {0: "word", 1: "ner", 3: "markdown"}

        # entity mapping
        entity_mapping = {
            "Library_Function": "Function",
            "Function_Name": "Function",
            "Class_Name": "Class",
            "Library_Class": "Class",
            "Organization": "Website",
            "Library_Variable": "Variable",
            "Variable_Name": "Variable",
            "Error_Name": "O",
            "Keyboard_IP": "O",
            "Value": "O",
            "Output_Block": "O",
        }

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/"

        # data validation: these sentences are annotation placeholders, not real text
        banned_sentences = [
            "code omitted for annotation",
            "omitted for annotation",
            "CODE_BLOCK :",
            "OP_BLOCK :",
            "Question_URL :",
            "Question_ID :",
        ]

        files = ["train", "test", "dev"]

        for file in files:
            questions = 0
            answers = 0

            cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name)
            for line in open(data_folder / (file + ".txt"), mode="r", encoding="utf-8"):
                if line.startswith("Question_ID"):
                    questions += 1

                if line.startswith("Answer_to_Question_ID"):
                    answers += 1
            log.info(f"File {file} has {questions} questions and {answers} answers.")

        super(NER_ENGLISH_STACKOVERFLOW, self).__init__(
            data_folder,
            columns,
            train_file="train.txt",
            test_file="test.txt",
            dev_file="dev.txt",
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            banned_sentences=banned_sentences,
            in_memory=in_memory,
            label_name_map=entity_mapping,
            **corpusargs
        )
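
# Illustrative effect of the entity_mapping above when applied as label_name_map
# (hypothetical tags): the BIO prefix is preserved while the bare label is
# remapped, and labels mapped to 'O' are dropped entirely:
#
#     'B-Library_Function' -> 'B-Function'
#     'I-Class_Name'       -> 'I-Class'
#     'B-Value'            -> 'O'
#     'Website'            -> 'Website'   (unmapped labels pass through unchanged)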

class NER_ENGLISH_TWITTER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize a dataset called twitter_ner which can be found on the following page:
        https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt.

        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/"
        cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_TWITTER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            train_file="ner.txt",
            in_memory=in_memory,
            **corpusargs,
        )

class NER_ENGLISH_PERSON(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
    ):
        """
        Initialize the PERSON_NER corpus for person names. The first time you call this constructor it will
        automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        conll_path = "https://raw.githubusercontent.com/das-sudeshna/genid/master/"

        # download files if not present locally
        cached_path(f"{conll_path}conll-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}ieer-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}textbook-g.conll", data_folder / 'raw')
        cached_path(f"{conll_path}wiki-g.conll", data_folder / 'raw')

        self.__concatAllFiles(data_folder)

        super(NER_ENGLISH_PERSON, self).__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            train_file='bigFile.conll'
        )

    @staticmethod
    def __concatAllFiles(data_folder):
        # concatenate the four downloaded source files into a single train file
        arr = os.listdir(data_folder / 'raw')

        with open(data_folder / 'bigFile.conll', 'w') as outfile:
            for fname in arr:
                with open(data_folder / 'raw' / fname) as infile:
                    outfile.write(infile.read())

class NER_ENGLISH_WEBPAGES(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Initialize the WEBPAGES_NER corpus introduced in the paper "Design Challenges and Misconceptions in Named
        Entity Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "ner", 5: "text"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name
        import tarfile
        if not os.path.isfile(data_folder / 'webpages_ner.txt'):
            # download the archive
            tar_file = "https://cogcomp.seas.upenn.edu/Data/NERWebpagesColumns.tgz"
            webpages_ner_path = cached_path(tar_file, Path("datasets") / dataset_name)
            tf = tarfile.open(webpages_ner_path)
            tf.extractall(data_folder)
            tf.close()
            outputfile = os.path.abspath(data_folder)

            # merge the extracted files into one, as the archive contains multiple files
            with open(outputfile / data_folder / "webpages_ner.txt", "w+") as outfile:
                for files in os.walk(outputfile):
                    f = files[1]
                    ff = os.listdir(outputfile / data_folder / f[-1])
                    for i, file in enumerate(ff):
                        if file.endswith('.gold'):
                            with open(outputfile / data_folder / f[-1] / file, 'r+', errors='replace') as infile:
                                content = infile.read()
                            outfile.write(content)
                    break

        super(NER_ENGLISH_WEBPAGES, self).__init__(
            data_folder,
            columns,
            train_file='webpages_ner.txt',
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )

class NER_ENGLISH_WNUT_2020(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip"

        for sample in ["train", "test", "dev"]:

            sample_file = data_folder / (sample + ".txt")
            if not sample_file.is_file():

                zip_path = cached_path(
                    f"{github_url}", Path("datasets") / dataset_name
                )

                # unzip the downloaded repo and merge the train, dev and test datasets
                unpack_file(zip_path, data_folder, "zip", False)  # unzipped folder name: WNUT_2020_NER-master

                if sample == "test":
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/")
                else:
                    file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/")
                filenames = os.listdir(file_path)
                with open(data_folder / (sample + '.txt'), 'w') as outfile:
                    for fname in filenames:
                        with open(file_path / fname) as infile:
                            lines = infile.read()
                            outfile.write(lines)

                shutil.rmtree(str(data_folder / "WNUT_2020_NER-master"))  # clean up when done

        super(NER_ENGLISH_WNUT_2020, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )

class NER_ENGLISH_WIKIGOLD(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
        **corpusargs,
    ):
        """
        Initialize the wikigold corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, should not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/"
        cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name)

        super(NER_ENGLISH_WIKIGOLD, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            train_file='wikigold.conll.txt',
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
            **corpusargs,
        )

1521class NER_FINNISH(ColumnCorpus): 

1522 def __init__( 

1523 self, 

1524 base_path: Union[str, Path] = None, 

1525 tag_to_bioes: str = "ner", 

1526 in_memory: bool = True, 

1527 **corpusargs, 

1528 ): 

1529 if type(base_path) == str: 

1530 base_path: Path = Path(base_path) 

1531 

1532 # column format 

1533 columns = {0: "text", 1: "ner"} 

1534 

1535 # this dataset name 

1536 dataset_name = self.__class__.__name__.lower() 

1537 

1538 # default dataset folder is the cache root 

1539 if not base_path: 

1540 base_path = flair.cache_root / "datasets" 

1541 data_folder = base_path / dataset_name 

1542 

1543 # download data if necessary 

1544 ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 

1545 cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) 

1546 cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) 

1547 cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) 

1548 

1549 self._remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) 

1550 

1551 super(NER_FINNISH, self).__init__( 

1552 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, 

1553 ) 

1554 

1555 def _remove_lines_without_annotations(self, data_file: Union[str, Path] = None): 

1556 with open(data_file, 'r') as f: 

1557 lines = f.readlines() 

1558 with open(data_file, 'w') as f: 

1559 for line in lines: 

1560 if len(line.split()) != 1: 

1561 f.write(line) 

1562 
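The helper above keeps a line only if it is empty or carries both a token and a tag; a standalone sketch of the same filter on made-up rows:

lines = ["Helsinki B-LOC\n", "orphantoken\n", "\n", "on O\n"]
kept = [line for line in lines if len(line.split()) != 1]
assert kept == ["Helsinki B-LOC\n", "\n", "on O\n"]  # the single-token row is dropped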

1563 

1564class NER_GERMAN_BIOFID(ColumnCorpus): 

1565 def __init__( 

1566 self, 

1567 base_path: Union[str, Path] = None, 

1568 tag_to_bioes: str = "ner", 

1569 in_memory: bool = True, 

1570 **corpusargs, 

1571 ): 

1572 if type(base_path) == str: 

1573 base_path: Path = Path(base_path) 

1574 

1575 # column format 

1576 columns = {0: "text", 1: "lemma", 2: "pos", 3: "ner"} 

1577 

1578 # this dataset name 

1579 dataset_name = self.__class__.__name__.lower() 

1580 

1581 # default dataset folder is the cache root 

1582 if not base_path: 

1583 base_path = flair.cache_root / "datasets" 

1584 data_folder = base_path / dataset_name 

1585 

1586 # download data if necessary 

1587 biofid_path = "https://raw.githubusercontent.com/texttechnologylab/BIOfid/master/BIOfid-Dataset-NER/" 

1588 cached_path(f"{biofid_path}train.conll", Path("datasets") / dataset_name) 

1589 cached_path(f"{biofid_path}dev.conll", Path("datasets") / dataset_name) 

1590 cached_path(f"{biofid_path}test.conll", Path("datasets") / dataset_name) 

1591 

1592 super(NER_GERMAN_BIOFID, self).__init__( 

1593 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

1594 ) 

1595 

1596 

1597class NER_GERMAN_EUROPARL(ColumnCorpus): 

1598 def __init__( 

1599 self, 

1600 base_path: Union[str, Path] = None, 

1601 tag_to_bioes: str = "ner", 

1602 in_memory: bool = True, 

1603 **corpusargs, 

1604 ): 

1605 """ 

1606 Initialize the NER_GERMAN_EUROPARL corpus. The first time you call this constructor it will automatically 

1607 download the dataset. 

1608 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1609 to point to a different folder but typically this should not be necessary. 

1610 :param tag_to_bioes: 'ner' by default, should not be changed. 

1611 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 

1612 

1613 """ 

1614 

1615 if type(base_path) == str: 

1616 base_path: Path = Path(base_path) 

1617 

1618 # column format 

1619 columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} 

1620 

1621 # this dataset name 

1622 dataset_name = self.__class__.__name__.lower() 

1623 

1624 # default dataset folder is the cache root 

1625 if not base_path: 

1626 base_path = flair.cache_root / "datasets" 

1627 data_folder = base_path / dataset_name 

1628 

1629 # download data if necessary 

1630 europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" 

1631 cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) 

1632 cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) 

1633 

1634 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) 

1635 self._add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) 

1636 

1637 super(NER_GERMAN_EUROPARL, self).__init__( 

1638 data_folder, 

1639 columns, 

1640 tag_to_bioes=tag_to_bioes, 

1641 encoding="latin-1", 

1642 in_memory=in_memory, 

1643 train_file='ep-96-04-16.conll', 

1644 test_file='ep-96-04-15.conll', 

1645 **corpusargs, 

1646 ) 

1647 

1648 def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): 

1649 """ 

1650 Adds IOB tags (an 'I-' prefix) if only bare chunk names are provided (e.g. words tagged PER instead 

1651 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag, since ColumnCorpus expects 

1652 the letter 'O'. Additionally, it removes lines without tags from the data file and can also 

1653 be used if the data is only partially IOB-tagged. 

1654 Parameters 

1655 ---------- 

1656 data_file : Union[str, Path] 

1657 Path to the data file. 

1658 encoding : str, optional 

1659 Encoding used in open function. The default is "utf8". 

1660 ner_column : int, optional 

1661 Specifies the ner-tagged column. The default is 1 (the second column). 

1662 

1663 """ 

1664 

1665 def add_I_prefix(current_line: List[str], ner: int, tag: str): 

1666 for i in range(0, len(current_line)): 

1667 if i == 0: 

1668 f.write(current_line[i]) 

1669 elif i == ner: 

1670 f.write(' I-' + tag) 

1671 else: 

1672 f.write(' ' + current_line[i]) 

1673 f.write('\n') 

1674 

1675 with open(file=data_file, mode='r', encoding=encoding) as f: 

1676 lines = f.readlines() 

1677 with open(file=data_file, mode='w', encoding=encoding) as f: 

1678 pred = 'O' # remembers ner tag of the preceding line 

1679 for line in lines: 

1680 line_list = line.split() 

1681 if len(line_list) > 2: # word with tags 

1682 ner_tag = line_list[ner_column] 

1683 if ner_tag in ['0', 'O']: # no chunk 

1684 for i in range(0, len(line_list)): 

1685 if i == 0: 

1686 f.write(line_list[i]) 

1687 elif i == ner_column: 

1688 f.write(' O') 

1689 else: 

1690 f.write(' ' + line_list[i]) 

1691 f.write('\n') 

1692 pred = 'O' 

1693 elif '-' not in ner_tag: # no IOB tags 

1694 if pred == 'O': # found a new chunk 

1695 add_I_prefix(line_list, ner_column, ner_tag) 

1696 pred = ner_tag 

1697 else: # found further part of chunk or new chunk directly after old chunk 

1698 add_I_prefix(line_list, ner_column, ner_tag) 

1699 pred = ner_tag 

1700 else: # line already has IOB tag (tag contains '-') 

1701 f.write(line) 

1702 pred = ner_tag.split('-')[1] 

1703 elif len(line_list) == 0: # empty line 

1704 f.write('\n') 

1705 pred = 'O' 

1706 
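A simplified restatement of the tagging rule applied above, as a self-contained sketch (not the method itself; note that both branches of the original add an 'I-' prefix, so no 'B-' tags are produced):

def iob_prefix(tag: str) -> str:
    if tag in ('0', 'O'):   # no chunk
        return 'O'
    if '-' in tag:          # already IOB-tagged: keep as-is
        return tag
    return 'I-' + tag       # bare chunk name gets an 'I-' prefix

assert [iob_prefix(t) for t in ['0', 'LOC', 'B-PER']] == ['O', 'I-LOC', 'B-PER']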

1707 

1708class NER_GERMAN_LEGAL(ColumnCorpus): 

1709 def __init__( 

1710 self, 

1711 base_path: Union[str, Path] = None, 

1712 tag_to_bioes: str = "ner", 

1713 in_memory: bool = True, 

1714 **corpusargs, 

1715 ): 

1716 """ 

1717 Initialize the NER_GERMAN_LEGAL corpus (LER: Legal Entity Recognition). The first time you call this constructor it will automatically 

1718 download the dataset. 

1719 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1720 to point to a different folder but typically this should not be necessary. 

1721 :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 

1722 

1723 """ 

1724 

1725 if type(base_path) == str: 

1726 base_path: Path = Path(base_path) 

1727 

1728 # column format 

1729 columns = {0: "text", 1: "ner"} 

1730 

1731 # this dataset name 

1732 dataset_name = self.__class__.__name__.lower() 

1733 

1734 # default dataset folder is the cache root 

1735 if not base_path: 

1736 base_path = flair.cache_root / "datasets" 

1737 data_folder = base_path / dataset_name 

1738 

1739 # download data if necessary 

1740 ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" 

1741 cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) 

1742 

1743 super(NER_GERMAN_LEGAL, self).__init__( 

1744 data_folder, 

1745 columns, 

1746 tag_to_bioes=tag_to_bioes, 

1747 in_memory=in_memory, 

1748 train_file='ler.conll', 

1749 **corpusargs, 

1750 ) 

1751 

1752 

1753class NER_GERMAN_GERMEVAL(ColumnCorpus): 

1754 def __init__( 

1755 self, 

1756 base_path: Union[str, Path] = None, 

1757 tag_to_bioes: str = "ner", 

1758 in_memory: bool = True, 

1759 **corpusargs, 

1760 ): 

1761 """ 

1762 Initialize the GermEval NER corpus for German. The first time you call this constructor it will automatically 

1763 download the dataset from Google Drive. Alternatively, obtain the corpus from https://sites.google.com/site/germeval2014ner/data, 

1764 put it into some folder and point the base_path parameter in the constructor to this folder. 

1765 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to your own folder. 

1766 :param tag_to_bioes: 'ner' by default, should not be changed. 

1767 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1768 """ 

1769 if type(base_path) == str: 

1770 base_path: Path = Path(base_path) 

1771 

1772 # column format 

1773 columns = {1: "text", 2: "ner"} 

1774 

1775 # this dataset name 

1776 dataset_name = self.__class__.__name__.lower() 

1777 

1778 # default dataset folder is the cache root 

1779 if not base_path: 

1780 base_path = flair.cache_root / "datasets" 

1781 data_folder = base_path / dataset_name 

1782 

1783 # check if data there 

1784 if not data_folder.exists(): 

1785 # create folder 

1786 os.makedirs(data_folder) 

1787 

1788 # download dataset 

1789 import gdown 

1790 gdown.download(url="https://drive.google.com/uc?id={}".format("1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P"), 

1791 output=str(data_folder / 'train.tsv')) 

1792 gdown.download(url="https://drive.google.com/uc?id={}".format("1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH"), 

1793 output=str(data_folder / 'test.tsv')) 

1794 gdown.download(url="https://drive.google.com/uc?id={}".format("1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm"), 

1795 output=str(data_folder / 'dev.tsv')) 

1796 

1797 super(NER_GERMAN_GERMEVAL, self).__init__( 

1798 data_folder, 

1799 columns, 

1800 tag_to_bioes=tag_to_bioes, 

1801 comment_symbol="#", 

1802 in_memory=in_memory, 

1803 **corpusargs, 

1804 ) 

1805 

1806 

1807class NER_GERMAN_POLITICS(ColumnCorpus): 

1808 def __init__( 

1809 self, 

1810 base_path: Union[str, Path] = None, 

1811 tag_to_bioes: str = "ner", 

1812 column_delimiter: str = r"\s+", 

1813 in_memory: bool = True, 

1814 **corpusargs, 

1815 ): 

1816 """ 

1817 Initialize corpus with Named Entity Model for German, Politics (NEMGP) data from 

1818 https://www.thomas-zastrow.de/nlp/. The first time you call this constructor it will automatically download the 

1819 dataset. 

1820 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1821 to point to a different folder but typically this should not be necessary. 

1822 :param tag_to_bioes: NER by default; the corpus only carries NER annotation, so this 

1823 need not be changed 

1824 :param column_delimiter: default is to split columns on any whitespace 

1825 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1826 """ 

1827 if type(base_path) == str: 

1828 base_path: Path = Path(base_path) 

1829 

1830 # column format 

1831 columns = {0: "text", 1: "ner"} 

1832 

1833 # this dataset name 

1834 dataset_name = self.__class__.__name__.lower() 

1835 

1836 # default dataset folder is the cache root 

1837 if not base_path: 

1838 base_path = flair.cache_root / "datasets" 

1839 data_folder = base_path / dataset_name 

1840 

1841 # download and parse data if necessary 

1842 german_politics_path = "https://www.thomas-zastrow.de/nlp/nemgp_trainingdata_01.txt.zip" 

1843 corpus_file_name = "nemgp_trainingdata_01.txt" 

1844 parsed_dataset = data_folder / "raw" / corpus_file_name 

1845 

1846 if not parsed_dataset.exists(): 

1847 german_politics_zip = cached_path(f"{german_politics_path}", Path("datasets") / dataset_name / "raw") 

1848 unpack_file(german_politics_zip, data_folder / "raw", "zip", False) 

1849 self._convert_to_column_corpus(parsed_dataset) 

1850 

1851 # create train test dev if not exist 

1852 train_dataset = data_folder / "train.txt" 

1853 if not train_dataset.exists(): 

1854 self._create_datasets(parsed_dataset, data_folder) 

1855 

1856 super(NER_GERMAN_POLITICS, self).__init__( 

1857 data_folder, 

1858 columns, 

1859 column_delimiter=column_delimiter, 

1860 train_file='train.txt', 

1861 dev_file='dev.txt', 

1862 test_file='test.txt', 

1863 tag_to_bioes=tag_to_bioes, 

1864 encoding="utf-8", 

1865 in_memory=in_memory, 

1866 **corpusargs, 

1867 ) 

1868 

1869 def _convert_to_column_corpus(self, data_file: Union[str, Path]): 

1870 with open(data_file, 'r', encoding='utf-8') as f: 

1871 lines = f.readlines() 

1872 with open(data_file, 'w', encoding='utf-8') as f: 

1873 tag_bool = False 

1874 new_sentence = True 

1875 for line in lines: 

1876 line = re.sub(r'\s{2,}', ' ', line).strip().split(' ') 

1877 for substr in line: 

1878 if substr == '.': 

1879 f.write("\n") 

1880 new_sentence = True 

1881 elif "<START:" in substr: 

1882 tag_bool = True 

1883 tag = substr.strip('<START:').strip('>') 

1884 if 'loc' in tag: 

1885 tag_IOB = '-LOC' 

1886 elif 'per' in tag: 

1887 tag_IOB = '-PER' 

1888 elif 'org' in tag: 

1889 tag_IOB = '-ORG' 

1890 elif 'misc' in tag: 

1891 tag_IOB = '-MISC' 

1892 elif "<END>" in substr: 

1893 tag_bool = False 

1894 new_sentence = True 

1895 else: 

1896 if tag_bool: 

1897 if new_sentence is True: 

1898 start = 'B' 

1899 new_sentence = False 

1900 else: 

1901 start = 'I' 

1902 f.write(substr.strip(' ') + " " + start + tag_IOB + "\n") 

1903 else: 

1904 f.write(substr.strip(' ') + " " + 'O' + "\n") 

1905 
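A self-contained sketch of the conversion performed above, using a made-up NEMGP-style fragment (an illustrative re-implementation, not the library method):

def nemgp_to_columns(text: str):
    rows, tag_iob, in_entity, first = [], '', False, True
    for tok in text.split():
        if tok == '.':                      # sentence boundary
            first = True
        elif tok.startswith('<START:'):     # entity opens; map its type to an IOB suffix
            in_entity, first = True, True
            name = tok[len('<START:'):-1]
            for key, iob in (('loc', '-LOC'), ('per', '-PER'), ('org', '-ORG'), ('misc', '-MISC')):
                if key in name:
                    tag_iob = iob
                    break
        elif tok == '<END>':                # entity closes
            in_entity, first = False, True
        elif in_entity:
            rows.append(tok + (' B' if first else ' I') + tag_iob)
            first = False
        else:
            rows.append(tok + ' O')
    return rows

assert nemgp_to_columns("Angela <START:person> Merkel <END> besucht Berlin .") == \
    ['Angela O', 'Merkel B-PER', 'besucht O', 'Berlin O']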

1906 def _create_datasets(self, data_file: Union[str, Path], data_folder: Union[str, Path]): 

1907 with open(data_file, 'r') as file: 

1908 num_lines = len(file.readlines()) 

1909 file.seek(0) 

1910 

1911 train_len = round(num_lines * 0.8) 

1912 test_len = round(num_lines * 0.1) 

1913 dev_len = num_lines - train_len - test_len 

1914 

1915 train = open(data_folder / "train.txt", "w") 

1916 test = open(data_folder / "test.txt", "w") 

1917 dev = open(data_folder / "dev.txt", "w") 

1918 

1919 k = 0 

1920 for line in file.readlines(): 

1921 k += 1 

1922 if k <= train_len: 

1923 train.write(line) 

1924 elif k > train_len and k <= (train_len + test_len): 

1925 test.write(line) 

1926 elif k > (train_len + test_len) and k <= num_lines: 

1927 dev.write(line) 

1928 
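The split above opens three output files it never explicitly closes; a sketch of the same 80/10/10 line-level split written with context managers (a hypothetical helper, shown under the assumption that the proportions are the only contract):

from pathlib import Path

def split_80_10_10(data_file: Path, out_folder: Path) -> None:
    lines = Path(data_file).read_text(encoding='utf-8').splitlines(keepends=True)
    train_len = round(len(lines) * 0.8)
    test_len = round(len(lines) * 0.1)
    with open(out_folder / 'train.txt', 'w', encoding='utf-8') as train, \
         open(out_folder / 'test.txt', 'w', encoding='utf-8') as test, \
         open(out_folder / 'dev.txt', 'w', encoding='utf-8') as dev:
        train.writelines(lines[:train_len])                     # first 80%
        test.writelines(lines[train_len:train_len + test_len])  # next 10%
        dev.writelines(lines[train_len + test_len:])            # remainder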

1929 

1930class NER_HUNGARIAN(ColumnCorpus): 

1931 def __init__( 

1932 self, 

1933 base_path: Union[str, Path] = None, 

1934 tag_to_bioes: str = "ner", 

1935 in_memory: bool = True, 

1936 document_as_sequence: bool = False, 

1937 **corpusargs, 

1938 ): 

1939 """ 

1940 Initialize the NER Business corpus for Hungarian. The first time you call this constructor it will automatically 

1941 download the dataset. 

1942 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

1943 to point to a different folder but typically this should not be necessary. 

1944 :param tag_to_bioes: NER by default; the corpus only carries NER annotation, so this 

1945 need not be changed 

1946 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1947 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1948 """ 

1949 if type(base_path) == str: 

1950 base_path: Path = Path(base_path) 

1951 

1952 # column format 

1953 columns = {0: "text", 1: "ner"} 

1954 

1955 # this dataset name 

1956 dataset_name = self.__class__.__name__.lower() 

1957 

1958 # default dataset folder is the cache root 

1959 if not base_path: 

1960 base_path = flair.cache_root / "datasets" 

1961 data_folder = base_path / dataset_name 

1962 

1963 # If the extracted corpus file is not yet present in dir 

1964 if not os.path.isfile(data_folder / 'hun_ner_corpus.txt'): 

1965 # download zip if necessary 

1966 hun_ner_path = "https://rgai.sed.hu/sites/rgai.sed.hu/files/business_NER.zip" 

1967 path_to_zipped_corpus = cached_path(hun_ner_path, Path("datasets") / dataset_name) 

1968 # extracted corpus is not present, so unpack it 

1969 unpack_file( 

1970 path_to_zipped_corpus, 

1971 data_folder, 

1972 mode="zip", 

1973 keep=True 

1974 ) 

1975 

1976 super(NER_HUNGARIAN, self).__init__( 

1977 data_folder, 

1978 columns, 

1979 train_file='hun_ner_corpus.txt', 

1980 column_delimiter='\t', 

1981 tag_to_bioes=tag_to_bioes, 

1982 encoding="latin-1", 

1983 in_memory=in_memory, 

1984 label_name_map={'0': 'O'}, 

1985 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

1986 **corpusargs, 

1987 ) 

1988 

1989 

1990class NER_ICELANDIC(ColumnCorpus): 

1991 def __init__( 

1992 self, 

1993 base_path: Union[str, Path] = None, 

1994 tag_to_bioes: str = "ner", 

1995 in_memory: bool = True, 

1996 **corpusargs, 

1997 ): 

1998 """ 

1999 Initialize the NER_ICELANDIC corpus. The first time you call this constructor it will automatically 

2000 download the dataset. 

2001 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2002 to point to a different folder but typically this should not be necessary. 

2003 :param tag_to_bioes: NER by default; the corpus only carries NER annotation, so this 

2004 need not be changed 

2005 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2006 

2007 """ 

2008 if type(base_path) == str: 

2009 base_path: Path = Path(base_path) 

2010 

2011 # column format 

2012 columns = {0: "text", 1: "ner"} 

2013 

2014 # this dataset name 

2015 dataset_name = self.__class__.__name__.lower() 

2016 

2017 # default dataset folder is the cache root 

2018 if not base_path: 

2019 base_path = flair.cache_root / "datasets" 

2020 data_folder = base_path / dataset_name 

2021 

2022 if not os.path.isfile(data_folder / 'icelandic_ner.txt'): 

2023 # download zip 

2024 icelandic_ner = "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip" 

2025 icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name) 

2026 

2027 # unpacking the zip 

2028 unpack_file( 

2029 icelandic_ner_path, 

2030 data_folder, 

2031 mode="zip", 

2032 keep=True 

2033 ) 

2034 outputfile = os.path.abspath(data_folder) 

2035 

2036 # merge the files into one, as the zip contains multiple files 

2037 

2038 with open(outputfile / data_folder / "icelandic_ner.txt", "wb") as outfile: 

2039 for files in os.walk(outputfile / data_folder): 

2040 f = files[2] 

2041 

2042 for i in range(len(f)): 

2043 if f[i].endswith('.txt'): 

2044 with open(outputfile / data_folder / f[i], 'rb') as infile: 

2045 contents = infile.read() 

2046 outfile.write(contents) 

2047 

2048 super(NER_ICELANDIC, self).__init__( 

2049 data_folder, 

2050 columns, 

2051 train_file='icelandic_ner.txt', 

2052 tag_to_bioes=tag_to_bioes, 

2053 in_memory=in_memory, 

2054 **corpusargs, 

2055 ) 

2056 

2057 

2058class NER_JAPANESE(ColumnCorpus): 

2059 def __init__( 

2060 self, 

2061 base_path: Union[str, Path] = None, 

2062 tag_to_bioes: str = "ner", 

2063 in_memory: bool = True, 

2064 **corpusargs, 

2065 ): 

2066 """ 

2067 Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically 

2068 download the dataset. 

2069 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2070 to point to a different folder but typically this should not be necessary. 

2071 :param tag_to_bioes: NER by default. 

2072 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2073 """ 

2074 if type(base_path) == str: 

2075 base_path: Path = Path(base_path) 

2076 

2077 # column format 

2078 columns = {0: 'text', 1: 'ner'} 

2079 

2080 # this dataset name 

2081 dataset_name = self.__class__.__name__.lower() 

2082 

2083 # default dataset folder is the cache root 

2084 if not base_path: 

2085 base_path = flair.cache_root / "datasets" 

2086 data_folder = base_path / dataset_name 

2087 

2088 # download data from github if necessary (hironsan.txt, ja.wikipedia.conll) 

2089 IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/" 

2090 

2091 # download files if not present locally 

2092 cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw') 

2093 cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw') 

2094 

2095 # we need to modify the original files by adding new lines after the end of each sentence 

2096 train_data_file = data_folder / 'train.txt' 

2097 if not train_data_file.is_file(): 

2098 self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt') 

2099 self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt') 

2100 

2101 super(NER_JAPANESE, self).__init__( 

2102 data_folder, 

2103 columns, 

2104 train_file='train.txt', 

2105 tag_to_bioes=tag_to_bioes, 

2106 in_memory=in_memory, 

2107 **corpusargs, 

2108 ) 

2109 

2110 @staticmethod 

2111 def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): 

2112 with open(file_in, 'r') as f: 

2113 lines = f.readlines() 

2114 with open(file_out, 'a') as f: 

2115 for line in lines: 

2116 if (line[0] == "。"): 

2117 f.write(line) 

2118 f.write("\n") 

2119 elif (line[0] == "\n"): 

2120 continue 

2121 else: 

2122 f.write(line) 

2123 

2124 @staticmethod 

2125 def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): 

2126 with open(file_in, 'r') as f: 

2127 lines = f.readlines() 

2128 with open(file_out, 'a') as f: 

2129 for line in lines: 

2130 sp_line = line.split("\t") 

2131 if (sp_line[0] == "\n"): 

2132 f.write("\n") 

2133 else: 

2134 f.write(sp_line[0] + "\t" + sp_line[len(sp_line) - 1]) 

2135 
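A sketch of what the two preparation helpers do, on made-up rows: the Wikipedia helper inserts a blank line after each sentence-final "。" token, and the WikiNews helper keeps only the first and last tab-separated columns:

# Wikipedia file: a blank line is appended after the "。" row, separating sentences.
#   東京 B-LOC / 。 O  ->  東京 B-LOC / 。 O / (blank line)

# WikiNews file: only the token and the final (NER) column survive.
parts = "東京\t名詞\tB-LOC\n".split("\t")
assert parts[0] + "\t" + parts[-1] == "東京\tB-LOC\n"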

2136 

2137class NER_MASAKHANE(MultiCorpus): 

2138 def __init__( 

2139 self, 

2140 languages: Union[str, List[str]] = "luo", 

2141 base_path: Union[str, Path] = None, 

2142 tag_to_bioes: str = "ner", 

2143 in_memory: bool = True, 

2144 **corpusargs, 

2145 ): 

2146 """ 

2147 Initialize the Masakhane corpus available on https://github.com/masakhane-io/masakhane-ner/tree/main/data. 

2148 It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus 

2149 with the languages you require. If you pass "all", all languages will be initialized. 

2150 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2151 to point to a different folder but typically this should not be necessary. 

2152 :param tag_to_bioes: NER by default; the corpus only carries NER annotation, so this 

2153 need not be changed 

2154 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2155 """ 

2156 if type(base_path) == str: 

2157 base_path: Path = Path(base_path) 

2158 

2159 # if only one language is given 

2160 if type(languages) == str: 

2161 languages = [languages] 

2162 

2163 # column format 

2164 columns = {0: "text", 1: "ner"} 

2165 

2166 # this dataset name 

2167 dataset_name = self.__class__.__name__.lower() 

2168 

2169 # default dataset folder is the cache root 

2170 if not base_path: 

2171 base_path = flair.cache_root / "datasets" 

2172 data_folder = base_path / dataset_name 

2173 

2174 language_to_code = {"amharic": "amh", 

2175 "hausa": "hau", 

2176 "igbo": "ibo", 

2177 "kinyarwanda": "kin", 

2178 "luganda": "lug", 

2179 "luo": "luo", 

2180 "naija": "pcm", 

2181 "swahili": "swa", 

2182 "yoruba": "yor", 

2183 "wolof": "wol", 

2184 } 

2185 

2186 # use all languages if explicitly set to "all" 

2187 if languages == ["all"]: languages = list(language_to_code.values()) 

2188 

2189 corpora = [] 

2190 for language in languages: 

2191 

2192 if language in language_to_code.keys(): 

2193 language = language_to_code[language] 

2194 

2195 if language not in language_to_code.values(): 

2196 log.error(f"Language '{language}' is not in list of supported languages!") 

2197 log.error(f"Supported are {list(language_to_code.values())}!") 

2198 log.error(f"Instantiate this Corpus for instance like so 'corpus = NER_MASAKHANE(languages='luo')'") 

2199 raise Exception() 

2200 

2201 language_folder = data_folder / language 

2202 

2203 # download data if necessary 

2204 data_path = f"https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/{language}/" 

2205 cached_path(f"{data_path}dev.txt", language_folder) 

2206 cached_path(f"{data_path}test.txt", language_folder) 

2207 cached_path(f"{data_path}train.txt", language_folder) 

2208 

2209 # initialize ColumnCorpus and add it to list 

2210 log.info(f"Reading data for language {language}") 

2211 corp = ColumnCorpus(data_folder=language_folder, 

2212 column_format=columns, 

2213 tag_to_bioes=tag_to_bioes, 

2214 encoding="utf-8", 

2215 in_memory=in_memory, 

2216 name=language, 

2217 **corpusargs, 

2218 ) 

2219 corpora.append(corp) 

2220 

2221 super(NER_MASAKHANE, self).__init__( 

2222 corpora, 

2223 name='masakhane-' + '-'.join(languages), 

2224 ) 

2225 
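A usage sketch for the multi-language constructor (assumes network access; full names and codes are interchangeable thanks to the mapping above):

from flair.datasets import NER_MASAKHANE

corpus = NER_MASAKHANE(languages=["yoruba", "swa"])  # mixes a full name and a code
print(corpus)  # a MultiCorpus covering both languages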

2226 

2227class NER_MULTI_WIKIANN(MultiCorpus): 

2228 def __init__( 

2229 self, 

2230 languages: Union[str, List[str]] = "en", 

2231 base_path: Union[str, Path] = None, 

2232 tag_to_bioes: str = "ner", 

2233 in_memory: bool = False, 

2234 **corpusargs, 

2235 ): 

2236 """ 

2237 WikiAnn corpus for cross-lingual NER, consisting of datasets from 282 languages that exist 

2238 in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their 

2239 respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/) 

2240 Parameters 

2241 ---------- 

2242 languages : Union[str, List[str]] 

2243 Should be an abbreviation of a language ("en", "de", ...) or a list of abbreviations. 

2244 The datasets of all passed languages will be saved in one MultiCorpus. 

2245 (Note that, even though listed on https://elisa-ie.github.io/wikiann/, some datasets are empty. 

2246 This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) 

2247 base_path : Union[str, Path], optional 

2248 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2249 to point to a different folder but typically this should not be necessary. 

2250 tag_to_bioes : str, optional 

2251 The data is in BIO format. It will by default (with the string "ner" as value) be transformed 

2252 into the BIOES format. If you don't want that, set it to None. 

2253 

2254 """ 

2255 if type(languages) == str: 

2256 languages = [languages] 

2257 

2258 if type(base_path) == str: 

2259 base_path: Path = Path(base_path) 

2260 

2261 # column format 

2262 columns = {0: "text", 1: "ner"} 

2263 

2264 # this dataset name 

2265 dataset_name = self.__class__.__name__.lower() 

2266 

2267 # default dataset folder is the cache root 

2268 if not base_path: 

2269 base_path = flair.cache_root / "datasets" 

2270 data_folder = base_path / dataset_name 

2271 

2272 # For each language in languages, the file is downloaded if it does not exist. 

2273 # Then a ColumnCorpus of that data is created and saved in a list, 

2274 # and this list is handed to the MultiCorpus. 

2275 

2276 # list that contains the ColumnCorpora 

2277 corpora = [] 

2278 

2279 google_drive_path = 'https://drive.google.com/uc?id=' 

2280 # download data if necessary 

2281 first = True 

2282 for language in languages: 

2283 

2284 language_folder = data_folder / language 

2285 file_name = 'wikiann-' + language + '.bio' 

2286 

2287 # if language not downloaded yet, download it 

2288 if not language_folder.exists(): 

2289 if first: 

2290 import gdown 

2291 import tarfile 

2292 first = False 

2293 # create folder 

2294 os.makedirs(language_folder) 

2295 # get google drive id from list 

2296 google_id = self._google_drive_id_from_language_name(language) 

2297 url = google_drive_path + google_id 

2298 

2299 # download from google drive 

2300 gdown.download(url, str(language_folder / language) + '.tar.gz') 

2301 

2302 # unzip 

2303 log.info("Extracting data...") 

2304 tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") 

2305 # tar.extractall(language_folder,members=[tar.getmember(file_name)]) 

2306 tar.extract(file_name, str(language_folder)) 

2307 tar.close() 

2308 log.info('...done.') 

2309 

2310 # transform data into required format 

2311 # the processed dataset has the additional ending "_new" 

2312 log.info("Processing dataset...") 

2313 self._silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) 

2314 # remove the unprocessed dataset 

2315 os.remove(str(language_folder / file_name)) 

2316 log.info('...done.') 

2317 

2318 # initialize ColumnCorpus and add it to list 

2319 log.info(f"Reading data for language {language}") 

2320 corp = ColumnCorpus(data_folder=language_folder, 

2321 column_format=columns, 

2322 train_file=file_name + '_new', 

2323 tag_to_bioes=tag_to_bioes, 

2324 in_memory=in_memory, 

2325 **corpusargs, 

2326 ) 

2327 corpora.append(corp) 

2328 log.info("...done.") 

2329 

2330 super(NER_MULTI_WIKIANN, self).__init__( 

2331 corpora, name='wikiann', 

2332 ) 

2333 

2334 def _silver_standard_to_simple_ner_annotation(self, data_file: Union[str, Path]): 

2335 f_read = open(data_file, 'r', encoding='utf-8') 

2336 f_write = open(data_file + '_new', 'w+', encoding='utf-8') 

2337 while True: 

2338 line = f_read.readline() 

2339 if line: 

2340 if line == '\n': 

2341 f_write.write(line) 

2342 else: 

2343 liste = line.split() 

2344 f_write.write(liste[0] + ' ' + liste[-1] + '\n') 

2345 else: 

2346 break 

2347 f_read.close() 

2348 f_write.close() 

2349 
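The raw silver-standard rows carry extra columns; the helper above keeps only the token (first field) and the NER tag (last field). A one-row sketch with made-up data:

liste = "Berlin extra columns B-LOC".split()
assert liste[0] + ' ' + liste[-1] == "Berlin B-LOC"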

2350 def _google_drive_id_from_language_name(self, language): 

2351 languages_ids = { 

2352 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer 

2353 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', 

2354 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', 

2355 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', 

2356 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', 

2357 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', 

2358 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', 

2359 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', 

2360 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', 

2361 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', 

2362 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', 

2363 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', 

2364 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', 

2365 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', 

2366 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', 

2367 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', 

2368 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', 

2369 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', 

2370 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', 

2371 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', 

2372 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', 

2373 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', 

2374 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', 

2375 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', 

2376 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', 

2377 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', 

2378 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', 

2379 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', 

2380 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', 

2381 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', 

2382 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', 

2383 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', 

2384 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', 

2385 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', 

2386 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', 

2387 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', 

2388 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', 

2389 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', 

2390 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', 

2391 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', 

2392 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', 

2393 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', 

2394 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', 

2395 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer 

2396 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', 

2397 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', 

2398 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', 

2399 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', 

2400 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', 

2401 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', 

2402 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', 

2403 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', 

2404 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', 

2405 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', 

2406 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', 

2407 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', 

2408 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', 

2409 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', 

2410 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', 

2411 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', 

2412 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', 

2413 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', 

2414 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', 

2415 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', 

2416 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', 

2417 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', 

2418 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', 

2419 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', 

2420 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', 

2421 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', 

2422 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', 

2423 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', 

2424 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', 

2425 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', 

2426 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', 

2427 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', 

2428 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', 

2429 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', 

2430 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', 

2431 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', 

2432 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', 

2433 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', 

2434 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', 

2435 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', 

2436 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', 

2437 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', 

2438 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', 

2439 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', 

2440 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', 

2441 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', 

2442 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', 

2443 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', 

2444 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', 

2445 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', 

2446 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', 

2447 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', 

2448 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', 

2449 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', 

2450 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer 

2451 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', 

2452 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', 

2453 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 

2454 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', 

2455 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', 

2456 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer 

2457 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', 

2458 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', 

2459 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', 

2460 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', 

2461 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer 

2462 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', 

2463 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', 

2464 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', 

2465 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', 

2466 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', 

2467 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', 

2468 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', 

2469 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer 

2470 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', 

2471 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', 

2472 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', 

2473 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', 

2474 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', 

2475 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', 

2476 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', 

2477 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', 

2478 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer 

2479 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', 

2480 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', 

2481 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', 

2482 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', 

2483 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', 

2484 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', 

2485 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer 

2486 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', 

2487 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', 

2488 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', 

2489 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', 

2490 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', 

2491 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', 

2492 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', 

2493 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', 

2494 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', 

2495 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', 

2496 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', 

2497 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', 

2498 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', 

2499 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', 

2500 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', 

2501 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', 

2502 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', 

2503 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', 

2504 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', 

2505 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', 

2506 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', 

2507 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', 

2508 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', 

2509 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', 

2510 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', 

2511 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', 

2512 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', 

2513 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', 

2514 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', 

2515 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', 

2516 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', 

2517 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', 

2518 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', 

2519 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', 

2520 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', 

2521 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', 

2522 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', 

2523 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', 

2524 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer 

2525 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', 

2526 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', 

2527 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', 

2528 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', 

2529 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', 

2530 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', 

2531 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', 

2532 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', 

2533 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', 

2534 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', 

2535 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', 

2536 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', 

2537 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', 

2538 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', 

2539 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', 

2540 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', 

2541 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', 

2542 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', 

2543 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', 

2544 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', 

2545 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', 

2546 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer 

2547 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', 

2548 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', 

2549 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', 

2550 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', 

2551 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', 

2552 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', 

2553 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', 

2554 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', 

2555 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', 

2556 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', 

2557 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', 

2558 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', 

2559 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', 

2560 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', 

2561 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', 

2562 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', 

2563 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', 

2564 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', 

2565 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', 

2566 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', 

2567 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', 

2568 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', 

2569 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', 

2570 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', 

2571 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', 

2572 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', 

2573 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', 

2574 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', 

2575 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', 

2576 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', 

2577 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', 

2578 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', 

2579 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', 

2580 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', 

2581 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', 

2582 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', 

2583 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', 

2584 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', 

2585 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', 

2586 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', 

2587 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', 

2588 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', 

2589 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', 

2590 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', 

2591 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', 

2592 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', 

2593 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', 

2594 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', 

2595 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', 

2596 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', 

2597 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', 

2598 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', 

2599 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', 

2600 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', 

2601 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', 

2602 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer 

2603 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', 

2604 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', 

2605 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', 

2606 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', 

2607 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', 

2608 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', 

2609 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', 

2610 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', 

2611 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', 

2612 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', 

2613 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', 

2614 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', 

2615 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', 

2616 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', 

2617 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', 

2618 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', 

2619 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', 

2620 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', 

2621 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', 

2622 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', 

2623 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', 

2624 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', 

2625 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', 

2626 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', 

2627 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', 

2628 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', 

2629 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', 

2630 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', 

2631 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', 

2632 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', 

2633 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', 

2634 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', 

2635 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', 

2636 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', 

2637 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', 

2638 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', 

2639 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', 

2640 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', 

2641 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', 

2642 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', 

2643 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', 

2644 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', 

2645 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', 

2646 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' 

2647 } 

2648 return languages_ids[language] 

2649 

2650 

2651class NER_MULTI_XTREME(MultiCorpus): 

2652 def __init__( 

2653 self, 

2654 languages: Union[str, List[str]] = "en", 

2655 base_path: Union[str, Path] = None, 

2656 tag_to_bioes: str = "ner", 

2657 in_memory: bool = False, 

2658 **corpusargs, 

2659 ): 

2660 """ 

2661 XTREME corpus for cross-lingual NER, consisting of datasets from a total of 176 languages. The data comes from the Google 

2662 research project XTREME https://github.com/google-research/xtreme. All datasets for NER and the respective language abbreviations (e.g. 

2663 "en" for English) can be found here: https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 

2664 The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) 

2665 

2666 Parameters 

2667 ---------- 

2668 languages : Union[str, List[str]], optional 

2669 By default English ("en") is loaded; passing None or an empty list loads the 40 languages used in XTREME. Otherwise one can hand over a string or a list of strings 

2670 consisting of abbreviations for languages. All datasets will be loaded into one MultiCorpus object. 

2671 base_path : Union[str, Path], optional 

2672 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2673 to point to a different folder but typically this should not be necessary. 

2674 tag_to_bioes : str, optional 

2675 The data is in BIO format. It will by default (with the string "ner" as value) be transformed 

2676 into the BIOES format. If you don't want that, set it to None. 

2677 

2678 """ 

2679 # if no languages are given as argument all languages used in XTREME will be loaded 

2680 if not languages: 

2681 languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", 

2682 "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", 

2683 "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] 

2684 

2685 # if only one language is given 

2686 if type(languages) == str: 

2687 languages = [languages] 

2688 

2689 if type(base_path) == str: 

2690 base_path: Path = Path(base_path) 

2691 

2692 # column format 

2693 columns = {0: "text", 1: "ner"} 

2694 

2695 # this dataset name 

2696 dataset_name = self.__class__.__name__.lower() 

2697 

2698 # default dataset folder is the cache root 

2699 if not base_path: 

2700 base_path = flair.cache_root / "datasets" 

2701 data_folder = base_path / dataset_name 

2702 

2703 # For each language in languages, the file is downloaded if not existent 

2704 # Then a comlumncorpus of that data is created and saved in a list 

2705 # This list is handed to the multicorpus 

2706 

2707 # list that contains the columncopora 

2708 corpora = [] 

2709 

2710 hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" 

2711 

2712 # download data if necessary 

2713 for language in languages: 

2714 

2715 language_folder = data_folder / language 

2716 

2717 # if language not downloaded yet, download it 

2718 if not language_folder.exists(): 

2719 

2720 file_name = language + '.tar.gz' 

2721 # create folder 

2722 os.makedirs(language_folder) 

2723 

2724 # download from HU Server 

2725 temp_file = cached_path( 

2726 hu_path + "/" + file_name, 

2727 Path("datasets") / dataset_name / language 

2728 ) 

2729 

2730 # unzip 

2731 log.info("Extracting data...") 

2732 import tarfile 

2733 tar = tarfile.open(str(temp_file), "r:gz") 

2734 for part in ["train", "test", "dev"]: 

2735 tar.extract(part, str(language_folder)) 

2736 tar.close() 

2737 log.info('...done.') 

2738 

2739 # transform data into required format 

2740 log.info("Processing dataset...") 

2741 for part in ["train", "test", "dev"]: 

2742 self._xtreme_to_simple_ner_annotation(str(language_folder / part)) 

2743 log.info('...done.') 

2744 

2745 # initialize ColumnCorpus and add it to list 

2746 log.info(f"Reading data for language {language}") 

2747 corp = ColumnCorpus(data_folder=language_folder, 

2748 column_format=columns, 

2749 tag_to_bioes=tag_to_bioes, 

2750 in_memory=in_memory, 

2751 **corpusargs, 

2752 ) 

2753 corpora.append(corp) 

2754 

2755 super(NER_MULTI_XTREME, self).__init__( 

2756 corpora, name='xtreme', 

2757 ) 

2758 

2759 def _xtreme_to_simple_ner_annotation(self, data_file: Union[str, Path]): 

2760 with open(data_file, 'r', encoding='utf-8') as f: 

2761 lines = f.readlines() 

2762 with open(data_file, 'w', encoding='utf-8') as f: 

2763 for line in lines: 

2764 if line == '\n': 

2765 f.write(line) 

2766 else: 

2767 liste = line.split() 

2768 f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') 

2769 
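In the panx files each token is prefixed with its language code; the helper above strips that prefix and keeps the tag. A one-row sketch with made-up data:

liste = "en:Tokyo B-LOC".split()
assert liste[0].split(':', 1)[1] + ' ' + liste[1] == "Tokyo B-LOC"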

2770 

2771class NER_MULTI_WIKINER(MultiCorpus): 

2772 def __init__( 

2773 self, 

2774 languages: Union[str, List[str]] = "en", 

2775 base_path: Union[str, Path] = None, 

2776 tag_to_bioes: str = "ner", 

2777 in_memory: bool = False, 

2778 **corpusargs, 

2779 ): 

2780 if type(base_path) == str: 

2781 base_path: Path = Path(base_path) 

2782 

2783 # if only one language is given 

2784 if type(languages) == str: 

2785 languages = [languages] 

2786 

2787 # column format 

2788 columns = {0: "text", 1: "pos", 2: "ner"} 

2789 

2790 # this dataset name 

2791 dataset_name = self.__class__.__name__.lower() 

2792 

2793 # default dataset folder is the cache root 

2794 if not base_path: 

2795 base_path = flair.cache_root / "datasets" 

2796 data_folder = base_path / dataset_name 

2797 

2798 corpora = [] 

2799 for language in languages: 

2800 language_folder = data_folder / language 

2801 

2802 # download data if necessary 

2803 self._download_wikiner(language, language_folder) 

2804 

2805 # initialize ColumnCorpus and add it to list 

2806 log.info(f"Reading data for language {language}") 

2807 corp = ColumnCorpus(data_folder=language_folder, 

2808 column_format=columns, 

2809 tag_to_bioes=tag_to_bioes, 

2810 in_memory=in_memory, 

2811 **corpusargs, 

2812 ) 

2813 corpora.append(corp) 

2814 

2815 super(NER_MULTI_WIKINER, self).__init__( 

2816 corpora, name='wikiner', 

2817 ) 

2818 

2819 def _download_wikiner(self, language_code: str, dataset_name: Union[str, Path]): 

2820 # download data if necessary 

2821 wikiner_path = ( 

2822 "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" 

2823 ) 

2824 lc = language_code 

2825 

2826 data_file = ( 

2827 flair.cache_root 

2828 / "datasets" 

2829 / dataset_name 

2830 / f"aij-wikiner-{lc}-wp3.train" 

2831 ) 

2832 if not data_file.is_file(): 

2833 

2834 cached_path( 

2835 f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name 

2836 ) 

2837 import bz2 

2838 

2839 # unpack and write out in CoNLL column-like format 

2840 bz_file = bz2.BZ2File( 

2841 flair.cache_root 

2842 / "datasets" 

2843 / dataset_name 

2844 / f"aij-wikiner-{lc}-wp3.bz2", 

2845 "rb", 

2846 ) 

2847 with bz_file as f, open( 

2848 flair.cache_root 

2849 / "datasets" 

2850 / dataset_name 

2851 / f"aij-wikiner-{lc}-wp3.train", 

2852 "w", 

2853 encoding="utf-8" 

2854 ) as out: 

2855 for line in f: 

2856 line = line.decode("utf-8") 

2857 words = line.split(" ") 

2858 for word in words: 

2859 out.write("\t".join(word.split("|")) + "\n") 

2860 
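In the raw WikiNER files each line is one space-separated sentence whose tokens are pipe-separated token|pos|ner triples; the loop above writes them out one tab-separated token per line. A sketch with a made-up line:

line = "Paris|NPP|B-LOC est|V|O"
rows = ["\t".join(word.split("|")) for word in line.split(" ")]
assert rows == ["Paris\tNPP\tB-LOC", "est\tV\tO"]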

2861 

2862class NER_SWEDISH(ColumnCorpus): 

2863 def __init__( 

2864 self, 

2865 base_path: Union[str, Path] = None, 

2866 tag_to_bioes: str = "ner", 

2867 in_memory: bool = True, 

2868 **corpusargs, 

2869 ): 

2870 """ 

2871 Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically 

2872 download the dataset. 

2873 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2874 to point to a different folder but typically this should not be necessary. 

2875 :param tag_to_bioes: NER by default, should not be changed 

2876 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2877 """ 

2878 

2879 if type(base_path) == str: 

2880 base_path: Path = Path(base_path) 

2881 

2882 # column format 

2883 columns = {0: "text", 1: "ner"} 

2884 

2885 # this dataset name 

2886 dataset_name = self.__class__.__name__.lower() 

2887 

2888 # default dataset folder is the cache root 

2889 if not base_path: 

2890 base_path = flair.cache_root / "datasets" 

2891 data_folder = base_path / dataset_name 

2892 

2893 # download data if necessary 

2894 ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" 

2895 cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) 

2896 cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) 

2897 

2898 # the data is not in IOB2 format, so we transform it to IOB2 

2899 self._add_IOB2_tags(data_file=data_folder / "test_corpus.txt") 

2900 self._add_IOB2_tags(data_file=data_folder / "train_corpus.txt") 

2901 

2902 super(NER_SWEDISH, self).__init__( 

2903 data_folder, 

2904 columns, 

2905 tag_to_bioes=tag_to_bioes, 

2906 in_memory=in_memory, 

2907 **corpusargs, 

2908 ) 

2909 

2910 def _add_IOB2_tags(self, data_file: Union[str, Path], encoding: str = "utf8"): 

2911 """ 

2912 Adds IOB2 tags if only chunk names are provided (e.g. words tagged PER instead 

2913 of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag, since ColumnCorpus expects 

2914 the letter 'O'. Additionally, it removes lines without tags from the data file and can 

2915 also be used if the data is only partially IOB-tagged. 

2916 Parameters 

2917 ---------- 

2918 data_file : Union[str, Path] 

2919 Path to the data file. 

2920 encoding : str, optional 

2921 Encoding used in open function. The default is "utf8". 

2922 

2923 """ 

2924 with open(file=data_file, mode='r', encoding=encoding) as f: 

2925 lines = f.readlines() 

2926 with open(file=data_file, mode='w', encoding=encoding) as f: 

2927 pred = 'O'  # remembers the tag of the preceding line 

2928 for line in lines: 

2929 line_list = line.split() 

2930 if len(line_list) == 2: # word with tag 

2931 word = line_list[0] 

2932 tag = line_list[1] 

2933 if tag in ['0', 'O']: # no chunk 

2934 f.write(word + ' O\n') 

2935 pred = 'O' 

2936 elif '-' not in tag: # no IOB tags 

2937 if pred == 'O': # found a new chunk 

2938 f.write(word + ' B-' + tag + '\n') 

2939 pred = tag 

2940 else: # found further part of chunk or new chunk directly after old chunk 

2941 if pred == tag: 

2942 f.write(word + ' I-' + tag + '\n') 

2943 else: 

2944 f.write(word + ' B-' + tag + '\n') 

2945 pred = tag 

2946 else: # line already has IOB tag (tag contains '-') 

2947 f.write(line) 

2948 pred = tag.split('-')[1] 

2949 elif len(line_list) == 0: # empty line 

2950 f.write('\n') 

2951 pred = 'O' 

2952 

2953 
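# A short, hedged sketch of the IOB2 conversion performed by _add_IOB2_tags
# above: plain chunk tags become B-/I- prefixed tags, with a fresh B- whenever
# the tag changes from the preceding line. The Swedish tokens are made up for
# illustration:
# _demo_iob2([("Kalle", "PER"), ("Anka", "PER"), ("cyklar", "O")])
# -> [("Kalle", "B-PER"), ("Anka", "I-PER"), ("cyklar", "O")]
def _demo_iob2(pairs):
    pred, out = "O", []
    for word, tag in pairs:
        if tag in ("0", "O"):  # '0' is normalized to 'O', the no-chunk tag
            out.append((word, "O"))
            pred = "O"
        elif pred == tag:  # continuation of the current chunk
            out.append((word, "I-" + tag))
        else:  # new chunk (also directly after another chunk)
            out.append((word, "B-" + tag))
            pred = tag
    return out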

2954class NER_TURKU(ColumnCorpus): 

2955 def __init__( 

2956 self, 

2957 base_path: Union[str, Path] = None, 

2958 tag_to_bioes: str = "ner", 

2959 in_memory: bool = True, 

2960 **corpusargs, 

2961 ): 

2962 """ 

2963 Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically 

2964 download the dataset. 

2965 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

2966 to point to a different folder but typically this should not be necessary. 

2967 :param tag_to_bioes: NER by default; need not be changed, since this corpus 

2968 provides only NER annotations 

2969 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2971 """ 

2972 if isinstance(base_path, str): 

2973 base_path: Path = Path(base_path) 

2974 

2975 # column format 

2976 columns = {0: "text", 1: "ner"} 

2977 

2978 # this dataset name 

2979 dataset_name = self.__class__.__name__.lower() 

2980 

2981 # default dataset folder is the cache root 

2982 if not base_path: 

2983 base_path = flair.cache_root / "datasets" 

2984 data_folder = base_path / dataset_name 

2985 

2986 # download data if necessary 

2987 conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" 

2988 dev_file = "dev.tsv" 

2989 test_file = "test.tsv" 

2990 train_file = "train.tsv" 

2991 cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) 

2992 cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) 

2993 cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) 

2994 

2995 super(NER_TURKU, self).__init__( 

2996 data_folder, 

2997 columns, 

2998 dev_file=dev_file, 

2999 test_file=test_file, 

3000 train_file=train_file, 

3001 column_delimiter="\t", 

3002 tag_to_bioes=tag_to_bioes, 

3003 encoding="latin-1", 

3004 in_memory=in_memory, 

3005 document_separator_token="-DOCSTART-", 

3006 **corpusargs, 

3007 ) 

3008 

3009 
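# Hedged usage sketch for the corpus above: instantiating NER_TURKU triggers
# the download and the splits behave like any other ColumnCorpus. Wrapped in a
# function so nothing runs at import time; make_tag_dictionary is assumed to
# be the tag-vocabulary helper available on Corpus in this flair version.
def _example_load_turku():
    corpus = NER_TURKU(in_memory=True)
    print(corpus)  # train/dev/test sizes
    return corpus.make_tag_dictionary(tag_type="ner")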

3010class KEYPHRASE_SEMEVAL2017(ColumnCorpus): 

3011 def __init__( 

3012 self, 

3013 base_path: Union[str, Path] = None, 

3014 tag_to_bioes: str = "keyword", 

3015 in_memory: bool = True, 

3016 **corpusargs, 

3017 ): 

3018 

3019 if isinstance(base_path, str): 

3020 base_path: Path = Path(base_path) 

3021 

3022 # column format 

3023 columns = {0: "text", 1: "keyword"} 

3024 

3025 # this dataset name 

3026 dataset_name = self.__class__.__name__.lower() 

3027 

3028 # default dataset folder is the cache root 

3029 if not base_path: 

3030 base_path = flair.cache_root / "datasets" 

3031 data_folder = base_path / dataset_name 

3032 

3033 semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" 

3034 cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) 

3035 cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) 

3036 cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) 

3037 

3038 super(KEYPHRASE_SEMEVAL2017, self).__init__( 

3039 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3040 ) 

3041 

3042 

3043class KEYPHRASE_INSPEC(ColumnCorpus): 

3044 def __init__( 

3045 self, 

3046 base_path: Union[str, Path] = None, 

3047 tag_to_bioes: str = "keyword", 

3048 in_memory: bool = True, 

3049 **corpusargs, 

3050 ): 

3051 

3052 if isinstance(base_path, str): 

3053 base_path: Path = Path(base_path) 

3054 

3055 # column format 

3056 columns = {0: "text", 1: "keyword"} 

3057 

3058 # this dataset name 

3059 dataset_name = self.__class__.__name__.lower() 

3060 

3061 # default dataset folder is the cache root 

3062 if not base_path: 

3063 base_path = flair.cache_root / "datasets" 

3064 data_folder = base_path / dataset_name 

3065 

3066 inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" 

3067 cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) 

3068 cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) 

3069 if "dev.txt" not in os.listdir(data_folder): 

3070 cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) 

3071 # rename according to the train/test/dev naming convention 

3072 os.rename(data_folder / "valid.txt", data_folder / "dev.txt") 

3073 

3074 super(KEYPHRASE_INSPEC, self).__init__( 

3075 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3076 ) 

3077 

3078 

3079class KEYPHRASE_SEMEVAL2010(ColumnCorpus): 

3080 def __init__( 

3081 self, 

3082 base_path: Union[str, Path] = None, 

3083 tag_to_bioes: str = "keyword", 

3084 in_memory: bool = True, 

3085 **corpusargs, 

3086 ): 

3087 

3088 if isinstance(base_path, str): 

3089 base_path: Path = Path(base_path) 

3090 

3091 # column format 

3092 columns = {0: "text", 1: "keyword"} 

3093 

3094 # this dataset name 

3095 dataset_name = self.__class__.__name__.lower() 

3096 

3097 # default dataset folder is the cache root 

3098 if not base_path: 

3099 base_path = flair.cache_root / "datasets" 

3100 data_folder = base_path / dataset_name 

3101 

3102 semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" 

3103 cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) 

3104 cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) 

3105 

3106 super(KEYPHRASE_SEMEVAL2010, self).__init__( 

3107 data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, 

3108 ) 

3109 

3110 
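# The three keyphrase corpora above share the same two-column format, so they
# combine naturally for larger-scale keyphrase training. A hedged sketch using
# MultiCorpus (imported at the top of this module); note that
# KEYPHRASE_SEMEVAL2010 ships no dev split, so ColumnCorpus samples its dev
# data from train.
def _example_keyphrase_multicorpus():
    corpora = [KEYPHRASE_SEMEVAL2017(), KEYPHRASE_INSPEC(), KEYPHRASE_SEMEVAL2010()]
    return MultiCorpus(corpora, name="keyphrase")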

3111class UP_CHINESE(ColumnCorpus): 

3112 def __init__( 

3113 self, 

3114 base_path: Union[str, Path] = None, 

3115 in_memory: bool = True, 

3116 document_as_sequence: bool = False, 

3117 **corpusargs, 

3118 ): 

3119 """ 

3120 Initialize the Chinese dataset from the Universal Propositions Bank, coming from this webpage: 

3121 https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese 

3122 

3123 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3124 to point to a different folder but typically this should not be necessary. 

3125 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3126 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3127 """ 

3128 if isinstance(base_path, str): 

3129 base_path: Path = Path(base_path) 

3130 

3131 # column format 

3132 columns = {1: "text", 9: "frame"} 

3133 

3134 # this dataset name 

3135 dataset_name = self.__class__.__name__.lower() 

3136 

3137 # default dataset folder is the cache root 

3138 if not base_path: 

3139 base_path = flair.cache_root / "datasets" 

3140 data_folder = base_path / dataset_name 

3141 

3142 # download data if necessary 

3143 up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" 

3144 cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) 

3145 cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) 

3146 cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) 

3147 

3148 super(UP_CHINESE, self).__init__( 

3149 data_folder, 

3150 columns, 

3151 encoding="utf-8", 

3152 train_file="zh-up-train.conllu", 

3153 test_file="zh-up-test.conllu", 

3154 dev_file="zh-up-dev.conllu", 

3155 in_memory=in_memory, 

3156 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3157 comment_symbol="#", 

3158 **corpusargs, 

3159 ) 

3160 

3161 

3162class UP_ENGLISH(ColumnCorpus): 

3163 def __init__( 

3164 self, 

3165 base_path: Union[str, Path] = None, 

3166 in_memory: bool = True, 

3167 document_as_sequence: bool = False, 

3168 **corpusargs, 

3169 ): 

3170 """ 

3171 Initialize the English dataset from the Universal Propositions Bank, coming from this webpage: 

3172 https://github.com/System-T/UniversalPropositions. 

3173 

3174 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3175 to point to a different folder but typically this should not be necessary. 

3176 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3177 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3178 """ 

3179 if isinstance(base_path, str): 

3180 base_path: Path = Path(base_path) 

3181 

3182 # column format 

3183 columns = {1: "text", 10: "frame"} 

3184 

3185 # this dataset name 

3186 dataset_name = self.__class__.__name__.lower() 

3187 

3188 # default dataset folder is the cache root 

3189 if not base_path: 

3190 base_path = flair.cache_root / "datasets" 

3191 data_folder = base_path / dataset_name 

3192 

3193 # download data if necessary 

3194 up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" 

3195 cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) 

3196 cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) 

3197 cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) 

3198 

3199 super(UP_ENGLISH, self).__init__( 

3200 data_folder, 

3201 columns, 

3202 encoding="utf-8", 

3203 train_file="en_ewt-up-train.conllu", 

3204 test_file="en_ewt-up-test.conllu", 

3205 dev_file="en_ewt-up-dev.conllu", 

3206 in_memory=in_memory, 

3207 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3208 comment_symbol="#", 

3209 label_name_map={"_": "O"}, 

3210 **corpusargs, 

3211 ) 

3212 

3213 
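# A small sketch of what the label_name_map above does: in the UP English
# CoNLL-U files, tokens without a frame carry "_" in the frame column, and
# mapping "_" -> "O" turns them into the standard no-label tag. The mapping is
# applied per tag string, roughly like this ("run.01" is an illustrative frame
# label, not taken from the data):
def _demo_label_name_map(tag: str) -> str:
    label_name_map = {"_": "O"}
    return label_name_map.get(tag, tag)  # "_" -> "O", "run.01" -> "run.01"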

3214class UP_FRENCH(ColumnCorpus): 

3215 def __init__( 

3216 self, 

3217 base_path: Union[str, Path] = None, 

3218 in_memory: bool = True, 

3219 document_as_sequence: bool = False, 

3220 **corpusargs, 

3221 ): 

3222 """ 

3223 Initialize the French dataset from the Universal Propositions Bank, coming from this webpage: 

3224 https://github.com/System-T/UniversalPropositions. 

3225 

3226 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3227 to point to a different folder but typically this should not be necessary. 

3228 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3229 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3230 """ 

3231 if isinstance(base_path, str): 

3232 base_path: Path = Path(base_path) 

3233 

3234 # column format 

3235 columns = {1: "text", 9: "frame"} 

3236 

3237 # this dataset name 

3238 dataset_name = self.__class__.__name__.lower() 

3239 

3240 # default dataset folder is the cache root 

3241 if not base_path: 

3242 base_path = flair.cache_root / "datasets" 

3243 data_folder = base_path / dataset_name 

3244 

3245 # download data if necessary 

3246 up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" 

3247 cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) 

3248 cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) 

3249 cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) 

3250 

3251 super(UP_FRENCH, self).__init__( 

3252 data_folder, 

3253 columns, 

3254 encoding="utf-8", 

3255 train_file="fr-up-train.conllu", 

3256 test_file="fr-up-test.conllu", 

3257 dev_file="fr-up-dev.conllu", 

3258 in_memory=in_memory, 

3259 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3260 comment_symbol="#", 

3261 **corpusargs, 

3262 ) 

3263 

3264 

3265class UP_FINNISH(ColumnCorpus): 

3266 def __init__( 

3267 self, 

3268 base_path: Union[str, Path] = None, 

3269 in_memory: bool = True, 

3270 document_as_sequence: bool = False, 

3271 **corpusargs, 

3272 ): 

3273 """ 

3274 Initialize the Finnish dataset from the Universal Propositions Bank, coming from this webpage: 

3275 https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish 

3276 

3277 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3278 to point to a different folder but typically this should not be necessary. 

3279 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3280 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3281 """ 

3282 if isinstance(base_path, str): 

3283 base_path: Path = Path(base_path) 

3284 

3285 # column format 

3286 columns = {1: "text", 9: "frame"} 

3287 

3288 # this dataset name 

3289 dataset_name = self.__class__.__name__.lower() 

3290 

3291 # default dataset folder is the cache root 

3292 if not base_path: 

3293 base_path = flair.cache_root / "datasets" 

3294 data_folder = base_path / dataset_name 

3295 

3296 # download data if necessary 

3297 up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" 

3298 cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) 

3299 cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) 

3300 cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) 

3301 

3302 super(UP_FINNISH, self).__init__( 

3303 data_folder, 

3304 columns, 

3305 encoding="utf-8", 

3306 train_file="fi-up-train.conllu", 

3307 test_file="fi-up-test.conllu", 

3308 dev_file="fi-up-dev.conllu", 

3309 in_memory=in_memory, 

3310 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3311 comment_symbol="#", 

3312 **corpusargs, 

3313 ) 

3314 

3315 

3316class UP_GERMAN(ColumnCorpus): 

3317 def __init__( 

3318 self, 

3319 base_path: Union[str, Path] = None, 

3320 in_memory: bool = True, 

3321 document_as_sequence: bool = False, 

3322 **corpusargs, 

3323 ): 

3324 """ 

3325 Initialize the German dataset from the Universal Propositions Bank, coming from this webpage: 

3326 https://github.com/System-T/UniversalPropositions. 

3327 

3328 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3329 to point to a different folder but typically this should not be necessary. 

3330 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3331 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3332 """ 

3333 if isinstance(base_path, str): 

3334 base_path: Path = Path(base_path) 

3335 

3336 # column format 

3337 columns = {1: "text", 9: "frame"} 

3338 

3339 # this dataset name 

3340 dataset_name = self.__class__.__name__.lower() 

3341 

3342 # default dataset folder is the cache root 

3343 if not base_path: 

3344 base_path = flair.cache_root / "datasets" 

3345 data_folder = base_path / dataset_name 

3346 

3347 # download data if necessary 

3348 up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" 

3349 cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) 

3350 cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) 

3351 cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) 

3352 

3353 super(UP_GERMAN, self).__init__( 

3354 data_folder, 

3355 columns, 

3356 encoding="utf-8", 

3357 train_file="de-up-train.conllu", 

3358 test_file="de-up-test.conllu", 

3359 dev_file="de-up-dev.conllu", 

3360 in_memory=in_memory, 

3361 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3362 comment_symbol="#", 

3363 **corpusargs, 

3364 ) 

3365 

3366 

3367class UP_ITALIAN(ColumnCorpus): 

3368 def __init__( 

3369 self, 

3370 base_path: Union[str, Path] = None, 

3371 in_memory: bool = True, 

3372 document_as_sequence: bool = False, 

3373 **corpusargs, 

3374 ): 

3375 """ 

3376 Initialize the Italian dataset from the Universal Propositions Bank, coming from this webpage: 

3377 https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian 

3378 

3379 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3380 to point to a different folder but typically this should not be necessary. 

3381 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3382 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3383 """ 

3384 if isinstance(base_path, str): 

3385 base_path: Path = Path(base_path) 

3386 

3387 # column format 

3388 columns = {1: "text", 9: "frame"} 

3389 

3390 # this dataset name 

3391 dataset_name = self.__class__.__name__.lower() 

3392 

3393 # default dataset folder is the cache root 

3394 if not base_path: 

3395 base_path = flair.cache_root / "datasets" 

3396 data_folder = base_path / dataset_name 

3397 

3398 # download data if necessary 

3399 up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" 

3400 cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) 

3401 cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) 

3402 cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) 

3403 

3404 super(UP_ITALIAN, self).__init__( 

3405 data_folder, 

3406 columns, 

3407 encoding="utf-8", 

3408 train_file="it-up-train.conllu", 

3409 test_file="it-up-test.conllu", 

3410 dev_file="it-up-dev.conllu", 

3411 in_memory=in_memory, 

3412 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3413 comment_symbol="#", 

3414 **corpusargs, 

3415 ) 

3416 

3417 

3418class UP_SPANISH(ColumnCorpus): 

3419 def __init__( 

3420 self, 

3421 base_path: Union[str, Path] = None, 

3422 in_memory: bool = True, 

3423 document_as_sequence: bool = False, 

3424 **corpusargs, 

3425 ): 

3426 """ 

3427 Initialize the Spanish dataset from the Universal Propositions Bank, coming from this webpage: 

3428 https://github.com/System-T/UniversalPropositions 

3429 

3430 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3431 to point to a different folder but typically this should not be necessary. 

3432 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3433 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3434 """ 

3435 if isinstance(base_path, str): 

3436 base_path: Path = Path(base_path) 

3437 

3438 # column format 

3439 columns = {1: "text", 9: "frame"} 

3440 

3441 # this dataset name 

3442 dataset_name = self.__class__.__name__.lower() 

3443 

3444 # default dataset folder is the cache root 

3445 if not base_path: 

3446 base_path = flair.cache_root / "datasets" 

3447 data_folder = base_path / dataset_name 

3448 

3449 # download data if necessary 

3450 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" 

3451 cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) 

3452 cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) 

3453 cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) 

3454 

3455 super(UP_SPANISH, self).__init__( 

3456 data_folder, 

3457 columns, 

3458 encoding="utf-8", 

3459 train_file="es-up-train.conllu", 

3460 test_file="es-up-test.conllu", 

3461 dev_file="es-up-dev.conllu", 

3462 in_memory=in_memory, 

3463 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3464 comment_symbol="#", 

3465 **corpusargs, 

3466 ) 

3467 

3468 

3469class UP_SPANISH_ANCORA(ColumnCorpus): 

3470 def __init__( 

3471 self, 

3472 base_path: Union[str, Path] = None, 

3473 in_memory: bool = True, 

3474 document_as_sequence: bool = False, 

3475 **corpusargs, 

3476 ): 

3477 """ 

3478 Initialize the Spanish AnCora dataset from the Universal Propositions Bank, coming from this webpage: 

3479 https://github.com/System-T/UniversalPropositions 

3480 

3481 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

3482 to point to a different folder but typically this should not be necessary. 

3483 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3484 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

3485 """ 

3486 if isinstance(base_path, str): 

3487 base_path: Path = Path(base_path) 

3488 

3489 # column format 

3490 columns = {1: "text", 9: "frame"} 

3491 

3492 # this dataset name 

3493 dataset_name = self.__class__.__name__.lower() 

3494 

3495 # default dataset folder is the cache root 

3496 if not base_path: 

3497 base_path = flair.cache_root / "datasets" 

3498 data_folder = base_path / dataset_name 

3499 

3500 # download data if necessary 

3501 up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" 

3502 cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) 

3503 cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) 

3504 cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) 

3505 

3506 super(UP_SPANISH_ANCORA, self).__init__( 

3507 data_folder, 

3508 columns, 

3509 encoding="utf-8", 

3510 train_file="es_ancora-up-train.conllu", 

3511 test_file="es_ancora-up-test.conllu", 

3512 dev_file="es_ancora-up-dev.conllu", 

3513 in_memory=in_memory, 

3514 document_separator_token=None if not document_as_sequence else "-DOCSTART-", 

3515 comment_symbol="#", 

3516 **corpusargs, 

3517 )
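# Hedged closing sketch: the UP_* corpora above all expose the same "frame"
# column, so they can be combined into a single MultiCorpus for multilingual
# frame tagging. Wrapped in a function so nothing is downloaded at import time.
def _example_multilingual_up():
    corpora = [UP_ENGLISH(), UP_GERMAN(), UP_FRENCH()]
    return MultiCorpus(corpora, name="universal_propositions")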