Coverage for /home/ubuntu/Documents/Research/mut_p1/flair/flair/datasets/entity_linking.py: 7% of 708 statements covered


1import csv 

2import logging 

3import os 

4from pathlib import Path 

5from typing import Union, List, Dict 

6 

7import requests 

8 

9import flair 

10from flair.data import Dictionary, Sentence, MultiCorpus 

11from flair.datasets import ColumnCorpus 

12from flair.file_utils import cached_path, unpack_file 

13from flair.tokenization import SentenceSplitter, SegtokSentenceSplitter 

14 

15log = logging.getLogger("flair") 

16 

17 

18class EntityLinkingCorpus(ColumnCorpus): 

19 def __init__( 

20 self, 

21 data_folder, 

22 train_file, 

23 columns={0: "text", 1: "nel"}, 

24 column_delimiter="\t", 

25 in_memory=True, 

26 document_separator_token='-DOCSTART-', 

27 **corpusargs, 

28 ): 

29 """ 

30 Super class for all entity linking corpora. Expects the data to be in column format with one column for words and another one for BIO-tags and wikipedia-page 

31 name, e.g. B-Brad_Pitt. 

32 The class provides the function make_entity_dict to create an entity dictionary suited for entity linking. 

33 """ 

34 # TODO: Add a routine, that checks annotations for some widespread errors/inconsistencies??? (e.g. in AQUAINT corpus Iran-Iraq_War vs. Iran-Iraq_war) 

35 

36 super(EntityLinkingCorpus, self).__init__( 

37 data_folder, 

38 columns, 

39 train_file=train_file, 

40 column_delimiter=column_delimiter, 

41 in_memory=in_memory, 

42 document_separator_token=document_separator_token, 

43 **corpusargs, 

44 ) 

45 

46 def make_entity_dict(self, label_type='nel', threshold: int = 1) -> Dictionary: 

47 """ 

48 Create ID-dictionary for the wikipedia-page names. 

49 param threshold: Ignore links that occur less than threshold value 

50 

51 In entity_occurences all wikinames and their number of occurence is saved. 

52 ent_dictionary contains all wikinames that occure at least threshold times and gives each name an ID 

53 """ 

54 self.threshold = threshold 

55 self.entity_occurences = {} 

56 self.total_number_of_entity_mentions = 0 

57 

58 for sentence in self.get_all_sentences(): 

59 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences 

60 

61 spans = sentence.get_spans(label_type) 

62 for span in spans: 

63 annotation = span.tag 

64 self.total_number_of_entity_mentions += 1 

65 if annotation in self.entity_occurences: 

66 self.entity_occurences[annotation] += 1 

67 else: 

68 self.entity_occurences[annotation] = 1 

69 

70 self.number_of_entities = len(self.entity_occurences) 

71 

72 # Create the annotation dictionary 

73 self.ent_dictionary: Dictionary = Dictionary(add_unk=True) 

74 

75 for x in self.entity_occurences: 

76 if self.entity_occurences[x] >= threshold: 

77 self.ent_dictionary.add_item(x) 

78 

79 return self.ent_dictionary 

80 

# this function removes every second span whose label is unknown to the entity dictionary

82 def remove_unknowns(self): 

83 remove = True 

84 for sentence in self.get_all_sentences(): 

85 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences 

86 

87 spans = sentence.get_spans('nel') 

88 for span in spans: 

89 annotation = span.tag 

90 if self.ent_dictionary.get_idx_for_item(annotation) == 0: # unknown label 

91 if remove: 

92 for token in span: 

93 token.remove_labels('nel') 

94 remove = False 

95 else: 

96 remove = True 

97 

98 
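# --- Usage sketch (added for illustration, not part of the original module) ---
# How the entity dictionary and the unknown-label pruning defined above are
# typically combined. Any EntityLinkingCorpus subclass works; NEL_ENGLISH_TWEEKI
# (defined further below) and the threshold of 2 are arbitrary example choices.
def _example_entity_dictionary():
    corpus = NEL_ENGLISH_TWEEKI()
    # keep only wikinames that occur at least twice; everything else maps to <unk> (index 0)
    entity_dict = corpus.make_entity_dict(label_type='nel', threshold=2)
    print(f"{corpus.number_of_entities} distinct entities, {len(entity_dict)} kept after thresholding")
    # drop every second mention whose wikiname fell below the threshold
    corpus.remove_unknowns()
    return entity_dict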

99class NEL_ENGLISH_AQUAINT(EntityLinkingCorpus): 

100 def __init__( 

101 self, 

102 base_path: Union[str, Path] = None, 

103 in_memory: bool = True, 

104 agreement_threshold: float = 0.5, 

105 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), 

106 **corpusargs, 

107 ): 

108 """ 

109 Initialize Aquaint Entity Linking corpus introduced in: D. Milne and I. H. Witten. 

110 Learning to link with wikipedia 

111 (https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf). 

112 If you call the constructor the first time the dataset gets automatically downloaded and transformed in 

113 tab-separated column format (aquaint.txt). 

114 

115 Parameters 

116 ---------- 

117 base_path : Union[str, Path], optional 

118 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

119 to point to a different folder but typically this should not be necessary. 

120 in_memory: If True, keeps dataset in memory giving speedups in training. 

121 agreement_threshold: Some link annotations come with an agreement_score representing the agreement from the human annotators. The score ranges from lowest 0.2 

122 to highest 1.0. The lower the score, the less "important" is the entity because fewer annotators thought it was worth linking. 

123 Default is 0.5 which means the majority of annotators must have annoteted the respective entity mention. 

124 """ 

125 if type(base_path) == str: 

126 base_path: Path = Path(base_path) 

127 

128 self.agreement_threshold = agreement_threshold 

129 

130 # this dataset name 

131 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__ 

132 

133 # default dataset folder is the cache root 

134 if not base_path: 

135 base_path = flair.cache_root / "datasets" 

136 data_folder = base_path / dataset_name 

137 

138 aquaint_el_path = "https://www.nzdl.org/wikification/data/wikifiedStories.zip" 

139 corpus_file_name = "aquaint.txt" 

140 parsed_dataset = data_folder / corpus_file_name 

141 

142 # download and parse data if necessary 

143 if not parsed_dataset.exists(): 

144 aquaint_el_zip = cached_path(f"{aquaint_el_path}", Path("datasets") / dataset_name) 

145 unpack_file(aquaint_el_zip, data_folder, "zip", False) 

146 

147 try: 

148 with open(parsed_dataset, "w", encoding='utf-8') as txt_out: 

149 

150 # iterate over all html files 

151 for file in os.listdir(data_folder): 

152 

153 if not file.endswith(".htm"): 

154 continue 

155 

156 with open(str(data_folder / file), "r", encoding='utf-8') as txt_in: 

157 text = txt_in.read() 

158 

159 # get rid of html syntax, we only need the text 

160 strings = text.split("<p> ") 

161 strings[0] = strings[0].split('<h1 id="header">')[1][:-7] 

162 

163 for i in range(1, len(strings) - 1): 

164 strings[i] = strings[i][:-7] 

165 

166 strings[-1] = strings[-1][:-23] 

167 

168 # between all documents we write a separator symbol 

169 txt_out.write('-DOCSTART-\n\n') 

170 

171 for string in strings: 

172 

173 # skip empty strings 

174 if not string: continue 

175 

176 # process the annotation format in the text and collect triples (begin_mention, length_mention, wikiname) 

177 indices = [] 

178 lengths = [] 

179 wikinames = [] 

180 

181 current_entity = string.find('[[') # each annotation starts with '[[' 

182 while current_entity != -1: 

183 wikiname = '' 

184 surface_form = '' 

185 j = current_entity + 2 

186 

187 while string[j] not in [']', '|']: 

188 wikiname += string[j] 

189 j += 1 

190 

191 if string[j] == ']': # entity mention ends, i.e. looks like this [[wikiname]] 

192 surface_form = wikiname # in this case entity mention = wiki-page name 

193 else: # string[j] == '|' 

194 j += 1 

195 while string[j] not in [']', '|']: 

196 surface_form += string[j] 

197 j += 1 

198 

# entity has a score, i.e. it looks like [[wikiname|surface_form|agreement_score]]
if string[j] == '|':

201 agreement_score = float(string[j + 1:j + 4]) 

202 j += 4 # points to first ']' of entity now 

203 if agreement_score < self.agreement_threshold: # discard entity 

204 string = string[:current_entity] + surface_form + string[j + 2:] 

205 current_entity = string.find('[[') 

206 continue 

207 

208 # replace [[wikiname|surface_form|score]] by surface_form and save index, length and wikiname of mention 

209 indices.append(current_entity) 

210 lengths.append(len(surface_form)) 

211 wikinames.append(wikiname[0].upper() + wikiname.replace(' ', '_')[1:]) 

212 

213 string = string[:current_entity] + surface_form + string[j + 2:] 

214 

215 current_entity = string.find('[[') 

216 

217 # sentence splitting and tokenization 

218 sentences = sentence_splitter.split(string) 

219 sentence_offsets = [sentence.start_pos for sentence in sentences] 

220 

221 # iterate through all annotations and add to corresponding tokens 

222 for mention_start, mention_length, wikiname in zip(indices, lengths, wikinames): 

223 

224 # find sentence to which annotation belongs 

225 sentence_index = 0 

226 for i in range(1, len(sentences)): 

227 if mention_start < sentence_offsets[i]: 

228 break 

229 else: 

230 sentence_index += 1 

231 

232 # position within corresponding sentence 

233 mention_start -= sentence_offsets[sentence_index] 

234 mention_end = mention_start + mention_length 

235 

236 # set annotation for tokens of entity mention 

237 first = True 

238 for token in sentences[sentence_index].tokens: 

239 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention 

240 if first: 

241 token.set_label(typename='nel', value='B-' + wikiname) 

242 first = False 

243 else: 

244 token.set_label(typename='nel', value='I-' + wikiname) 

245 

246 # write to out-file in column format 

247 for sentence in sentences: 

248 

249 for token in sentence.tokens: 

250 

251 labels = token.get_labels('nel') 

252 

253 if len(labels) == 0: # no entity 

254 txt_out.write(token.text + '\tO\n') 

255 

256 else: # annotation 

257 txt_out.write(token.text + '\t' + labels[0].value + '\n') 

258 

259 txt_out.write('\n') # empty line after each sentence 

260 

261 except: 

262 # in case something goes wrong, delete the dataset and raise error 

263 os.remove(parsed_dataset) 

264 raise 

265 

266 super(NEL_ENGLISH_AQUAINT, self).__init__( 

267 data_folder, 

268 train_file=corpus_file_name, 

269 in_memory=in_memory, 

270 **corpusargs, 

271 ) 

272 

273 
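# --- Usage sketch (added for illustration, not part of the original module) ---
# Loading AQUAINT with a stricter agreement filter than the 0.5 default and
# inspecting the 'nel' spans of the first few sentences; 0.8 is an arbitrary value.
def _example_aquaint():
    corpus = NEL_ENGLISH_AQUAINT(agreement_threshold=0.8)
    # annotations are stored as BIO tags under the 'nel' label type (see the parser above)
    for i in range(min(20, len(corpus.train))):
        for span in corpus.train[i].get_spans('nel'):
            print(span)
    return corpus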

274class NEL_GERMAN_HIPE(EntityLinkingCorpus): 

275 def __init__( 

276 self, 

277 base_path: Union[str, Path] = None, 

278 in_memory: bool = True, 

279 wiki_language: str = 'dewiki', 

280 **corpusargs 

281 ): 

282 """ 

283 Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German (see description 

284 of HIPE at https://impresso.github.io/CLEF-HIPE-2020/). This version was segmented by @stefan-it and is hosted 

285 at https://github.com/stefan-it/clef-hipe. 

286 If you call the constructor the first time the dataset gets automatically downloaded and transformed in 

287 tab-separated column format. 

288 

289 Parameters 

290 ---------- 

291 base_path : Union[str, Path], optional 

292 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

293 to point to a different folder but typically this should not be necessary. 

294 in_memory: If True, keeps dataset in memory giving speedups in training. 

295 wiki_language : specify the language of the names of the wikipedia pages, i.e. which language version of 

296 Wikipedia URLs to use. Since the text is in german the default language is German. 

297 """ 

298 self.wiki_language = wiki_language 

299 if type(base_path) == str: 

300 base_path: Path = Path(base_path) 

301 

302 # this dataset name 

303 dataset_name = self.__class__.__name__.lower() 

304 

305 # default dataset folder is the cache root 

306 if not base_path: 

307 base_path = flair.cache_root / "datasets" 

308 data_folder = base_path / dataset_name 

309 

310 dev_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/dev-v1.2/de/HIPE-data-v1.2-dev-de-normalized-manual-eos.tsv" 

311 test_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/test-v1.3/de/HIPE-data-v1.3-test-de-normalized-manual-eos.tsv" 

312 train_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/training-v1.2/de/HIPE-data-v1.2-train-de-normalized-manual-eos.tsv" 

313 train_file_name = wiki_language + "_train.tsv" 

314 parsed_dataset = data_folder / train_file_name 

315 

316 # download and parse data if necessary 

317 if not parsed_dataset.exists(): 

318 

319 # from qwikidata.linked_data_interface import get_entity_dict_from_api 

320 

321 original_train_path = cached_path(f"{train_raw_url}", Path("datasets") / dataset_name) 

322 original_test_path = cached_path(f"{test_raw_url}", Path("datasets") / dataset_name) 

323 original_dev_path = cached_path(f"{dev_raw_url}", Path("datasets") / dataset_name) 

324 

325 # generate qid wikiname dictionaries 

326 log.info('Get wikinames from wikidata...') 

327 train_dict = self._get_qid_wikiname_dict(path=original_train_path) 

328 test_dict = self._get_qid_wikiname_dict(original_test_path) 

329 dev_dict = self._get_qid_wikiname_dict(original_dev_path) 

330 log.info('...done!') 

331 

332 # merge dictionaries 

333 qid_wikiname_dict = {**train_dict, **test_dict, **dev_dict} 

334 

335 for doc_path, file_name in zip([original_train_path, original_test_path, original_dev_path], 

336 [train_file_name, wiki_language + '_test.tsv', wiki_language + '_dev.tsv']): 

337 with open(doc_path, 'r', encoding='utf-8') as read, open(data_folder / file_name, 'w', 

338 encoding='utf-8') as write: 

339 

340 # ignore first line 

341 read.readline() 

342 line = read.readline() 

343 last_eos = True 

344 

345 while line: 

346 # commented and empty lines 

347 if line[0] == '#' or line == '\n': 

348 if line[2:13] == 'document_id': # beginning of new document 

349 

350 if last_eos: 

351 write.write('-DOCSTART-\n\n') 

352 last_eos = False 

353 else: 

354 write.write('\n-DOCSTART-\n\n') 

355 

356 else: 

357 line_list = line.split('\t') 

358 if not line_list[7] in ['_', 'NIL']: # line has wikidata link 

359 

360 wikiname = qid_wikiname_dict[line_list[7]] 

361 

362 if wikiname != 'O': 

363 annotation = line_list[1][:2] + wikiname 

364 else: # no entry in chosen language 

365 annotation = 'O' 

366 

367 else: 

368 

369 annotation = 'O' 

370 

371 write.write(line_list[0] + '\t' + annotation + '\n') 

372 

373 if line_list[-1][-4:-1] == 'EOS': # end of sentence 

374 write.write('\n') 

375 last_eos = True 

376 else: 

377 last_eos = False 

378 

379 line = read.readline() 

380 

381 super(NEL_GERMAN_HIPE, self).__init__( 

382 data_folder, 

383 train_file=train_file_name, 

384 dev_file=wiki_language + '_dev.tsv', 

385 test_file=wiki_language + '_test.tsv', 

386 in_memory=in_memory, 

387 **corpusargs, 

388 ) 

389 

390 def _get_qid_wikiname_dict(self, path): 

391 

392 qid_set = set() 

393 with open(path, mode='r', encoding='utf-8') as read: 

394 # read all Q-IDs 

395 

396 # ignore first line 

397 read.readline() 

398 line = read.readline() 

399 

400 while line: 

401 

402 if not (line[0] == '#' or line == '\n'): # commented or empty lines 

403 line_list = line.split('\t') 

404 if not line_list[7] in ['_', 'NIL']: # line has wikidata link 

405 

406 qid_set.add(line_list[7]) 

407 

408 line = read.readline() 

409 

410 base_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&sitefilter=' + self.wiki_language + '&ids=' 

411 

412 qid_list = list(qid_set) 

413 ids = '' 

414 length = len(qid_list) 

415 qid_wikiname_dict = {} 

416 for i in range(length): 

# the Wikidata API limits the number of ids per request, so we send batches of 50
if (i + 1) % 50 == 0 or i == length - 1:

419 

420 ids += qid_list[i] 

421 # request 

422 response_json = requests.get(base_url + ids).json() 

423 

424 for qid in response_json['entities']: 

425 

426 try: 

wikiname = response_json['entities'][qid]['sitelinks'][self.wiki_language]['title'].replace(' ', '_')

429 except KeyError: # language not available for specific wikiitem 

430 wikiname = 'O' 

431 

432 qid_wikiname_dict[qid] = wikiname 

433 

434 ids = '' 

435 

436 else: 

437 ids += qid_list[i] 

438 ids += '|' 

439 

440 return qid_wikiname_dict 

441 

442 
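# --- Usage sketch (added for illustration, not part of the original module) ---
# The corpus text is historical German, but the Wikidata lookup above can resolve
# links against any language edition via the wiki_language site id. 'enwiki' is
# assumed to be a valid site id here; each choice is cached in its own tsv files.
def _example_hipe():
    corpus_de = NEL_GERMAN_HIPE()                        # default: German page titles ('dewiki')
    corpus_en = NEL_GERMAN_HIPE(wiki_language='enwiki')  # English page titles instead
    return corpus_de, corpus_en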

443class NEL_ENGLISH_AIDA(EntityLinkingCorpus): 

444 def __init__( 

445 self, 

446 base_path: Union[str, Path] = None, 

447 in_memory: bool = True, 

448 check_existence: bool = False, 

449 **corpusargs 

450 ): 

451 """ 

452 Initialize AIDA CoNLL-YAGO Entity Linking corpus introduced here https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads. 

453 License: https://creativecommons.org/licenses/by-sa/3.0/deed.en_US 

454 If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format. 

455 

456 Parameters 

457 ---------- 

458 base_path : Union[str, Path], optional 

459 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

460 to point to a different folder but typically this should not be necessary. 

461 in_memory: If True, keeps dataset in memory giving speedups in training. 

462 check_existence: If True the existence of the given wikipedia ids/pagenames is checked and non existent ids/names will be igrnored. 

463 """ 

464 if type(base_path) == str: 

465 base_path: Path = Path(base_path) 

466 

467 # this dataset name 

468 dataset_name = self.__class__.__name__.lower() 

469 

470 # default dataset folder is the cache root 

471 if not base_path: 

472 base_path = flair.cache_root / "datasets" 

473 data_folder = base_path / dataset_name 

474 

475 conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/" 

476 corpus_file_name = "train" 

477 parsed_dataset = data_folder / corpus_file_name 

478 

479 if not parsed_dataset.exists(): 

480 

481 import wikipediaapi 

482 

483 wiki_wiki = wikipediaapi.Wikipedia(language='en') 

484 

485 testa_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testa", Path("datasets") / dataset_name) 

486 testb_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testb", Path("datasets") / dataset_name) 

487 train_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_train", Path("datasets") / dataset_name) 

488 

489 # we use the wikiids in the data instead of directly utilizing the wikipedia urls. 

490 # like this we can quickly check if the corresponding page exists 

491 wikiid_wikiname_dict = self._get_wikiid_wikiname_dict(data_folder) 

492 

493 for name, path in zip(['train', 'testa', 'testb'], 

494 [train_unprocessed_path, testa_unprocessed_path, testb_unprocessed_path]): 

495 with open(data_folder / name, 'w', encoding='utf-8') as write, open(path, 'r', 

496 encoding='utf-8') as read: 

497 

498 for line in read: 

499 

500 line_list = line.split('\t') 

501 if len(line_list) <= 4: 

502 if line_list[0][:10] == '-DOCSTART-': # Docstart 

503 write.write('-DOCSTART-\n\n') 

504 elif line_list[0] == '\n': # empty line 

505 write.write('\n') 

506 else: # text without annotation or marked '--NME--' (no matching entity) 

507 if len(line_list) == 1: 

508 write.write(line_list[0][:-1] + '\tO\n') 

509 else: 

510 write.write(line_list[0] + '\tO\n') 

511 else: # line with annotation 

512 wikiname = wikiid_wikiname_dict[line_list[5].strip()] 

513 if wikiname != 'O': 

514 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n') 

515 else: 

516 # if there is a bad wikiid we can check if the given url in the data exists using wikipediaapi 

517 wikiname = line_list[4].split('/')[-1] 

518 if check_existence: 

519 page = wiki_wiki.page(wikiname) 

520 if page.exists(): 

521 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n') 

522 else: # neither the wikiid nor the url exist 

523 write.write(line_list[0] + '\tO\n') 

524 else: 

525 write.write(line_list[0] + '\t' + line_list[4] + '-' + wikiname + '\n') 

526 

527 # delete unprocessed file 

528 os.remove(path) 

529 

530 super(NEL_ENGLISH_AIDA, self).__init__( 

531 data_folder, 

532 train_file=corpus_file_name, 

533 dev_file='testa', 

534 test_file='testb', 

535 in_memory=in_memory, 

536 **corpusargs, 

537 ) 

538 

539 def _get_wikiid_wikiname_dict(self, base_folder): 

540 

541 # collect all wikiids 

542 wikiid_set = set() 

543 for data_file in ['aida_conll_testa', 'aida_conll_testb', 'aida_conll_train']: 

544 with open(base_folder / data_file, mode='r', encoding='utf-8') as read: 

545 line = read.readline() 

546 while line: 

547 row = line.split('\t') 

548 if len(row) > 4: # line has a wiki annotation 

549 wikiid_set.add(row[5].strip()) 

550 line = read.readline() 

551 

552 # create the dictionary 

553 wikiid_wikiname_dict = {} 

554 wikiid_list = list(wikiid_set) 

555 ids = '' 

556 length = len(wikiid_list) 

557 

558 for i in range(length): 

# the Wikimedia API limits the number of ids per request, so we send batches of 50
if (i + 1) % 50 == 0 or i == length - 1:

561 

562 ids += wikiid_list[i] 

563 # request 

564 resp = requests.get( 

565 'https://en.wikipedia.org/w/api.php', 

566 params={ 

567 'action': 'query', 

568 'prop': 'info', 

569 'pageids': ids, 

570 'format': 'json' 

571 } 

572 ).json() 

573 

574 for wikiid in resp['query']['pages']: 

575 try: 

576 wikiname = resp['query']['pages'][wikiid]['title'].replace(' ', '_') 

577 except KeyError: # bad wikiid 

578 wikiname = 'O' 

579 wikiid_wikiname_dict[wikiid] = wikiname 

580 ids = '' 

581 

582 else: 

583 ids += wikiid_list[i] 

584 ids += '|' 

585 

586 return wikiid_wikiname_dict 

587 

588 
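# --- Usage sketch (added for illustration, not part of the original module) ---
# AIDA ships with predefined splits, so train/dev/test are all available after
# construction. check_existence=True is slower because every fallback URL is
# verified through wikipediaapi during the one-time parsing step.
def _example_aida():
    corpus = NEL_ENGLISH_AIDA(check_existence=True)
    print(len(corpus.train), len(corpus.dev), len(corpus.test))
    return corpus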

589class NEL_ENGLISH_IITB(EntityLinkingCorpus): 

590 def __init__( 

591 self, 

592 base_path: Union[str, Path] = None, 

593 in_memory: bool = True, 

594 ignore_disagreements: bool = False, 

595 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), 

596 **corpusargs 

597 ): 

598 """ 

599 Initialize ITTB Entity Linking corpus introduced in "Collective Annotation of Wikipedia Entities in Web Text" Sayali Kulkarni, Amit Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti. 

600 If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format. 

601 

602 Parameters 

603 ---------- 

604 base_path : Union[str, Path], optional 

605 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

606 to point to a different folder but typically this should not be necessary. 

607 in_memory: If True, keeps dataset in memory giving speedups in training. 

608 ignore_disagreements: If True annotations with annotator disagreement will be ignored. 

609 """ 

610 if type(base_path) == str: 

611 base_path: Path = Path(base_path) 

612 

613 # this dataset name 

614 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__ 

615 

616 # default dataset folder is the cache root 

617 if not base_path: 

618 base_path = flair.cache_root / "datasets" 

619 data_folder = base_path / dataset_name 

620 

621 iitb_el_docs_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_crawledDocs.tar.gz" 

622 iitb_el_annotations_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_Annotations.xml" 

623 corpus_file_name = "iitb.txt" 

624 parsed_dataset = data_folder / corpus_file_name 

625 

626 label_type = 'nel' 

627 

628 if not parsed_dataset.exists(): 

629 

630 docs_zip_path = cached_path(f"{iitb_el_docs_path}", Path("datasets") / dataset_name) 

631 annotations_xml_path = cached_path(f"{iitb_el_annotations_path}", Path("datasets") / dataset_name) 

632 

633 unpack_file(docs_zip_path, data_folder, "tar", False) 

634 

635 import xml.etree.ElementTree as ET 

636 tree = ET.parse(annotations_xml_path) 

637 root = tree.getroot() 

638 

639 # names of raw text documents 

640 doc_names = set() 

641 for elem in root: 

642 doc_names.add(elem[0].text) 

643 

644 # open output_file 

645 with open(parsed_dataset, 'w', encoding='utf-8') as write: 

646 # iterate through all documents 

647 for doc_name in doc_names: 

648 with open(data_folder / 'crawledDocs' / doc_name, 'r', encoding='utf-8') as read: 

649 text = read.read() 

650 

651 # split sentences and tokenize 

652 sentences = sentence_splitter.split(text) 

653 sentence_offsets = [sentence.start_pos for sentence in sentences] 

654 

655 # iterate through all annotations and add to corresponding tokens 

656 for elem in root: 

657 

658 if elem[0].text == doc_name and elem[2].text: # annotation belongs to current document 

659 

660 wikiname = elem[2].text.replace(' ', '_') 

661 mention_start = int(elem[3].text) 

662 mention_length = int(elem[4].text) 

663 

664 # find sentence to which annotation belongs 

665 sentence_index = 0 

666 for i in range(1, len(sentences)): 

667 if mention_start < sentence_offsets[i]: 

668 break 

669 else: 

670 sentence_index += 1 

671 

672 # position within corresponding sentence 

673 mention_start -= sentence_offsets[sentence_index] 

674 mention_end = mention_start + mention_length 

675 

676 # set annotation for tokens of entity mention 

677 first = True 

678 for token in sentences[sentence_index].tokens: 

679 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention 

680 if first: 

681 token.set_label(typename=elem[1].text, value='B-' + wikiname) 

682 first = False 

683 else: 

684 token.set_label(typename=elem[1].text, value='I-' + wikiname) 

685 

686 # write to out file 

687 write.write('-DOCSTART-\n\n') # each file is one document 

688 

689 for sentence in sentences: 

690 

691 for token in sentence.tokens: 

692 

693 labels = token.labels 

694 

695 if len(labels) == 0: # no entity 

696 write.write(token.text + '\tO\n') 

697 

698 elif len(labels) == 1: # annotation from one annotator 

699 write.write(token.text + '\t' + labels[0].value + '\n') 

700 

701 else: # annotations from two annotators 

702 

703 if labels[0].value == labels[1].value: # annotators agree 

704 write.write(token.text + '\t' + labels[0].value + '\n') 

705 

706 else: # annotators disagree: ignore or arbitrarily take first annotation 

707 

708 if ignore_disagreements: 

709 write.write(token.text + '\tO\n') 

710 

711 else: 

712 write.write(token.text + '\t' + labels[0].value + '\n') 

713 

714 write.write('\n') # empty line after each sentence 

715 

716 super(NEL_ENGLISH_IITB, self).__init__( 

717 data_folder, 

718 train_file=corpus_file_name, 

719 in_memory=in_memory, 

720 **corpusargs, 

721 ) 

722 

723 
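# --- Usage sketch (added for illustration, not part of the original module) ---
# Some IITB mentions carry annotations from two annotators; ignore_disagreements
# decides how conflicts are written out. Note that the flag only matters the first
# time the corpus is parsed, because the tab-separated file is cached afterwards.
def _example_iitb():
    # drop conflicting mentions instead of arbitrarily keeping the first annotation
    corpus = NEL_ENGLISH_IITB(ignore_disagreements=True)
    return corpus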

724class NEL_ENGLISH_TWEEKI(EntityLinkingCorpus): 

725 def __init__( 

726 self, 

727 base_path: Union[str, Path] = None, 

728 in_memory: bool = True, 

729 **corpusargs, 

730 ): 

731 """ 

732 Initialize Tweeki Entity Linking corpus introduced in "Tweeki: Linking Named Entities on Twitter to a Knowledge Graph" Harandizadeh, Singh. 

733 The data consits of tweets with manually annotated wikipedia links. 

734 If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format. 

735 

736 Parameters 

737 ---------- 

738 base_path : Union[str, Path], optional 

739 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

740 to point to a different folder but typically this should not be necessary. 

741 in_memory: If True, keeps dataset in memory giving speedups in training. 

742 """ 

743 if type(base_path) == str: 

744 base_path: Path = Path(base_path) 

745 

746 # this dataset name 

747 dataset_name = self.__class__.__name__.lower() 

748 

749 # default dataset folder is the cache root 

750 if not base_path: 

751 base_path = flair.cache_root / "datasets" 

752 data_folder = base_path / dataset_name 

753 

754 tweeki_gold_el_path = "https://raw.githubusercontent.com/ucinlp/tweeki/main/data/Tweeki_gold/Tweeki_gold" 

755 corpus_file_name = "tweeki_gold.txt" 

756 parsed_dataset = data_folder / corpus_file_name 

757 

758 # download and parse data if necessary 

759 if not parsed_dataset.exists(): 

760 

761 original_file_path = cached_path(f"{tweeki_gold_el_path}", Path("datasets") / dataset_name) 

762 

763 with open(original_file_path, 'r', encoding='utf-8') as read, open(parsed_dataset, 'w', 

764 encoding='utf-8') as write: 

765 line = read.readline() 

766 while line: 

767 if line.startswith('#'): 

768 out_line = '' 

769 elif line == '\n': # tweet ends 

770 out_line = '\n-DOCSTART-\n\n' 

771 else: 

772 line_list = line.split('\t') 

773 out_line = line_list[1] + '\t' 

774 if line_list[3] == '-\n': # no wiki name 

775 out_line += 'O\n' 

776 else: 

777 out_line += line_list[2][:2] + line_list[3].split('|')[0].replace(' ', '_') + '\n' 

778 write.write(out_line) 

779 line = read.readline() 

780 

781 os.rename(original_file_path, str(original_file_path) + '_original') 

782 

783 super(NEL_ENGLISH_TWEEKI, self).__init__( 

784 data_folder, 

785 train_file=corpus_file_name, 

786 in_memory=in_memory, 

787 **corpusargs, 

788 ) 

789 

790 
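# --- Usage sketch (added for illustration, not part of the original module) ---
# Tweeki only ships a single gold file, so dev and test splits are typically
# sampled from the training data by the underlying Corpus class unless
# sample_missing_splits=False is passed through **corpusargs.
def _example_tweeki():
    corpus = NEL_ENGLISH_TWEEKI()
    print(corpus)
    return corpus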

791class NEL_ENGLISH_REDDIT(EntityLinkingCorpus): 

792 def __init__( 

793 self, 

794 base_path: Union[str, Path] = None, 

795 in_memory: bool = True, 

796 **corpusargs, 

797 ): 

798 """ 

799 Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format. 

800 The first time you call this constructor it will automatically download the dataset. 

801 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

802 to point to a different folder but typically this should not be necessary. 

803 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

804 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

805 """ 

806 if type(base_path) == str: 

807 base_path: Path = Path(base_path) 

808 

809 # this dataset name 

810 dataset_name = self.__class__.__name__.lower() 

811 

812 # default dataset folder is the cache root 

813 if not base_path: 

814 base_path = flair.cache_root / "datasets" 

815 data_folder = base_path / dataset_name 

816 

817 # download and parse data if necessary 

818 reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip" 

819 corpus_file_name = "reddit_el_gold.txt" 

820 parsed_dataset = data_folder / corpus_file_name 

821 

822 if not parsed_dataset.exists(): 

823 reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name) 

824 unpack_file(reddit_el_zip, data_folder, "zip", False) 

825 

826 with open(data_folder / corpus_file_name, "w", encoding='utf-8') as txtout: 

827 

828 # First parse the post titles 

829 with open(data_folder / "posts.tsv", "r", encoding='utf-8') as tsvin1, open( 

830 data_folder / "gold_post_annotations.tsv", "r", encoding='utf-8') as tsvin2: 

831 

832 posts = csv.reader(tsvin1, delimiter="\t") 

833 self.post_annotations = csv.reader(tsvin2, delimiter="\t") 

834 self.curr_annot = next(self.post_annotations) 

835 

836 for row in posts: # Go through all the post titles 

837 

838 txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token 

839 

840 # Keep track of how many and which entity mentions does a given post title have 

841 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention 

842 

843 # Check if the current post title has an entity link and parse accordingly 

844 if row[0] == self.curr_annot[0]: 

845 

846 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) 

847 link_annots = self._fill_annot_array(link_annots, row[0], post_flag=True) 

848 

849 # Post titles with entity mentions (if any) are handled via this function 

850 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout) 

851 else: 

852 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout) 

853 

854 # Then parse the comments 

855 with open(data_folder / "comments.tsv", "r", encoding='utf-8') as tsvin3, open( 

856 data_folder / "gold_comment_annotations.tsv", "r", encoding='utf-8') as tsvin4: 

857 

858 self.comments = csv.reader(tsvin3, delimiter="\t") 

859 self.comment_annotations = csv.reader(tsvin4, delimiter="\t") 

860 self.curr_annot = next(self.comment_annotations) 

861 self.curr_row = next(self.comments) 

862 self.stop_iter = False 

863 

864 # Iterate over the comments.tsv file, until the end is reached 

865 while not self.stop_iter: 

866 

867 txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token 

868 

869 # Keep track of the current comment thread and its corresponding key, on which the annotations are matched. 

870 # Each comment thread is handled as one 'document'. 

871 self.curr_comm = self.curr_row[4] 

872 comm_key = self.curr_row[0] 

873 

874 # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file. 

875 # This if-condition is needed to handle this problem. 

876 if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}: 

877 if comm_key == "en5rf4c": 

878 self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n")) 

879 self.curr_comm = next(self.parsed_row) 

880 self._fill_curr_comment(fix_flag=True) 

881 # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure 

882 else: 

883 self._fill_curr_comment(fix_flag=False) 

884 

885 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention 

886 

887 # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above 

888 if comm_key == self.curr_annot[0]: 

889 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) 

890 link_annots = self._fill_annot_array(link_annots, comm_key, post_flag=False) 

891 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout) 

892 else: 

893 # In two of the comment thread a case of capital letter spacing occurs, which the SegtokTokenizer cannot properly handle. 

894 # The following if-elif condition handles these two cases and as result writes full capitalized words in each corresponding row, 

895 # and not just single letters into single rows. 

896 if comm_key == "dv74ybb": 

897 self.curr_comm = " ".join( 

898 [word.replace(" ", "") for word in self.curr_comm.split(" ")]) 

899 elif comm_key == "eci2lut": 

self.curr_comm = (
    self.curr_comm[:18]
    + self.curr_comm[18:27].replace(" ", "")
    + self.curr_comm[27:55]
    + self.curr_comm[55:68].replace(" ", "")
    + self.curr_comm[68:85]
    + self.curr_comm[85:92].replace(" ", "")
    + self.curr_comm[92:]
)

908 

909 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout) 

910 

911 super(NEL_ENGLISH_REDDIT, self).__init__( 

912 data_folder, 

913 train_file=corpus_file_name, 

914 in_memory=in_memory, 

915 **corpusargs, 

916 ) 

917 

918 def _text_to_cols(self, sentence: Sentence, links: list, outfile): 

919 """ 

920 Convert a tokenized sentence into column format 

921 :param sentence: Flair Sentence object containing a tokenized post title or comment thread 

922 :param links: array containing information about the starting and ending position of an entity mention, as well 

923 as its corresponding wiki tag 

:param outfile: file to which the output is written

925 """ 

926 for i in range(0, len(sentence)): 

927 # If there are annotated entity mentions for given post title or a comment thread 

928 if links: 

929 # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence 

930 link_index = [j for j, v in enumerate(links) if 

931 (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])] 

932 # Write the token with a corresponding tag to file 

933 try: 

934 if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j, v in enumerate(links)): 

935 outfile.writelines(sentence[i].text + "\tS-" + links[link_index[0]][2] + "\n") 

936 elif any( 

937 sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j, v in enumerate(links)): 

938 outfile.writelines(sentence[i].text + "\tB-" + links[link_index[0]][2] + "\n") 

939 elif any( 

940 sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j, v in enumerate(links)): 

941 outfile.writelines(sentence[i].text + "\tI-" + links[link_index[0]][2] + "\n") 

942 else: 

943 outfile.writelines(sentence[i].text + "\tO\n") 

944 # IndexError is raised in cases when there is exactly one link in a sentence, therefore can be dismissed 

945 except IndexError: 

946 pass 

947 

948 # If a comment thread or a post title has no entity link, all tokens are assigned the O tag 

949 else: 

950 outfile.writelines(sentence[i].text + "\tO\n") 

951 

952 # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized 

953 # incorrectly, in order to keep the desired format (empty line as a sentence separator). 

954 try: 

955 if ((sentence[i].text in {".", "!", "?", "!*"}) and 

956 (sentence[i + 1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and 

957 ("." not in sentence[i - 1].text)): 

958 outfile.writelines("\n") 

959 except IndexError: 

960 # Thrown when the second check above happens, but the last token of a sentence is reached. 

# Indicates that the EOS punctuation mark is present, therefore an empty line needs to be written below.

962 outfile.writelines("\n") 

963 

964 # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS 

965 if sentence[-1].text not in {".", "!", "?"}: 

966 outfile.writelines("\n") 

967 

968 def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: 

969 """ 

970 Fills the array containing information about the entity mention annotations, used in the _text_to_cols method 

971 :param annot_array: array to be filled 

972 :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation 

973 :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) 

974 or comment threads (=False) 

975 """ 

976 next_annot = None 

977 while True: 

978 # Check if further annotations belong to the current post title or comment thread as well 

979 try: 

980 next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations) 

981 if next_annot[0] == key: 

982 annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3])) 

983 else: 

984 self.curr_annot = next_annot 

985 break 

986 # Stop when the end of an annotation file is reached 

987 except StopIteration: 

988 break 

989 return annot_array 

990 

991 def _fill_curr_comment(self, fix_flag: bool): 

992 """ 

993 Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the 

994 comments are parsed. 

995 :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) 

996 or regular rows (=False) 

997 """ 

998 next_row = None 

999 while True: 

1000 # Check if further annotations belong to the current sentence as well 

1001 try: 

1002 next_row = next(self.comments) if not fix_flag else next(self.parsed_row) 

1003 if len(next_row) < 2: 

1004 # 'else " "' is needed to keep the proper token positions (for accordance with annotations) 

1005 self.curr_comm += next_row[0] if any(next_row) else " " 

1006 else: 

1007 self.curr_row = next_row 

1008 break 

1009 except StopIteration: # When the end of the comments.tsv file is reached 

1010 self.curr_row = next_row 

1011 self.stop_iter = True if not fix_flag else False 

1012 break 

1013 

1014 
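# --- Usage sketch (added for illustration, not part of the original module) ---
# _text_to_cols above writes S-/B-/I- prefixes, so gold links can be single- or
# multi-token spans. Counting the gold mentions might look like this:
def _example_reddit():
    corpus = NEL_ENGLISH_REDDIT()
    entity_dict = corpus.make_entity_dict(threshold=1)
    print(f"{corpus.total_number_of_entity_mentions} gold mentions, {corpus.number_of_entities} distinct pages")
    return entity_dict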

1015def from_ufsac_to_tsv(xml_file: Union[str, Path], conll_file: Union[str, Path], datasetname: str, 

1016 encoding: str = "utf8", 

1017 cut_multisense: bool = True): 

1018 """ 

Function that converts a file in the UFSAC xml format into tab-separated column format, written to a new file.

1020 Parameters 

1021 ---------- 

1022 xml_file : Union[str, Path] 

1023 Path to the xml file. 

1024 conll_file : Union[str, Path] 

1025 Path for the new conll file. 

1026 datasetname: str 

1027 Name of the dataset from UFSAC, needed because of different handling of multi-word-spans in the datasets 

1028 encoding : str, optional 

1029 Encoding used in open function. The default is "utf8". 

1030 cut_multisense : bool, optional 

1031 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. 

1032 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected 

1033 as one new sense. The default is True. 

1034 

1035 """ 

1036 

1037 def make_line(word, begin_or_inside, attributes): 

1038 """ 

1039 Function that creates an output line from a word. 

1040 Parameters 

1041 ---------- 

1042 word : 

1043 String of the actual word. 

1044 begin_or_inside: 

1045 Either 'B-' or 'I-' 

1046 attributes: 

1047 List of attributes of the word (pos, lemma, wn30_key) 

1048 """ 

1049 line = word 

1050 if cut_multisense == True: 

1051 attributes[-1] = attributes[-1].split(';')[0] # take only first sense 

1052 

1053 for attrib in attributes: 

1054 if attrib != 'O': 

1055 line = line + '\t' + begin_or_inside + attrib 

1056 else: 

1057 line = line + '\tO' 

1058 line += '\n' 

1059 

1060 return line 

1061 

def split_span(word_fields: List[str], datasetname: str):

1063 """ 

1064 Function that splits a word if necessary, i.e. if it is a multiple-word-span. 

1065 Parameters 

1066 ---------- 

1067 word_fields : 

1068 list ['surface_form', 'lemma', 'pos', 'wn30_key'] of a word 

1069 datasetname: 

1070 name of corresponding dataset 

1071 """ 

1072 

1073 span = word_fields[0] 

1074 

1075 if datasetname in ['trainomatic', 'masc']: # splitting not sensible for these datasets 

1076 return [span] 

1077 elif datasetname == 'omsti': 

# has an annotation and does not consist only of '_' (still not 100% clean)
if word_fields[3] != 'O' and span != '_' and '__' not in span:

1080 return span.split('_') 

1081 else: 

1082 return [span] 

1083 else: # for all other datasets splitting at '_' is always sensible 

1084 return span.split('_') 

1085 

1086 txt_out = open(file=conll_file, mode='w', encoding=encoding) 

1087 import xml.etree.ElementTree as ET 

1088 tree = ET.parse(xml_file) 

1089 corpus = tree.getroot() 

1090 

1091 number_of_docs = len(corpus.findall('document')) 

1092 

1093 fields = ['surface_form', 'lemma', 'pos', 'wn30_key'] 

1094 for document in corpus: 

1095 # Docstart 

1096 if number_of_docs > 1: 

1097 txt_out.write('-DOCSTART-\n\n') 

1098 

1099 for paragraph in document: 

1100 

1101 for sentence in paragraph: 

1102 

1103 for word in sentence: 

1104 

1105 dictionary = word.attrib 

1106 fields_of_word = [word.attrib[field] if (field in dictionary) else 'O' for field in fields] 

1107 

1108 chunks = split_span(fields_of_word, datasetname) 

1109 

1110 txt_out.write(make_line(chunks[0], 'B-', fields_of_word[1:])) 

1111 

1112 # if there is more than one word in the chunk we write each in a separate line 

1113 for chunk in chunks[1:]: 

1114 # print(chunks) 

1115 txt_out.write(make_line(chunk, 'I-', fields_of_word[1:])) 

1116 

1117 # empty line after each sentence 

1118 txt_out.write('\n') 

1119 

1120 txt_out.close() 

1121 

1122 
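# --- Usage sketch (added for illustration, not part of the original module) ---
# Converting a single UFSAC xml file by hand. The paths below are hypothetical;
# normally determine_tsv_file (defined next) calls this function with the files
# that ship in the downloaded UFSAC archive.
def _example_ufsac_conversion():
    from_ufsac_to_tsv(
        xml_file=Path('/tmp/ufsac/original_data/semcor.xml'),  # hypothetical input path
        conll_file=Path('/tmp/ufsac/semcor_cut.tsv'),          # hypothetical output path
        datasetname='semcor',
        cut_multisense=True,  # keep only the first wn30_key if several senses are listed
    )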

def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = True):
    """
    Checks whether the converted .tsv file already exists and, if not, creates it. Returns the name of the file.

    Parameters
    ----------
    filename : str
        Name of the UFSAC dataset (without extension).
    data_folder : Path
        Folder in which the converted CoNLL file should reside.
    cut_multisense : bool, optional
        Determines whether the wn30_key tag should be cut if it contains multiple possible senses.
        If True, only the first listed sense will be used. Otherwise the whole list of senses will be treated
        as one new sense. The default is True.
    """

1136 

1137 if cut_multisense is True and filename not in ['semeval2007task17', 'trainomatic', 

1138 'wngt']: # these three datasets do not have multiple senses 

1139 

1140 conll_file_name = filename + '_cut.tsv' 

1141 

1142 else: 

1143 

1144 conll_file_name = filename + '.tsv' 

1145 

1146 path_to_conll_file = data_folder / conll_file_name 

1147 

1148 if not path_to_conll_file.exists(): 

1149 # convert the file to CoNLL 

1150 

1151 from_ufsac_to_tsv(xml_file=Path(data_folder / 'original_data' / (filename + '.xml')), 

1152 conll_file=Path(data_folder / conll_file_name), 

1153 datasetname=filename, 

1154 cut_multisense=cut_multisense) 

1155 

1156 return conll_file_name 

1157 

1158 
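# --- Usage sketch (added for illustration, not part of the original module) ---
# determine_tsv_file is idempotent: it converts on the first call and afterwards
# just returns the file name. The folder below is the default cache location used
# by the WSD corpora further down and must already contain the downloaded
# 'original_data' folder with the UFSAC xml files.
def _example_determine_tsv_file():
    data_folder = flair.cache_root / "datasets" / "wsd_ufsac"
    name = determine_tsv_file(filename='semcor', data_folder=data_folder, cut_multisense=True)
    print(name)  # 'semcor_cut.tsv', because multisense tags are cut for this dataset
    return name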

1159class WSD_UFSAC(MultiCorpus): 

1160 def __init__( 

1161 self, 

1162 filenames: Union[str, List[str]] = ['masc', 'semcor'], 

1163 base_path: Union[str, Path] = None, 

1164 in_memory: bool = True, 

1165 cut_multisense: bool = True, 

1166 columns={0: "text", 3: "wn30_key"}, 

1167 tag_to_bioes=None, 

1168 banned_sentences: List[str] = None, 

1169 sample_missing_splits_in_multicorpus: bool = True, 

1170 sample_missing_splits_in_each_corpus: bool = True, 

1171 use_raganato_ALL_as_test_data: bool = False, 

1172 name: str = 'multicorpus' 

1173 ): 

1174 """ 

Initialize a custom corpus with any Word Sense Disambiguation (WSD) datasets in the UFSAC format from https://github.com/getalp/UFSAC.
When the constructor is called for the first time, the data is automatically downloaded and transformed from xml into tab-separated column format.
Since only the WordNet 3.0 sense inventory is consistently available for all provided datasets, we only consider this version.
We also ignore the id annotations used in datasets that were originally created for evaluation tasks.
:param filenames: Here you can pass a single dataset name or a list of dataset names. The available names are:

1180 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', 

1181 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', 

1182 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'. 

For example, you can pass filenames = ['masc', 'omsti', 'wngt']. By default, the two mid-sized datasets 'masc' and 'semcor' are loaded.

1184 :param base_path: You can override this to point to a specific folder but typically this should not be necessary. 

1185 :param in_memory: If True, keeps dataset in memory giving speedups in training. 


1187 :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains 

1188 multiple possible senses. If True only the first listed sense will be used and the 

1189 suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of 

1190 senses will be detected as one new sense. The default is True. 

1191 :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "wn30_key"} 

1192 if you want to use additional pos and/or lemma for the words. 

1193 :param tag_to_bioes: whether to convert to BIOES tagging scheme 

1194 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true 

1195 :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if 

1196 sample_missing_splits_in_each_corpus is True) 

1197 :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. 

:param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al., "Word Sense Disambiguation: A Unified Evaluation Framework and Empirical Comparison")
    will be used as test data. Note that the sample_missing_splits parameters, if set to True, are changed to 'only_dev' in this case.
:param name: Name of your (custom) corpus

1201 """ 

1202 if type(base_path) == str: 

1203 base_path: Path = Path(base_path) 

1204 

1205 # this dataset name 

1206 dataset_name = self.__class__.__name__.lower() 

1207 

1208 # default dataset folder is the cache root 

1209 if not base_path: 

1210 base_path = flair.cache_root / "datasets" 

1211 data_folder = base_path / dataset_name 

1212 original_data_folder = data_folder / 'original_data' 

1213 

1214 # check if data there, if not, download the data 

1215 if not original_data_folder.exists(): 

1216 # create folder 

1217 data_folder.mkdir(parents=True) 

1218 

1219 # download data 

1220 import gdown 

1221 

1222 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1223 

1224 output = data_folder / (dataset_name + '.tar') 

1225 

1226 gdown.download(url, str(output), quiet=False) 

1227 

1228 output = data_folder / (dataset_name + '.tar') 

1229 unpack_file(file=output, 

1230 unpack_to=data_folder, 

1231 mode='tar', keep=False) 

1232 

1233 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1234 

1235 # transform data into column format if necessary 

1236 

1237 # if no filenames are specified we use all the data 

1238 if not filenames: 

1239 filenames = [name[:-4] for name in os.listdir(original_data_folder) if not 'raganato' in name] 

1240 

1241 if type(filenames) == str: 

1242 filenames = [filenames] 

1243 

1244 corpora = [] 

1245 

1246 print('Transforming data into column format and creating corpora...') 

1247 

1248 if use_raganato_ALL_as_test_data: 

1249 # in this case no test data should be generated by sampling from train data. But if the sample arguments are set to true, the dev set will be sampled 

1250 if sample_missing_splits_in_each_corpus: 

1251 sample_missing_splits_in_each_corpus = 'only_dev' 

1252 if sample_missing_splits_in_multicorpus: 

1253 sample_missing_splits_in_multicorpus = 'only_dev' 

1254 

1255 # also we remove 'raganato_ALL' from filenames in case its in the list 

1256 if 'raganato_ALL' in filenames: 

1257 filenames.remove('raganato_ALL') 

1258 

1259 # generate the test file 

1260 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1261 cut_multisense=cut_multisense) 

1262 

1263 corpus = ColumnCorpus(data_folder=data_folder, 

1264 column_format=columns, 

1265 test_file=test_file, # corpus only has test data 

1266 in_memory=in_memory, 

1267 tag_to_bioes=tag_to_bioes, 

1268 column_delimiter='\t', 

1269 document_separator_token='-DOCSTART-', 

1270 banned_sentences=banned_sentences, 

1271 autofind_splits=False, 

1272 sample_missing_splits=sample_missing_splits_in_each_corpus, 

1273 ) 

1274 corpora.append(corpus) 

1275 

1276 for filename in filenames: 

1277 # make column file and save to data_folder 

1278 

1279 new_filename = determine_tsv_file(filename=filename, data_folder=data_folder, cut_multisense=cut_multisense) 

1280 

1281 corpus = ColumnCorpus(data_folder=data_folder, 

1282 column_format=columns, 

1283 train_file=new_filename, 

1284 in_memory=in_memory, 

1285 tag_to_bioes=tag_to_bioes, 

1286 column_delimiter='\t', 

1287 document_separator_token='-DOCSTART-', 

1288 banned_sentences=banned_sentences, 

1289 autofind_splits=False, 

1290 sample_missing_splits=sample_missing_splits_in_each_corpus, 

1291 ) 

1292 corpora.append(corpus) 

1293 print('...done!') 

1294 

1295 super(WSD_UFSAC, self).__init__( 

1296 corpora, 

1297 sample_missing_splits=sample_missing_splits_in_multicorpus, 

1298 name=name 

1299 ) 

1300 

1301 
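# --- Usage sketch (added for illustration, not part of the original module) ---
# Building a WSD multi-corpus from two UFSAC datasets while holding out
# raganato_ALL as shared test data. The chosen filenames are just an example;
# see the docstring above for the full list of available names.
def _example_wsd_ufsac():
    corpus = WSD_UFSAC(
        filenames=['masc', 'wngt'],
        use_raganato_ALL_as_test_data=True,  # dev is still sampled, test comes from raganato_ALL
        cut_multisense=True,
    )
    print(corpus)
    return corpus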

1302class WSD_RAGANATO_ALL(EntityLinkingCorpus): 

1303 def __init__( 

1304 self, 

1305 base_path: Union[str, Path] = None, 

1306 in_memory: bool = True, 

1307 columns={0: "text", 3: "wn30_key"}, 

1308 tag_to_bioes=None, 

1309 label_name_map: Dict[str, str] = None, 

1310 banned_sentences: List[str] = None, 

1311 sample_missing_splits: bool = True, 

1312 cut_multisense: bool = True 

1313 ): 

1314 """ 

Initialize raganato_ALL (the concatenation of all SensEval/SemEval all-words tasks) as provided in UFSAC (https://github.com/getalp/UFSAC).
When the corpus is initialized for the first time, the whole UFSAC data is downloaded.

1317 """ 

1318 if type(base_path) == str: 

1319 base_path: Path = Path(base_path) 

1320 

1321 dataset_name = 'wsd_ufsac' 

1322 

1323 # default dataset folder is the cache root 

1324 if not base_path: 

1325 base_path = flair.cache_root / "datasets" 

1326 data_folder = base_path / dataset_name 

1327 original_data_folder = data_folder / 'original_data' 

1328 

# We check whether the UFSAC data has already been downloaded. If not, we download it.
# Note that this downloads more datasets than just raganato_ALL, but the download is only around 190 MB (around 4.5 GB unpacked).

1331 if not original_data_folder.exists(): 

1332 # create folder 

1333 data_folder.mkdir(parents=True) 

1334 

1335 # download data 

1336 import gdown 

1337 

1338 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1339 

1340 output = data_folder / (dataset_name + '.tar') 

1341 

1342 gdown.download(url, str(output), quiet=False) 

1343 

1344 output = data_folder / (dataset_name + '.tar') 

1345 unpack_file(file=output, 

1346 unpack_to=data_folder, 

1347 mode='tar', keep=False) 

1348 

1349 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1350 

1351 train_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=cut_multisense) 

1352 

1353 super(WSD_RAGANATO_ALL, self).__init__( 

1354 data_folder=data_folder, 

1355 columns=columns, 

1356 train_file=train_file, 

1357 in_memory=in_memory, 

1358 document_separator_token='-DOCSTART-', 

1359 column_delimiter='\t', 

1360 autofind_splits=False, 

1361 tag_to_bioes=tag_to_bioes, 

1362 label_name_map=label_name_map, 

1363 banned_sentences=banned_sentences, 

1364 sample_missing_splits=sample_missing_splits, 

1365 ) 

1366 

1367 
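# --- Usage sketch (added for illustration, not part of the original module) ---
# Using raganato_ALL on its own, e.g. as a standalone evaluation corpus for a
# tagger trained elsewhere; sampling of missing splits can be disabled entirely.
def _example_wsd_raganato_all():
    corpus = WSD_RAGANATO_ALL(sample_missing_splits=False)
    print(corpus)
    return corpus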

1368class WSD_SEMCOR(EntityLinkingCorpus): 

1369 def __init__( 

1370 self, 

1371 base_path: Union[str, Path] = None, 

1372 in_memory: bool = True, 

1373 columns={0: "text", 3: "wn30_key"}, 

1374 tag_to_bioes=None, 

1375 label_name_map: Dict[str, str] = None, 

1376 banned_sentences: List[str] = None, 

1377 sample_missing_splits: bool = True, 

1378 cut_multisense: bool = True, 

1379 use_raganato_ALL_as_test_data: bool = False, 

1380 ): 

1381 """ 

Initialize SemCor as provided in UFSAC (https://github.com/getalp/UFSAC).
When the corpus is initialized for the first time, the whole UFSAC data is downloaded.

1384 """ 

1385 if type(base_path) == str: 

1386 base_path: Path = Path(base_path) 

1387 

1388 dataset_name = 'wsd_ufsac' 

1389 

1390 # default dataset folder is the cache root 

1391 if not base_path: 

1392 base_path = flair.cache_root / "datasets" 

1393 data_folder = base_path / dataset_name 

1394 original_data_folder = data_folder / 'original_data' 

1395 

# We check whether the UFSAC data has already been downloaded. If not, we download it.
# Note that this downloads more datasets than just SemCor, but the download is only around 190 MB (around 4.5 GB unpacked).

1398 if not original_data_folder.exists(): 

1399 # create folder 

1400 data_folder.mkdir(parents=True) 

1401 

1402 # download data 

1403 import gdown 

1404 

1405 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1406 

1407 output = data_folder / (dataset_name + '.tar') 

1408 

1409 gdown.download(url, str(output), quiet=False) 

1410 

1411 output = data_folder / (dataset_name + '.tar') 

1412 unpack_file(file=output, 

1413 unpack_to=data_folder, 

1414 mode='tar', keep=False) 

1415 

1416 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1417 

1418 if use_raganato_ALL_as_test_data: 

1419 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled. 

1420 if sample_missing_splits: 

1421 sample_missing_splits = 'only_dev' 

1422 

1423 # generate the test file 

1424 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1425 cut_multisense=cut_multisense) 

1426 else: 

1427 test_file = None 

1428 

1429 train_file = determine_tsv_file(filename='semcor', data_folder=data_folder, cut_multisense=cut_multisense) 

1430 

1431 super(WSD_SEMCOR, self).__init__( 

1432 data_folder=data_folder, 

1433 columns=columns, 

1434 train_file=train_file, 

1435 test_file=test_file, 

1436 in_memory=in_memory, 

1437 document_separator_token='-DOCSTART-', 

1438 column_delimiter='\t', 

1439 autofind_splits=False, 

1440 tag_to_bioes=tag_to_bioes, 

1441 label_name_map=label_name_map, 

1442 banned_sentences=banned_sentences, 

1443 sample_missing_splits=sample_missing_splits, 

1444 ) 

1445 

1446 
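As the branch above shows, use_raganato_ALL_as_test_data=True turns the Raganato ALL file into the test split and, when sample_missing_splits is left truthy, restricts sampling to the dev split only. A hedged sketch of that configuration (split sizes are whatever the data yields, not claimed values):

from flair.datasets.entity_linking import WSD_SEMCOR

# SemCor as training data, the Raganato ALL concatenation as the test split;
# only a dev split is sampled from the SemCor training data.
corpus = WSD_SEMCOR(use_raganato_ALL_as_test_data=True)

print(len(corpus.train), "train sentences (SemCor)")
print(len(corpus.dev), "dev sentences (sampled from SemCor)")
print(len(corpus.test), "test sentences (Raganato ALL)")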

1447class WSD_WORDNET_GLOSS_TAGGED(EntityLinkingCorpus): 

1448 def __init__( 

1449 self, 

1450 base_path: Union[str, Path] = None, 

1451 in_memory: bool = True, 

1452 columns={0: "text", 3: "wn30_key"}, 

1453 tag_to_bioes=None, 

1454 label_name_map: Dict[str, str] = None, 

1455 banned_sentences: List[str] = None, 

1456 sample_missing_splits: bool = True, 

1457 use_raganato_ALL_as_test_data: bool = False, 

1458 ): 

1459 """ 

1460        Initialize the Princeton WordNet Gloss Corpus, as provided in UFSAC (https://github.com/getalp/UFSAC). 

1461        When the corpus is initialized for the first time, the complete UFSAC data is downloaded. 

1462 """ 

1463 if type(base_path) == str: 

1464 base_path: Path = Path(base_path) 

1465 

1466 dataset_name = 'wsd_ufsac' 

1467 

1468 # default dataset folder is the cache root 

1469 if not base_path: 

1470 base_path = flair.cache_root / "datasets" 

1471 data_folder = base_path / dataset_name 

1472 original_data_folder = data_folder / 'original_data' 

1473 

1474        # We check whether the UFSAC data has already been downloaded. If not, we download it. 

1475        # Note that this downloads more datasets than just WordNet Gloss Tagged, but the download is only around 190 MB (around 4.5 GB unpacked). 

1476 if not original_data_folder.exists(): 

1477 # create folder 

1478 data_folder.mkdir(parents=True) 

1479 

1480 # download data 

1481 import gdown 

1482 

1483 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1484 

1485 output = data_folder / (dataset_name + '.tar') 

1486 

1487 gdown.download(url, str(output), quiet=False) 

1488 

1489 output = data_folder / (dataset_name + '.tar') 

1490 unpack_file(file=output, 

1491 unpack_to=data_folder, 

1492 mode='tar', keep=False) 

1493 

1494 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1495 

1496 if use_raganato_ALL_as_test_data: 

1497            # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split will be sampled 

1498 if sample_missing_splits: 

1499 sample_missing_splits = 'only_dev' 

1500 

1501 # generate the test file 

1502 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True) 

1503 else: 

1504 test_file = None 

1505 

1506 train_file = determine_tsv_file(filename='wngt', data_folder=data_folder, 

1507 cut_multisense=False) # does not have multisense! 

1508 

1509 super(WSD_WORDNET_GLOSS_TAGGED, self).__init__( 

1510 data_folder=data_folder, 

1511 columns=columns, 

1512 train_file=train_file, 

1513 test_file=test_file, 

1514 in_memory=in_memory, 

1515 document_separator_token='-DOCSTART-', 

1516 column_delimiter='\t', 

1517 autofind_splits=False, 

1518 tag_to_bioes=tag_to_bioes, 

1519 label_name_map=label_name_map, 

1520 banned_sentences=banned_sentences, 

1521 sample_missing_splits=sample_missing_splits, 

1522 ) 

1523 

1524 
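Since the UFSAC-derived corpora in this module share the same column layout and the 'wn30_key' label type, they can be concatenated for training. A sketch using flair's MultiCorpus; both constructors below read from the same wsd_ufsac cache folder, so the archive is downloaded at most once:

from flair.data import MultiCorpus
from flair.datasets.entity_linking import WSD_SEMCOR, WSD_WORDNET_GLOSS_TAGGED

semcor = WSD_SEMCOR()
wordnet_glosses = WSD_WORDNET_GLOSS_TAGGED()

# One corpus with the usual train/dev/test interface over both datasets.
combined = MultiCorpus([semcor, wordnet_glosses])
print(combined)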

1525class WSD_MASC(EntityLinkingCorpus): 

1526 def __init__( 

1527 self, 

1528 base_path: Union[str, Path] = None, 

1529 in_memory: bool = True, 

1530 columns={0: "text", 3: "wn30_key"}, 

1531 tag_to_bioes=None, 

1532 label_name_map: Dict[str, str] = None, 

1533 banned_sentences: List[str] = None, 

1534 sample_missing_splits: bool = True, 

1535 cut_multisense: bool = True, 

1536 use_raganato_ALL_as_test_data: bool = False, 

1537 ): 

1538 """ 

1539        Initialize MASC (Manually Annotated Sub-Corpus), as provided in UFSAC (https://github.com/getalp/UFSAC). 

1540        When the corpus is initialized for the first time, the complete UFSAC data is downloaded. 

1541 """ 

1542 if type(base_path) == str: 

1543 base_path: Path = Path(base_path) 

1544 

1545 dataset_name = 'wsd_ufsac' 

1546 

1547 # default dataset folder is the cache root 

1548 if not base_path: 

1549 base_path = flair.cache_root / "datasets" 

1550 data_folder = base_path / dataset_name 

1551 original_data_folder = data_folder / 'original_data' 

1552 

1553        # We check whether the UFSAC data has already been downloaded. If not, we download it. 

1554        # Note that this downloads more datasets than just MASC, but the download is only around 190 MB (around 4.5 GB unpacked). 

1555 if not original_data_folder.exists(): 

1556 # create folder 

1557 data_folder.mkdir(parents=True) 

1558 

1559 # download data 

1560 import gdown 

1561 

1562 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1563 

1564 output = data_folder / (dataset_name + '.tar') 

1565 

1566 gdown.download(url, str(output), quiet=False) 

1567 

1568 output = data_folder / (dataset_name + '.tar') 

1569 unpack_file(file=output, 

1570 unpack_to=data_folder, 

1571 mode='tar', keep=False) 

1572 

1573 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1574 

1575 if use_raganato_ALL_as_test_data: 

1576 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled. 

1577 if sample_missing_splits: 

1578 sample_missing_splits = 'only_dev' 

1579 

1580 # generate the test file 

1581 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1582 cut_multisense=cut_multisense) 

1583 else: 

1584 test_file = None 

1585 

1586 train_file = determine_tsv_file(filename='masc', data_folder=data_folder, cut_multisense=cut_multisense) 

1587 

1588 super(WSD_MASC, self).__init__( 

1589 data_folder=data_folder, 

1590 columns=columns, 

1591 train_file=train_file, 

1592 test_file=test_file, 

1593 in_memory=in_memory, 

1594 document_separator_token='-DOCSTART-', 

1595 column_delimiter='\t', 

1596 autofind_splits=False, 

1597 tag_to_bioes=tag_to_bioes, 

1598 label_name_map=label_name_map, 

1599 banned_sentences=banned_sentences, 

1600 sample_missing_splits=sample_missing_splits, 

1601 ) 

1602 

1603 
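If MASC is wanted purely as extra training material, split sampling can be switched off entirely, since (per the comments above) dev and test splits are only sampled when sample_missing_splits is truthy. A minimal sketch:

from flair.datasets.entity_linking import WSD_MASC

# Keep every MASC sentence in the training split; no dev or test data is sampled.
masc = WSD_MASC(sample_missing_splits=False)
print(len(masc.train), "MASC training sentences")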

1604class WSD_OMSTI(EntityLinkingCorpus): 

1605 def __init__( 

1606 self, 

1607 base_path: Union[str, Path] = None, 

1608 in_memory: bool = True, 

1609 columns={0: "text", 3: "wn30_key"}, 

1610 tag_to_bioes=None, 

1611 label_name_map: Dict[str, str] = None, 

1612 banned_sentences: List[str] = None, 

1613 sample_missing_splits: bool = True, 

1614 cut_multisense: bool = True, 

1615 use_raganato_ALL_as_test_data: bool = False, 

1616 ): 

1617 """ 

1618        Initialize OMSTI (One Million Sense-Tagged Instances), as provided in UFSAC (https://github.com/getalp/UFSAC). 

1619        When the corpus is initialized for the first time, the complete UFSAC data is downloaded. 

1620 """ 

1621 if type(base_path) == str: 

1622 base_path: Path = Path(base_path) 

1623 

1624 dataset_name = 'wsd_ufsac' 

1625 

1626 # default dataset folder is the cache root 

1627 if not base_path: 

1628 base_path = flair.cache_root / "datasets" 

1629 data_folder = base_path / dataset_name 

1630 original_data_folder = data_folder / 'original_data' 

1631 

1632        # We check whether the UFSAC data has already been downloaded. If not, we download it. 

1633        # Note that this downloads more datasets than just OMSTI, but the download is only around 190 MB (around 4.5 GB unpacked). 

1634 if not original_data_folder.exists(): 

1635 # create folder 

1636 data_folder.mkdir(parents=True) 

1637 

1638 # download data 

1639 import gdown 

1640 

1641 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1642 

1643 output = data_folder / (dataset_name + '.tar') 

1644 

1645 gdown.download(url, str(output), quiet=False) 

1646 

1647 output = data_folder / (dataset_name + '.tar') 

1648 unpack_file(file=output, 

1649 unpack_to=data_folder, 

1650 mode='tar', keep=False) 

1651 

1652 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1653 

1654 if use_raganato_ALL_as_test_data: 

1655            # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split will be sampled 

1656 if sample_missing_splits: 

1657 sample_missing_splits = 'only_dev' 

1658 

1659 # generate the test file 

1660 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1661 cut_multisense=cut_multisense) 

1662 else: 

1663 test_file = None 

1664 

1665 train_file = determine_tsv_file(filename='omsti', data_folder=data_folder, cut_multisense=cut_multisense) 

1666 

1667 super(WSD_OMSTI, self).__init__( 

1668 data_folder=data_folder, 

1669 columns=columns, 

1670 train_file=train_file, 

1671 test_file=test_file, 

1672 in_memory=in_memory, 

1673 document_separator_token='-DOCSTART-', 

1674 column_delimiter='\t', 

1675 autofind_splits=False, 

1676 tag_to_bioes=tag_to_bioes, 

1677 label_name_map=label_name_map, 

1678 banned_sentences=banned_sentences, 

1679 sample_missing_splits=sample_missing_splits, 

1680 ) 

1681 

1682 
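The cut_multisense flag is forwarded to determine_tsv_file when the OMSTI TSV is generated: with it enabled, tokens carrying several candidate sense keys are reduced to a single key. A hedged sketch; the key format and ';' separator in the comment are illustrative assumptions, not guaranteed details of the converter:

from flair.datasets.entity_linking import WSD_OMSTI

# With cut_multisense=True (the default), an ambiguous annotation such as
#   "long%3:00:02::;long%5:00:00:tall:00"   (hypothetical ';'-separated keys)
# is reduced to its first sense key when the TSV file is written.
# Pass cut_multisense=False to keep the full multi-sense annotation instead.
omsti = WSD_OMSTI(cut_multisense=True)
print(omsti)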

1683class WSD_TRAINOMATIC(EntityLinkingCorpus): 

1684 def __init__( 

1685 self, 

1686 base_path: Union[str, Path] = None, 

1687 in_memory: bool = True, 

1688 columns={0: "text", 3: "wn30_key"}, 

1689 tag_to_bioes=None, 

1690 label_name_map: Dict[str, str] = None, 

1691 banned_sentences: List[str] = None, 

1692 sample_missing_splits: bool = True, 

1693 use_raganato_ALL_as_test_data: bool = False, 

1694 ): 

1695 """ 

1696        Initialize the Train-O-Matic corpus, as provided in UFSAC (https://github.com/getalp/UFSAC). 

1697        When the corpus is initialized for the first time, the complete UFSAC data is downloaded. 

1698 """ 

1699 if type(base_path) == str: 

1700 base_path: Path = Path(base_path) 

1701 

1702 dataset_name = 'wsd_ufsac' 

1703 

1704 # default dataset folder is the cache root 

1705 if not base_path: 

1706 base_path = flair.cache_root / "datasets" 

1707 data_folder = base_path / dataset_name 

1708 original_data_folder = data_folder / 'original_data' 

1709 

1710        # We check whether the UFSAC data has already been downloaded. If not, we download it. 

1711        # Note that this downloads more datasets than just Train-O-Matic, but the download is only around 190 MB (around 4.5 GB unpacked). 

1712 if not original_data_folder.exists(): 

1713 # create folder 

1714 data_folder.mkdir(parents=True) 

1715 

1716 # download data 

1717 import gdown 

1718 

1719 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1720 

1721 output = data_folder / (dataset_name + '.tar') 

1722 

1723 gdown.download(url, str(output), quiet=False) 

1724 

1725 output = data_folder / (dataset_name + '.tar') 

1726 unpack_file(file=output, 

1727 unpack_to=data_folder, 

1728 mode='tar', keep=False) 

1729 

1730 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1731 

1732 if use_raganato_ALL_as_test_data: 

1733            # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split will be sampled 

1734 if sample_missing_splits: 

1735 sample_missing_splits = 'only_dev' 

1736 

1737 # generate the test file 

1738 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True) 

1739 else: 

1740 test_file = None 

1741 

1742 train_file = determine_tsv_file(filename='trainomatic', data_folder=data_folder, 

1743 cut_multisense=False) # no multisenses 

1744 

1745 super(WSD_TRAINOMATIC, self).__init__( 

1746 data_folder=data_folder, 

1747 columns=columns, 

1748 train_file=train_file, 

1749 test_file=test_file, 

1750 in_memory=in_memory, 

1751 document_separator_token='-DOCSTART-', 

1752 column_delimiter='\t', 

1753 autofind_splits=False, 

1754 tag_to_bioes=tag_to_bioes, 

1755 label_name_map=label_name_map, 

1756 banned_sentences=banned_sentences, 

1757 sample_missing_splits=sample_missing_splits, 

1758 )
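Once any of these corpora is constructed, the sense annotations are exposed under the 'wn30_key' label type that the columns mapping assigns to column 3. A small inspection sketch, assuming a recent flair API where spans are retrieved with get_spans and labels read via get_labels (older versions expose the value as span.tag instead):

from flair.datasets.entity_linking import WSD_TRAINOMATIC

corpus = WSD_TRAINOMATIC()

# Inspect the sense annotations of the first training sentence.
sentence = corpus.train[0]
for span in sentence.get_spans("wn30_key"):
    # span.text is the annotated word; the label value holds the WordNet 3.0 sense key.
    print(span.text, "->", [label.value for label in span.get_labels("wn30_key")])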