Coverage for flair/flair/datasets/entity_linking.py: 7%


707 statements  

1import csv 

2import logging 

3import os 

4from pathlib import Path 

5from typing import Union, List, Dict 

6 

7import requests 

8 

9import flair 

10from flair.data import Dictionary, Sentence, MultiCorpus 

11from flair.datasets import ColumnCorpus 

12from flair.file_utils import cached_path, unpack_file 

13from flair.tokenization import SentenceSplitter, SegtokSentenceSplitter 

14 

15log = logging.getLogger("flair") 

16 

17 

18class EntityLinkingCorpus(ColumnCorpus): 

19 def __init__( 

20 self, 

21 data_folder, 

22 train_file, 

23 columns={0: "text", 1: "nel"}, 

24 column_delimiter="\t", 

25 in_memory=True, 

26 document_separator_token='-DOCSTART-', 

27 **corpusargs, 

28 ): 

29 """ 

30 Super class for all entity linking corpora. Expects the data to be in column format with one column for words and another one for the BIO tag combined with the Wikipedia page

31 name, e.g. B-Brad_Pitt.

32 The class provides the function make_entity_dict to create an entity dictionary suited for entity linking. 

33 """ 

34 # TODO: Add a routine, that checks annotations for some widespread errors/inconsistencies??? (e.g. in AQUAINT corpus Iran-Iraq_War vs. Iran-Iraq_war) 

35 

36 super(EntityLinkingCorpus, self).__init__( 

37 data_folder, 

38 columns, 

39 train_file=train_file, 

40 column_delimiter=column_delimiter, 

41 in_memory=in_memory, 

42 document_separator_token=document_separator_token, 

43 **corpusargs, 

44 ) 

45 

46 def make_entity_dict(self, label_type='nel', threshold: int = 1) -> Dictionary: 

47 """ 

48 Create an ID dictionary for the Wikipedia page names.

49 :param threshold: ignore entity links that occur fewer than threshold times

50 

51 In entity_occurences, all wiki names and their numbers of occurrence are saved.

52 ent_dictionary contains all wiki names that occur at least threshold times and assigns each name an ID.

53 """ 

54 self.threshold = threshold 

55 self.entity_occurences = {} 

56 self.total_number_of_entity_mentions = 0 

57 

58 for sentence in self.get_all_sentences(): 

59 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences 

60 

61 spans = sentence.get_spans(label_type) 

62 for span in spans: 

63 annotation = span.tag 

64 self.total_number_of_entity_mentions += 1 

65 if annotation in self.entity_occurences: 

66 self.entity_occurences[annotation] += 1 

67 else: 

68 self.entity_occurences[annotation] = 1 

69 

70 self.number_of_entities = len(self.entity_occurences) 

71 

72 # Create the annotation dictionary 

73 self.ent_dictionary: Dictionary = Dictionary(add_unk=True) 

74 

75 for x in self.entity_occurences: 

76 if self.entity_occurences[x] >= threshold: 

77 self.ent_dictionary.add_item(x) 

78 

79 return self.ent_dictionary 

80 

81 # this function removes the 'nel' label from every second mention whose wiki name is unknown to the entity dictionary

82 def remove_unknowns(self): 

83 remove = True 

84 for sentence in self.get_all_sentences(): 

85 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences 

86 

87 spans = sentence.get_spans('nel') 

88 for span in spans: 

89 annotation = span.tag 

90 if self.ent_dictionary.get_idx_for_item(annotation) == 0: # unknown label 

91 if remove: 

92 for token in span: 

93 token.remove_labels('nel') 

94 remove = False 

95 else: 

96 remove = True 

97 

98 
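# Usage sketch (not part of the module, illustrative assumptions): building the
# entity dictionary of an EntityLinkingCorpus subclass; any subclass defined
# below works the same way.
from flair.datasets.entity_linking import NEL_ENGLISH_TWEEKI

example_corpus = NEL_ENGLISH_TWEEKI()

# keep only Wikipedia page names that are linked at least 3 times
entity_dictionary = example_corpus.make_entity_dict(label_type='nel', threshold=3)
print(len(entity_dictionary), 'of', example_corpus.number_of_entities, 'entities kept')

# optionally drop the 'nel' label from every second mention unknown to the dictionary
example_corpus.remove_unknowns()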

99class NEL_ENGLISH_AQUAINT(EntityLinkingCorpus): 

100 def __init__( 

101 self, 

102 base_path: Union[str, Path] = None, 

103 in_memory: bool = True, 

104 agreement_threshold: float = 0.5, 

105 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), 

106 **corpusargs, 

107 ): 

108 """ 

109 Initialize the AQUAINT Entity Linking corpus introduced in D. Milne and I. H. Witten,

110 "Learning to Link with Wikipedia"

111 (https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf).

112 The first time you call the constructor, the dataset is automatically downloaded and transformed into

113 tab-separated column format (aquaint.txt).

114 

115 Parameters 

116 ---------- 

117 base_path : Union[str, Path], optional 

118 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

119 to point to a different folder but typically this should not be necessary. 

120 in_memory: If True, keeps dataset in memory giving speedups in training. 

121 agreement_threshold: Some link annotations come with an agreement_score representing the agreement of the human annotators. The score ranges from 0.2 (lowest)

122 to 1.0 (highest). The lower the score, the less "important" the entity is, because fewer annotators thought it was worth linking.

123 Default is 0.5, which means the majority of annotators must have annotated the respective entity mention.

124 """ 

125 if type(base_path) == str: 

126 base_path: Path = Path(base_path) 

127 

128 self.agreement_threshold = agreement_threshold 

129 

130 # this dataset name 

131 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__ 

132 

133 # default dataset folder is the cache root 

134 if not base_path: 

135 base_path = flair.cache_root / "datasets" 

136 data_folder = base_path / dataset_name 

137 

138 aquaint_el_path = "https://www.nzdl.org/wikification/data/wikifiedStories.zip" 

139 corpus_file_name = "aquaint.txt" 

140 parsed_dataset = data_folder / corpus_file_name 

141 

142 # download and parse data if necessary 

143 if not parsed_dataset.exists(): 

144 aquaint_el_zip = cached_path(f"{aquaint_el_path}", Path("datasets") / dataset_name) 

145 unpack_file(aquaint_el_zip, data_folder, "zip", False) 

146 

147 try: 

148 with open(parsed_dataset, "w", encoding='utf-8') as txt_out: 

149 

150 # iterate over all html files 

151 for file in os.listdir(data_folder): 

152 

153 if not file.endswith(".htm"): 

154 continue 

155 

156 with open(str(data_folder / file), "r", encoding='utf-8') as txt_in: 

157 text = txt_in.read() 

158 

159 # get rid of html syntax, we only need the text 

160 strings = text.split("<p> ") 

161 strings[0] = strings[0].split('<h1 id="header">')[1][:-7] 

162 

163 for i in range(1, len(strings) - 1): 

164 strings[i] = strings[i][:-7] 

165 

166 strings[-1] = strings[-1][:-23] 

167 

168 # between all documents we write a separator symbol 

169 txt_out.write('-DOCSTART-\n\n') 

170 

171 for string in strings: 

172 

173 # skip empty strings 

174 if not string: continue 

175 

176 # process the annotation format in the text and collect triples (begin_mention, length_mention, wikiname) 

177 indices = [] 

178 lengths = [] 

179 wikinames = [] 

180 

181 current_entity = string.find('[[') # each annotation starts with '[[' 

182 while current_entity != -1: 

183 wikiname = '' 

184 surface_form = '' 

185 j = current_entity + 2 

186 

187 while string[j] not in [']', '|']: 

188 wikiname += string[j] 

189 j += 1 

190 

191 if string[j] == ']': # entity mention ends, i.e. looks like this [[wikiname]] 

192 surface_form = wikiname # in this case entity mention = wiki-page name 

193 else: # string[j] == '|' 

194 j += 1 

195 while string[j] not in [']', '|']: 

196 surface_form += string[j] 

197 j += 1 

198 

199 if string[ 

200 j] == '|': # entity has a score, i.e. looks like this [[wikiname|surface_form|agreement_score]] 

201 agreement_score = float(string[j + 1:j + 4]) 

202 j += 4 # points to first ']' of entity now 

203 if agreement_score < self.agreement_threshold: # discard entity 

204 string = string[:current_entity] + surface_form + string[j + 2:] 

205 current_entity = string.find('[[') 

206 continue 

207 

208 # replace [[wikiname|surface_form|score]] by surface_form and save index, length and wikiname of mention 

209 indices.append(current_entity) 

210 lengths.append(len(surface_form)) 

211 wikinames.append(wikiname[0].upper() + wikiname.replace(' ', '_')[1:]) 

212 

213 string = string[:current_entity] + surface_form + string[j + 2:] 

214 

215 current_entity = string.find('[[') 

216 

217 # sentence splitting and tokenization 

218 sentences = sentence_splitter.split(string) 

219 sentence_offsets = [sentence.start_pos for sentence in sentences] 

220 

221 # iterate through all annotations and add to corresponding tokens 

222 for mention_start, mention_length, wikiname in zip(indices, lengths, wikinames): 

223 

224 # find sentence to which annotation belongs 

225 sentence_index = 0 

226 for i in range(1, len(sentences)): 

227 if mention_start < sentence_offsets[i]: 

228 break 

229 else: 

230 sentence_index += 1 

231 

232 # position within corresponding sentence 

233 mention_start -= sentence_offsets[sentence_index] 

234 mention_end = mention_start + mention_length 

235 

236 # set annotation for tokens of entity mention 

237 first = True 

238 for token in sentences[sentence_index].tokens: 

239 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention 

240 if first: 

241 token.set_label(typename='nel', value='B-' + wikiname) 

242 first = False 

243 else: 

244 token.set_label(typename='nel', value='I-' + wikiname) 

245 

246 # write to out-file in column format 

247 for sentence in sentences: 

248 

249 for token in sentence.tokens: 

250 

251 labels = token.get_labels('nel') 

252 

253 if len(labels) == 0: # no entity 

254 txt_out.write(token.text + '\tO\n') 

255 

256 else: # annotation 

257 txt_out.write(token.text + '\t' + labels[0].value + '\n') 

258 

259 txt_out.write('\n') # empty line after each sentence 

260 

261 except: 

262 # in case something goes wrong, delete the dataset and raise error 

263 os.remove(parsed_dataset) 

264 raise 

265 

266 super(NEL_ENGLISH_AQUAINT, self).__init__( 

267 data_folder, 

268 train_file=corpus_file_name, 

269 in_memory=in_memory, 

270 **corpusargs, 

271 ) 

272 

273 
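# Usage sketch (not part of the module): loading AQUAINT with a stricter
# annotator-agreement threshold and an explicit sentence splitter; the values
# below are illustrative assumptions.
from flair.datasets.entity_linking import NEL_ENGLISH_AQUAINT
from flair.tokenization import SegtokSentenceSplitter

aquaint = NEL_ENGLISH_AQUAINT(
    agreement_threshold=0.8,                    # discard mentions below 0.8 agreement
    sentence_splitter=SegtokSentenceSplitter(),
)
print(aquaint)  # prints the sizes of the train/dev/test splits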

274class NEL_GERMAN_HIPE(EntityLinkingCorpus): 

275 def __init__( 

276 self, 

277 base_path: Union[str, Path] = None, 

278 in_memory: bool = True, 

279 wiki_language: str = 'dewiki', 

280 **corpusargs 

281 ): 

282 """ 

283 Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German (see description 

284 of HIPE at https://impresso.github.io/CLEF-HIPE-2020/). This version was segmented by @stefan-it and is hosted 

285 at https://github.com/stefan-it/clef-hipe. 

286 The first time you call the constructor, the dataset is automatically downloaded and transformed into

287 tab-separated column format.

288 

289 Parameters 

290 ---------- 

291 base_path : Union[str, Path], optional 

292 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

293 to point to a different folder but typically this should not be necessary. 

294 in_memory: If True, keeps dataset in memory giving speedups in training. 

295 wiki_language : specifies the language of the Wikipedia page names, i.e. which language version of

296 Wikipedia to use. Since the text is in German, the default is 'dewiki'.

297 """ 

298 self.wiki_language = wiki_language 

299 if type(base_path) == str: 

300 base_path: Path = Path(base_path) 

301 

302 # this dataset name 

303 dataset_name = self.__class__.__name__.lower() 

304 

305 # default dataset folder is the cache root 

306 if not base_path: 

307 base_path = flair.cache_root / "datasets" 

308 data_folder = base_path / dataset_name 

309 

310 dev_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/dev-v1.2/de/HIPE-data-v1.2-dev-de-normalized-manual-eos.tsv" 

311 test_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/test-v1.3/de/HIPE-data-v1.3-test-de-normalized-manual-eos.tsv" 

312 train_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/training-v1.2/de/HIPE-data-v1.2-train-de-normalized-manual-eos.tsv" 

313 train_file_name = wiki_language + "_train.tsv" 

314 parsed_dataset = data_folder / train_file_name 

315 

316 # download and parse data if necessary 

317 if not parsed_dataset.exists(): 

318 

319 # from qwikidata.linked_data_interface import get_entity_dict_from_api 

320 

321 original_train_path = cached_path(f"{train_raw_url}", Path("datasets") / dataset_name) 

322 original_test_path = cached_path(f"{test_raw_url}", Path("datasets") / dataset_name) 

323 original_dev_path = cached_path(f"{dev_raw_url}", Path("datasets") / dataset_name) 

324 

325 # generate qid wikiname dictionaries 

326 log.info('Get wikinames from wikidata...') 

327 train_dict = self._get_qid_wikiname_dict(path=original_train_path) 

328 test_dict = self._get_qid_wikiname_dict(original_test_path) 

329 dev_dict = self._get_qid_wikiname_dict(original_dev_path) 

330 log.info('...done!') 

331 

332 # merge dictionaries 

333 qid_wikiname_dict = {**train_dict, **test_dict, **dev_dict} 

334 

335 for doc_path, file_name in zip([original_train_path, original_test_path, original_dev_path], 

336 [train_file_name, wiki_language + '_test.tsv', wiki_language + '_dev.tsv']): 

337 with open(doc_path, 'r', encoding='utf-8') as read, open(data_folder / file_name, 'w', 

338 encoding='utf-8') as write: 

339 

340 # ignore first line 

341 read.readline() 

342 line = read.readline() 

343 last_eos = True 

344 

345 while line: 

346 # commented and empty lines 

347 if line[0] == '#' or line == '\n': 

348 if line[2:13] == 'document_id': # beginning of new document 

349 

350 if last_eos: 

351 write.write('-DOCSTART-\n\n') 

352 last_eos = False 

353 else: 

354 write.write('\n-DOCSTART-\n\n') 

355 

356 else: 

357 line_list = line.split('\t') 

358 if not line_list[7] in ['_', 'NIL']: # line has wikidata link 

359 

360 wikiname = qid_wikiname_dict[line_list[7]] 

361 

362 if wikiname != 'O': 

363 annotation = line_list[1][:2] + wikiname 

364 else: # no entry in chosen language 

365 annotation = 'O' 

366 

367 else: 

368 

369 annotation = 'O' 

370 

371 write.write(line_list[0] + '\t' + annotation + '\n') 

372 

373 if line_list[-1][-4:-1] == 'EOS': # end of sentence 

374 write.write('\n') 

375 last_eos = True 

376 else: 

377 last_eos = False 

378 

379 line = read.readline() 

380 

381 super(NEL_GERMAN_HIPE, self).__init__( 

382 data_folder, 

383 train_file=train_file_name, 

384 dev_file=wiki_language + '_dev.tsv', 

385 test_file=wiki_language + '_test.tsv', 

386 in_memory=in_memory, 

387 **corpusargs, 

388 ) 

389 

390 def _get_qid_wikiname_dict(self, path): 

391 

392 qid_set = set() 

393 with open(path, mode='r', encoding='utf-8') as read: 

394 # read all Q-IDs 

395 

396 # ignore first line 

397 read.readline() 

398 line = read.readline() 

399 

400 while line: 

401 

402 if not (line[0] == '#' or line == '\n'): # commented or empty lines 

403 line_list = line.split('\t') 

404 if not line_list[7] in ['_', 'NIL']: # line has wikidata link 

405 

406 qid_set.add(line_list[7]) 

407 

408 line = read.readline() 

409 

410 base_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&sitefilter=' + self.wiki_language + '&ids=' 

411 

412 qid_list = list(qid_set) 

413 ids = '' 

414 length = len(qid_list) 

415 qid_wikiname_dict = {} 

416 for i in range(length): 

417 if ( 

418 i + 1) % 50 == 0 or i == length - 1: # there is a limit to the number of ids in one request in the wikidata api 

419 

420 ids += qid_list[i] 

421 # request 

422 response_json = requests.get(base_url + ids).json() 

423 

424 for qid in response_json['entities']: 

425 

426 try: 

427 wikiname = response_json['entities'][qid]['sitelinks'][self.wiki_language]['title'].replace(' ', 

428 '_') 

429 except KeyError: # language not available for this Wikidata item

430 wikiname = 'O' 

431 

432 qid_wikiname_dict[qid] = wikiname 

433 

434 ids = '' 

435 

436 else: 

437 ids += qid_list[i] 

438 ids += '|' 

439 

440 return qid_wikiname_dict 

441 

442 
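# Sketch (not part of the module) of the Wikidata lookup performed by
# _get_qid_wikiname_dict above: QIDs are sent in batches to the wbgetentities
# endpoint and the sitelink title of the chosen language edition is read from
# the response. Batch size and the example QIDs are simplifying assumptions.
import requests

def qids_to_wikinames(qids, wiki_language='dewiki', batch_size=50):
    base_url = ('https://www.wikidata.org/w/api.php?action=wbgetentities'
                '&format=json&props=sitelinks&sitefilter=' + wiki_language + '&ids=')
    mapping = {}
    qids = list(qids)
    for start in range(0, len(qids), batch_size):
        batch = qids[start:start + batch_size]
        response = requests.get(base_url + '|'.join(batch)).json()
        for qid, entity in response['entities'].items():
            try:
                mapping[qid] = entity['sitelinks'][wiki_language]['title'].replace(' ', '_')
            except KeyError:  # no article in the chosen language edition
                mapping[qid] = 'O'
    return mapping

# e.g. qids_to_wikinames(['Q64', 'Q1055']) -> {'Q64': 'Berlin', 'Q1055': 'Hamburg'}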

443class NEL_ENGLISH_AIDA(EntityLinkingCorpus): 

444 def __init__( 

445 self, 

446 base_path: Union[str, Path] = None, 

447 in_memory: bool = True, 

448 check_existence: bool = False, 

449 **corpusargs 

450 ): 

451 """ 

452 Initialize the AIDA CoNLL-YAGO Entity Linking corpus introduced at https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads.

453 License: https://creativecommons.org/licenses/by-sa/3.0/deed.en_US

454 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.

455 

456 Parameters 

457 ---------- 

458 base_path : Union[str, Path], optional 

459 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

460 to point to a different folder but typically this should not be necessary. 

461 in_memory: If True, keeps dataset in memory giving speedups in training. 

462 check_existence: If True, the existence of the given Wikipedia ids/page names is checked and non-existent ids/names will be ignored.

463 """ 

464 if type(base_path) == str: 

465 base_path: Path = Path(base_path) 

466 

467 # this dataset name 

468 dataset_name = self.__class__.__name__.lower() 

469 

470 # default dataset folder is the cache root 

471 if not base_path: 

472 base_path = flair.cache_root / "datasets" 

473 data_folder = base_path / dataset_name 

474 

475 conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/" 

476 corpus_file_name = "train" 

477 parsed_dataset = data_folder / corpus_file_name 

478 

479 if not parsed_dataset.exists(): 

480 

481 import wikipediaapi 

482 

483 wiki_wiki = wikipediaapi.Wikipedia(language='en') 

484 

485 testa_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testa", Path("datasets") / dataset_name) 

486 testb_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testb", Path("datasets") / dataset_name) 

487 train_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_train", Path("datasets") / dataset_name) 

488 

489 # we use the wikiids in the data instead of directly using the wikipedia urls.

490 # this way we can quickly check whether the corresponding page exists

491 wikiid_wikiname_dict = self._get_wikiid_wikiname_dict(data_folder) 

492 

493 for name, path in zip(['train', 'testa', 'testb'], 

494 [train_unprocessed_path, testa_unprocessed_path, testb_unprocessed_path]): 

495 with open(data_folder / name, 'w', encoding='utf-8') as write, open(path, 'r', 

496 encoding='utf-8') as read: 

497 

498 for line in read: 

499 

500 line_list = line.split('\t') 

501 if len(line_list) <= 4: 

502 if line_list[0][:10] == '-DOCSTART-': # Docstart 

503 write.write('-DOCSTART-\n\n') 

504 elif line_list[0] == '\n': # empty line 

505 write.write('\n') 

506 else: # text without annotation or marked '--NME--' (no matching entity) 

507 if len(line_list) == 1: 

508 write.write(line_list[0][:-1] + '\tO\n') 

509 else: 

510 write.write(line_list[0] + '\tO\n') 

511 else: # line with annotation 

512 wikiname = wikiid_wikiname_dict[line_list[5].strip()] 

513 if wikiname != 'O': 

514 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n') 

515 else: 

516 # if there is a bad wikiid we can check if the given url in the data exists using wikipediaapi 

517 wikiname = line_list[4].split('/')[-1] 

518 if check_existence: 

519 page = wiki_wiki.page(wikiname) 

520 if page.exists(): 

521 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n') 

522 else: # neither the wikiid nor the url exist 

523 write.write(line_list[0] + '\tO\n') 

524 else: 

525 write.write(line_list[0] + '\t' + line_list[4] + '-' + wikiname + '\n') 

526 

527 # delete unprocessed file 

528 os.remove(path) 

529 

530 super(NEL_ENGLISH_AIDA, self).__init__( 

531 data_folder, 

532 train_file=corpus_file_name, 

533 dev_file='testa', 

534 test_file='testb', 

535 in_memory=in_memory, 

536 **corpusargs, 

537 ) 

538 

539 def _get_wikiid_wikiname_dict(self, base_folder): 

540 

541 # collect all wikiids 

542 wikiid_set = set() 

543 for data_file in ['aida_conll_testa', 'aida_conll_testb', 'aida_conll_train']: 

544 with open(base_folder / data_file, mode='r', encoding='utf-8') as read: 

545 line = read.readline() 

546 while line: 

547 row = line.split('\t') 

548 if len(row) > 4: # line has a wiki annotation 

549 wikiid_set.add(row[5].strip()) 

550 line = read.readline() 

551 

552 # create the dictionary 

553 wikiid_wikiname_dict = {} 

554 wikiid_list = list(wikiid_set) 

555 ids = '' 

556 length = len(wikiid_list) 

557 

558 for i in range(length): 

559 if ( 

560 i + 1) % 50 == 0 or i == length - 1: # there is a limit to the number of ids in one request in the wikimedia api 

561 

562 ids += wikiid_list[i] 

563 # request 

564 resp = requests.get( 

565 'https://en.wikipedia.org/w/api.php', 

566 params={ 

567 'action': 'query', 

568 'prop': 'info', 

569 'pageids': ids, 

570 'format': 'json' 

571 } 

572 ).json() 

573 

574 for wikiid in resp['query']['pages']: 

575 try: 

576 wikiname = resp['query']['pages'][wikiid]['title'].replace(' ', '_') 

577 except KeyError: # bad wikiid 

578 wikiname = 'O' 

579 wikiid_wikiname_dict[wikiid] = wikiname 

580 ids = '' 

581 

582 else: 

583 ids += wikiid_list[i] 

584 ids += '|' 

585 

586 return wikiid_wikiname_dict 

587 

588 
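# Usage sketch (not part of the module): loading AIDA CoNLL-YAGO. With
# check_existence=True the fallback Wikipedia URLs are verified via the
# wikipediaapi package during the one-time preprocessing; the flag value here
# is an illustrative assumption.
from flair.datasets.entity_linking import NEL_ENGLISH_AIDA

aida = NEL_ENGLISH_AIDA(check_existence=False)
print(len(aida.train), 'train /', len(aida.dev), 'dev /', len(aida.test), 'test sentences')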

589class NEL_ENGLISH_IITB(EntityLinkingCorpus): 

590 def __init__( 

591 self, 

592 base_path: Union[str, Path] = None, 

593 in_memory: bool = True, 

594 ignore_disagreements: bool = False, 

595 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), 

596 **corpusargs 

597 ): 

598 """ 

599 Initialize the IITB Entity Linking corpus introduced in "Collective Annotation of Wikipedia Entities in Web Text" by Sayali Kulkarni, Amit Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti.

600 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.

601 

602 Parameters 

603 ---------- 

604 base_path : Union[str, Path], optional 

605 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

606 to point to a different folder but typically this should not be necessary. 

607 in_memory: If True, keeps dataset in memory giving speedups in training. 

608 ignore_disagreements: If True annotations with annotator disagreement will be ignored. 

609 """ 

610 if type(base_path) == str: 

611 base_path: Path = Path(base_path) 

612 

613 # this dataset name 

614 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__ 

615 

616 # default dataset folder is the cache root 

617 if not base_path: 

618 base_path = flair.cache_root / "datasets" 

619 data_folder = base_path / dataset_name 

620 

621 iitb_el_docs_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_crawledDocs.tar.gz" 

622 iitb_el_annotations_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_Annotations.xml" 

623 corpus_file_name = "iitb.txt" 

624 parsed_dataset = data_folder / corpus_file_name 

625 

626 label_type = 'nel' 

627 

628 if not parsed_dataset.exists(): 

629 

630 docs_zip_path = cached_path(f"{iitb_el_docs_path}", Path("datasets") / dataset_name) 

631 annotations_xml_path = cached_path(f"{iitb_el_annotations_path}", Path("datasets") / dataset_name) 

632 

633 unpack_file(docs_zip_path, data_folder, "tar", False) 

634 

635 import xml.etree.ElementTree as ET 

636 tree = ET.parse(annotations_xml_path) 

637 root = tree.getroot() 

638 

639 # names of raw text documents 

640 doc_names = set() 

641 for elem in root: 

642 doc_names.add(elem[0].text) 

643 

644 # open output_file 

645 with open(parsed_dataset, 'w', encoding='utf-8') as write: 

646 # iterate through all documents 

647 for doc_name in doc_names: 

648 with open(data_folder / 'crawledDocs' / doc_name, 'r', encoding='utf-8') as read: 

649 text = read.read() 

650 

651 # split sentences and tokenize 

652 sentences = sentence_splitter.split(text) 

653 sentence_offsets = [sentence.start_pos for sentence in sentences] 

654 

655 # iterate through all annotations and add to corresponding tokens 

656 for elem in root: 

657 

658 if elem[0].text == doc_name and elem[2].text: # annotation belongs to current document 

659 

660 wikiname = elem[2].text.replace(' ', '_') 

661 mention_start = int(elem[3].text) 

662 mention_length = int(elem[4].text) 

663 

664 # find sentence to which annotation belongs 

665 sentence_index = 0 

666 for i in range(1, len(sentences)): 

667 if mention_start < sentence_offsets[i]: 

668 break 

669 else: 

670 sentence_index += 1 

671 

672 # position within corresponding sentence 

673 mention_start -= sentence_offsets[sentence_index] 

674 mention_end = mention_start + mention_length 

675 

676 # set annotation for tokens of entity mention 

677 first = True 

678 for token in sentences[sentence_index].tokens: 

679 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention 

680 if first: 

681 token.set_label(typename=elem[1].text, value='B-' + wikiname) 

682 first = False 

683 else: 

684 token.set_label(typename=elem[1].text, value='I-' + wikiname) 

685 

686 # write to out file 

687 write.write('-DOCSTART-\n\n') # each file is one document 

688 

689 for sentence in sentences: 

690 

691 for token in sentence.tokens: 

692 

693 labels = token.labels 

694 

695 if len(labels) == 0: # no entity 

696 write.write(token.text + '\tO\n') 

697 

698 elif len(labels) == 1: # annotation from one annotator 

699 write.write(token.text + '\t' + labels[0].value + '\n') 

700 

701 else: # annotations from two annotators 

702 

703 if labels[0].value == labels[1].value: # annotators agree 

704 write.write(token.text + '\t' + labels[0].value + '\n') 

705 

706 else: # annotators disagree: ignore or arbitrarily take first annotation 

707 

708 if ignore_disagreements: 

709 write.write(token.text + '\tO\n') 

710 

711 else: 

712 write.write(token.text + '\t' + labels[0].value + '\n') 

713 

714 write.write('\n') # empty line after each sentence 

715 

716 super(NEL_ENGLISH_IITB, self).__init__( 

717 data_folder, 

718 train_file=corpus_file_name, 

719 in_memory=in_memory, 

720 **corpusargs, 

721 ) 

722 

723 
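# Sketch (not part of the module) of the offset-to-token alignment used by the
# AQUAINT and IITB parsers above: a character-level mention (start, length) is
# mapped to the sentence produced by the splitter and its tokens receive
# B-/I- labels. Text, offsets and the wiki name are illustrative assumptions.
from flair.tokenization import SegtokSentenceSplitter

splitter = SegtokSentenceSplitter()
sentences = splitter.split("Brad Pitt stars in the film. It was shot in Malta.")
sentence_offsets = [sentence.start_pos for sentence in sentences]

mention_start, mention_length, wikiname = 0, 9, 'Brad_Pitt'

# find the sentence containing the mention
sentence_index = 0
for i in range(1, len(sentences)):
    if mention_start < sentence_offsets[i]:
        break
    sentence_index += 1

# convert to sentence-local offsets and label the covered tokens
mention_start -= sentence_offsets[sentence_index]
mention_end = mention_start + mention_length
first = True
for token in sentences[sentence_index].tokens:
    if token.start_pos >= mention_start and token.end_pos <= mention_end:
        token.set_label(typename='nel', value=('B-' if first else 'I-') + wikiname)
        first = False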

724class NEL_ENGLISH_TWEEKI(EntityLinkingCorpus): 

725 def __init__( 

726 self, 

727 base_path: Union[str, Path] = None, 

728 in_memory: bool = True, 

729 **corpusargs, 

730 ): 

731 """ 

732 Initialize the Tweeki Entity Linking corpus introduced in "Tweeki: Linking Named Entities on Twitter to a Knowledge Graph" by Harandizadeh and Singh.

733 The data consists of tweets with manually annotated Wikipedia links.

734 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.

735 

736 Parameters 

737 ---------- 

738 base_path : Union[str, Path], optional 

739 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

740 to point to a different folder but typically this should not be necessary. 

741 in_memory: If True, keeps dataset in memory giving speedups in training. 

742 """ 

743 if type(base_path) == str: 

744 base_path: Path = Path(base_path) 

745 

746 # this dataset name 

747 dataset_name = self.__class__.__name__.lower() 

748 

749 # default dataset folder is the cache root 

750 if not base_path: 

751 base_path = flair.cache_root / "datasets" 

752 data_folder = base_path / dataset_name 

753 

754 tweeki_gold_el_path = "https://raw.githubusercontent.com/ucinlp/tweeki/main/data/Tweeki_gold/Tweeki_gold" 

755 corpus_file_name = "tweeki_gold.txt" 

756 parsed_dataset = data_folder / corpus_file_name 

757 

758 # download and parse data if necessary 

759 if not parsed_dataset.exists(): 

760 

761 original_file_path = cached_path(f"{tweeki_gold_el_path}", Path("datasets") / dataset_name) 

762 

763 with open(original_file_path, 'r', encoding='utf-8') as read, open(parsed_dataset, 'w', 

764 encoding='utf-8') as write: 

765 line = read.readline() 

766 while line: 

767 if line.startswith('#'): 

768 out_line = '' 

769 elif line == '\n': # tweet ends 

770 out_line = '\n-DOCSTART-\n\n' 

771 else: 

772 line_list = line.split('\t') 

773 out_line = line_list[1] + '\t' 

774 if line_list[3] == '-\n': # no wiki name 

775 out_line += 'O\n' 

776 else: 

777 out_line += line_list[2][:2] + line_list[3].split('|')[0].replace(' ', '_') + '\n' 

778 write.write(out_line) 

779 line = read.readline() 

780 

781 os.rename(original_file_path, str(original_file_path) + '_original') 

782 

783 super(NEL_ENGLISH_TWEEKI, self).__init__( 

784 data_folder, 

785 train_file=corpus_file_name, 

786 in_memory=in_memory, 

787 **corpusargs, 

788 ) 

789 

790 
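# Usage sketch (not part of the module): loading the Tweeki gold corpus and
# inspecting the links of one tweet. The label type 'nel' is the default set by
# EntityLinkingCorpus.
from flair.datasets.entity_linking import NEL_ENGLISH_TWEEKI

tweeki = NEL_ENGLISH_TWEEKI()
first_tweet = tweeki.train[0]
for span in first_tweet.get_spans('nel'):
    print(span.text, '->', span.tag)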

791class NEL_ENGLISH_REDDIT(EntityLinkingCorpus): 

792 def __init__( 

793 self, 

794 base_path: Union[str, Path] = None, 

795 in_memory: bool = True, 

796 **corpusargs, 

797 ): 

798 """ 

799 Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format. 

800 The first time you call this constructor it will automatically download the dataset. 

801 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this 

802 to point to a different folder but typically this should not be necessary. 

803 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

804 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

805 """ 

806 if type(base_path) == str: 

807 base_path: Path = Path(base_path) 

808 

809 # this dataset name 

810 dataset_name = self.__class__.__name__.lower() 

811 

812 # default dataset folder is the cache root 

813 if not base_path: 

814 base_path = flair.cache_root / "datasets" 

815 data_folder = base_path / dataset_name 

816 

817 # download and parse data if necessary 

818 reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip" 

819 corpus_file_name = "reddit_el_gold.txt" 

820 parsed_dataset = data_folder / corpus_file_name 

821 

822 if not parsed_dataset.exists(): 

823 reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name) 

824 unpack_file(reddit_el_zip, data_folder, "zip", False) 

825 

826 with open(data_folder / corpus_file_name, "w", encoding='utf-8') as txtout: 

827 

828 # First parse the post titles 

829 with open(data_folder / "posts.tsv", "r", encoding='utf-8') as tsvin1, open( 

830 data_folder / "gold_post_annotations.tsv", "r", encoding='utf-8') as tsvin2: 

831 

832 posts = csv.reader(tsvin1, delimiter="\t") 

833 self.post_annotations = csv.reader(tsvin2, delimiter="\t") 

834 self.curr_annot = next(self.post_annotations) 

835 

836 for row in posts: # Go through all the post titles 

837 

838 txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token 

839 

840 # Keep track of how many and which entity mentions a given post title has

841 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention 

842 

843 # Check if the current post title has an entity link and parse accordingly 

844 if row[0] == self.curr_annot[0]: 

845 

846 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) 

847 link_annots = self._fill_annot_array(link_annots, row[0], post_flag=True) 

848 

849 # Post titles with entity mentions (if any) are handled via this function 

850 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout) 

851 else: 

852 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout) 

853 

854 # Then parse the comments 

855 with open(data_folder / "comments.tsv", "r", encoding='utf-8') as tsvin3, open( 

856 data_folder / "gold_comment_annotations.tsv", "r", encoding='utf-8') as tsvin4: 

857 

858 self.comments = csv.reader(tsvin3, delimiter="\t") 

859 self.comment_annotations = csv.reader(tsvin4, delimiter="\t") 

860 self.curr_annot = next(self.comment_annotations) 

861 self.curr_row = next(self.comments) 

862 self.stop_iter = False 

863 

864 # Iterate over the comments.tsv file, until the end is reached 

865 while not self.stop_iter: 

866 

867 txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token 

868 

869 # Keep track of the current comment thread and its corresponding key, on which the annotations are matched. 

870 # Each comment thread is handled as one 'document'. 

871 self.curr_comm = self.curr_row[4] 

872 comm_key = self.curr_row[0] 

873 

874 # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file. 

875 # This if-condition is needed to handle this problem. 

876 if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}: 

877 if comm_key == "en5rf4c": 

878 self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n")) 

879 self.curr_comm = next(self.parsed_row) 

880 self._fill_curr_comment(fix_flag=True) 

881 # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure 

882 else: 

883 self._fill_curr_comment(fix_flag=False) 

884 

885 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention 

886 

887 # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above 

888 if comm_key == self.curr_annot[0]: 

889 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) 

890 link_annots = self._fill_annot_array(link_annots, comm_key, post_flag=False) 

891 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout) 

892 else: 

893 # In two of the comment threads a case of capital-letter spacing occurs, which the SegtokTokenizer cannot properly handle.

894 # The following if-elif condition handles these two cases and as result writes full capitalized words in each corresponding row, 

895 # and not just single letters into single rows. 

896 if comm_key == "dv74ybb": 

897 self.curr_comm = " ".join( 

898 [word.replace(" ", "") for word in self.curr_comm.split(" ")]) 

899 elif comm_key == "eci2lut": 

900 self.curr_comm = (self.curr_comm[:18] + self.curr_comm[18:27].replace(" ", 

901 "") + self.curr_comm[ 

902 27:55] + 

903 self.curr_comm[55:68].replace(" ", "") + self.curr_comm[ 

904 68:85] + self.curr_comm[ 

905 85:92].replace(" ", 

906 "") + 

907 self.curr_comm[92:]) 

908 

909 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout) 

910 

911 super(NEL_ENGLISH_REDDIT, self).__init__( 

912 data_folder, 

913 train_file=corpus_file_name, 

914 in_memory=in_memory, 

915 **corpusargs, 

916 ) 

917 

918 def _text_to_cols(self, sentence: Sentence, links: list, outfile): 

919 """ 

920 Convert a tokenized sentence into column format 

921 :param sentence: Flair Sentence object containing a tokenized post title or comment thread 

922 :param links: array containing information about the starting and ending position of an entity mention, as well 

923 as its corresponding wiki tag 

924 :param outfile: file, to which the output is written 

925 """ 

926 for i in range(0, len(sentence)): 

927 # If there are annotated entity mentions for given post title or a comment thread 

928 if links: 

929 # Keep track of which entity link is the correct corresponding one, in cases where there is >1 link in a sentence

930 link_index = [j for j, v in enumerate(links) if 

931 (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])] 

932 # Write the token with a corresponding tag to file 

933 try: 

934 if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j, v in enumerate(links)): 

935 outfile.writelines(sentence[i].text + "\tS-" + links[link_index[0]][2] + "\n") 

936 elif any( 

937 sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j, v in enumerate(links)): 

938 outfile.writelines(sentence[i].text + "\tB-" + links[link_index[0]][2] + "\n") 

939 elif any( 

940 sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j, v in enumerate(links)): 

941 outfile.writelines(sentence[i].text + "\tI-" + links[link_index[0]][2] + "\n") 

942 else: 

943 outfile.writelines(sentence[i].text + "\tO\n") 

944 # IndexError is raised in cases when there is exactly one link in a sentence, therefore it can be dismissed

945 except IndexError: 

946 pass 

947 

948 # If a comment thread or a post title has no entity link, all tokens are assigned the O tag 

949 else: 

950 outfile.writelines(sentence[i].text + "\tO\n") 

951 

952 # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized 

953 # incorrectly, in order to keep the desired format (empty line as a sentence separator). 

954 try: 

955 if ((sentence[i].text in {".", "!", "?", "!*"}) and 

956 (sentence[i + 1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and 

957 ("." not in sentence[i - 1].text)): 

958 outfile.writelines("\n") 

959 except IndexError: 

960 # Thrown when the second check above happens, but the last token of a sentence is reached. 

961 # Indicates that the EOS punctuation mark is present, therefore an empty line needs to be written below.

962 outfile.writelines("\n") 

963 

964 # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS 

965 if sentence[-1].text not in {".", "!", "?"}: 

966 outfile.writelines("\n") 

967 

968 def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: 

969 """ 

970 Fills the array containing information about the entity mention annotations, used in the _text_to_cols method 

971 :param annot_array: array to be filled 

972 :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation 

973 :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) 

974 or comment threads (=False) 

975 """ 

976 next_annot = None 

977 while True: 

978 # Check if further annotations belong to the current post title or comment thread as well 

979 try: 

980 next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations) 

981 if next_annot[0] == key: 

982 annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3])) 

983 else: 

984 self.curr_annot = next_annot 

985 break 

986 # Stop when the end of an annotation file is reached 

987 except StopIteration: 

988 break 

989 return annot_array 

990 

991 def _fill_curr_comment(self, fix_flag: bool): 

992 """ 

993 Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the 

994 comments are parsed. 

995 :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) 

996 or regular rows (=False) 

997 """ 

998 next_row = None 

999 while True: 

1000 # Check if further annotations belong to the current sentence as well 

1001 try: 

1002 next_row = next(self.comments) if not fix_flag else next(self.parsed_row) 

1003 if len(next_row) < 2: 

1004 # 'else " "' is needed to keep the proper token positions (for accordance with annotations) 

1005 self.curr_comm += next_row[0] if any(next_row) else " " 

1006 else: 

1007 self.curr_row = next_row 

1008 break 

1009 except StopIteration: # When the end of the comments.tsv file is reached 

1010 self.curr_row = next_row 

1011 self.stop_iter = True if not fix_flag else False 

1012 break 

1013 

1014 
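# Usage sketch (not part of the module): the Reddit corpus is parsed from the
# posts/comments TSV files on first use and then loaded like any other
# EntityLinkingCorpus; note that _text_to_cols also emits S- tags for
# single-token mentions.
from flair.datasets.entity_linking import NEL_ENGLISH_REDDIT

reddit = NEL_ENGLISH_REDDIT(in_memory=True)
print(reddit)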

1015def from_ufsac_to_tsv(xml_file: Union[str, Path], conll_file: Union[str, Path], datasetname: str, 

1016 encoding: str = "utf8", 

1017 cut_multisense: bool = True): 

1018 """ 

1019 Function that converts the UFSAC format into tab-separated column format in a new file.

1020 Parameters 

1021 ---------- 

1022 xml_file : Union[str, Path] 

1023 Path to the xml file. 

1024 conll_file : Union[str, Path] 

1025 Path for the new conll file. 

1026 datasetname: str 

1027 Name of the dataset from UFSAC, needed because multi-word spans are handled differently across the datasets.

1028 encoding : str, optional 

1029 Encoding used in open function. The default is "utf8". 

1030 cut_multisense : bool, optional 

1031 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. 

1032 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected 

1033 as one new sense. The default is True. 

1034 

1035 """ 

1036 

1037 def make_line(word, begin_or_inside, attributes): 

1038 """ 

1039 Function that creates an output line from a word. 

1040 Parameters 

1041 ---------- 

1042 word : 

1043 String of the actual word. 

1044 begin_or_inside: 

1045 Either 'B-' or 'I-' 

1046 attributes: 

1047 List of attributes of the word (pos, lemma, wn30_key) 

1048 """ 

1049 line = word 

1050 if cut_multisense == True: 

1051 attributes[-1] = attributes[-1].split(';')[0] # take only first sense 

1052 

1053 for attrib in attributes: 

1054 if attrib != 'O': 

1055 line = line + '\t' + begin_or_inside + attrib 

1056 else: 

1057 line = line + '\tO' 

1058 line += '\n' 

1059 

1060 return line 

1061 

1062 def split_span(word_fields: List[str], datasetname: str): 

1063 """ 

1064 Function that splits a word if necessary, i.e. if it is a multi-word span.

1065 Parameters 

1066 ---------- 

1067 word_fields : 

1068 list ['surface_form', 'lemma', 'pos', 'wn30_key'] of a word 

1069 datasetname: 

1070 name of corresponding dataset 

1071 """ 

1072 

1073 span = word_fields[0] 

1074 

1075 if datasetname in ['trainomatic', 'masc']: # splitting not sensible for these datasets 

1076 return [span] 

1077 elif datasetname == 'omsti': 

1078 if word_fields[ 

1079 3] != 'O' and not span == '_' and not '__' in span: # has annotation and does not consist only of '_' (still not 100% clean) 

1080 return span.split('_') 

1081 else: 

1082 return [span] 

1083 else: # for all other datasets splitting at '_' is always sensible 

1084 return span.split('_') 

1085 

1086 txt_out = open(file=conll_file, mode='w', encoding=encoding) 

1087 import xml.etree.ElementTree as ET 

1088 tree = ET.parse(xml_file) 

1089 corpus = tree.getroot() 

1090 

1091 number_of_docs = len(corpus.findall('document')) 

1092 

1093 fields = ['surface_form', 'lemma', 'pos', 'wn30_key'] 

1094 for document in corpus: 

1095 # Docstart 

1096 if number_of_docs > 1: 

1097 txt_out.write('-DOCSTART-\n\n') 

1098 

1099 for paragraph in document: 

1100 

1101 for sentence in paragraph: 

1102 

1103 for word in sentence: 

1104 

1105 dictionary = word.attrib 

1106 fields_of_word = [word.attrib[field] if (field in dictionary) else 'O' for field in fields] 

1107 

1108 chunks = split_span(fields_of_word, datasetname) 

1109 

1110 txt_out.write(make_line(chunks[0], 'B-', fields_of_word[1:])) 

1111 

1112 # if there is more than one word in the chunk we write each in a separate line 

1113 for chunk in chunks[1:]: 

1114 # print(chunks) 

1115 txt_out.write(make_line(chunk, 'I-', fields_of_word[1:])) 

1116 

1117 # empty line after each sentence 

1118 txt_out.write('\n') 

1119 

1120 txt_out.close() 

1121 

1122 
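# Usage sketch (not part of the module): converting a single UFSAC XML file to
# the tab-separated format read by ColumnCorpus. The paths are hypothetical.
from pathlib import Path
from flair.datasets.entity_linking import from_ufsac_to_tsv

from_ufsac_to_tsv(
    xml_file=Path('original_data/semcor.xml'),   # hypothetical input path
    conll_file=Path('semcor_cut.tsv'),           # hypothetical output path
    datasetname='semcor',
    cut_multisense=True,                         # keep only the first sense of each wn30_key
)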

1123def determine_tsv_file(filename: str, data_folder: str, cut_multisense: bool = True): 

1124 """ 

1125 Checks whether the converted .tsv file already exists and, if not, creates it. Returns the name of the file.

1126 ---------- 

1127 filename : str

1128 String that contains the name of the dataset (without file extension).

1129 data_folder : str 

1130 String that contains the name of the folder in which the CoNLL file should reside. 

1131 cut_multisense : bool, optional 

1132 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. 

1133 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected 

1134 as one new sense. The default is True. 

1135 """ 

1136 

1137 if cut_multisense is True and filename not in ['semeval2007task17', 'trainomatic', 

1138 'wngt']: # these three datasets do not have multiple senses 

1139 

1140 conll_file_name = filename + '_cut.tsv' 

1141 

1142 else: 

1143 

1144 conll_file_name = filename + '.tsv' 

1145 

1146 path_to_conll_file = data_folder / conll_file_name 

1147 

1148 if not path_to_conll_file.exists(): 

1149 # convert the file to CoNLL 

1150 

1151 from_ufsac_to_tsv(xml_file=Path(data_folder / 'original_data' / (filename + '.xml')), 

1152 conll_file=Path(data_folder / conll_file_name), 

1153 datasetname=filename, 

1154 cut_multisense=cut_multisense) 

1155 

1156 return conll_file_name 

1157 

1158 
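# Usage sketch (not part of the module): determine_tsv_file converts the
# requested UFSAC dataset on demand and returns the name of the resulting
# .tsv file inside data_folder. The folder path is a hypothetical example and
# must contain an 'original_data' subfolder with the UFSAC XML files.
from pathlib import Path
from flair.datasets.entity_linking import determine_tsv_file

data_folder = Path('wsd_ufsac')    # hypothetical cache folder
tsv_name = determine_tsv_file('semcor', data_folder, cut_multisense=True)
print(tsv_name)  # 'semcor_cut.tsv' -- cut_multisense adds the '_cut' suffix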

1159class WSD_UFSAC(MultiCorpus): 

1160 def __init__( 

1161 self, 

1162 filenames: Union[str, List[str]] = ['masc', 'semcor'], 

1163 base_path: Union[str, Path] = None, 

1164 in_memory: bool = True, 

1165 cut_multisense: bool = True, 

1166 columns={0: "text", 3: "wn30_key"}, 

1167 tag_to_bioes=None, 

1168 banned_sentences: List[str] = None, 

1169 sample_missing_splits_in_multicorpus: bool = True, 

1170 sample_missing_splits_in_each_corpus: bool = True, 

1171 use_raganato_ALL_as_test_data: bool = False, 

1172 name: str = 'multicorpus' 

1173 ): 

1174 """ 

1175 Initialize a custom corpus with any Word Sense Disambiguation (WSD) datasets in the UFSAC format from https://github.com/getalp/UFSAC. 

1176 The first time the constructor is called, the data is automatically downloaded and transformed from XML into tab-separated column format.

1177 Since only the WordNet 3.0 sense inventory is consistently available for all provided datasets, we only consider this version.

1178 Also, we ignore the id annotations used in datasets that were originally created for evaluation tasks.

1179 :param filenames: Here you can pass a single dataset name or a list of dataset names. The available names are:

1180 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', 

1181 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', 

1182 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'. 

1183 So you can pass, for example, filenames = ['masc', 'omsti', 'wngt']. By default, the two mid-sized datasets 'masc' and 'semcor' are loaded.

1184 :param base_path: You can override this to point to a specific folder but typically this should not be necessary. 

1185 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1186 :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object 

1187 :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains 

1188 multiple possible senses. If True only the first listed sense will be used and the 

1189 suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of 

1190 senses will be detected as one new sense. The default is True. 

1191 :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "wn30_key"} 

1192 if you want to use additional pos and/or lemma for the words. 

1193 :param tag_to_bioes: whether to convert to BIOES tagging scheme 

1194 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true 

1195 :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if 

1196 sample_missing_splits_in_each_corpus is True) 

1197 :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. 

1198 :param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al., "Word Sense Disambiguation: A Unified Evaluation Framework and Empirical Comparison")

1199 will be used as test data. Note that in this case the sample_missing_splits parameters, if set to True, are changed to 'only_dev'.

1200 :param name: Name of your (custom) corpus

1201 """ 

1202 if type(base_path) == str: 

1203 base_path: Path = Path(base_path) 

1204 

1205 # this dataset name 

1206 dataset_name = self.__class__.__name__.lower() 

1207 

1208 # default dataset folder is the cache root 

1209 if not base_path: 

1210 base_path = flair.cache_root / "datasets" 

1211 data_folder = base_path / dataset_name 

1212 original_data_folder = data_folder / 'original_data' 

1213 

1214 # check if data there, if not, download the data 

1215 if not original_data_folder.exists(): 

1216 # create folder 

1217 data_folder.mkdir(parents=True) 

1218 

1219 # download data 

1220 import gdown 

1221 

1222 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1223 

1224 output = data_folder / (dataset_name + '.tar') 

1225 

1226 gdown.download(url, str(output), quiet=False) 

1227 

1228 output = data_folder / (dataset_name + '.tar') 

1229 unpack_file(file=output, 

1230 unpack_to=data_folder, 

1231 mode='tar', keep=False) 

1232 

1233 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1234 

1235 # transform data into column format if necessary 

1236 

1237 # if no filenames are specified we use all the data 

1238 if not filenames: 

1239 filenames = [name[:-4] for name in os.listdir(original_data_folder) if not 'raganato' in name] 

1240 

1241 if type(filenames) == str: 

1242 filenames = [filenames] 

1243 

1244 corpora = [] 

1245 

1246 print('Transforming data into column format and creating corpora...') 

1247 

1248 if use_raganato_ALL_as_test_data: 

1249 # in this case no test data should be generated by sampling from train data. But if the sample arguments are set to true, the dev set will be sampled 

1250 if sample_missing_splits_in_each_corpus: 

1251 sample_missing_splits_in_each_corpus = 'only_dev' 

1252 if sample_missing_splits_in_multicorpus: 

1253 sample_missing_splits_in_multicorpus = 'only_dev' 

1254 

1255 # also we remove 'raganato_ALL' from filenames in case it's in the list

1256 filenames.remove('raganato_ALL') 

1257 

1258 # generate the test file 

1259 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1260 cut_multisense=cut_multisense) 

1261 

1262 corpus = ColumnCorpus(data_folder=data_folder, 

1263 column_format=columns, 

1264 test_file=test_file, # corpus only has test data 

1265 in_memory=in_memory, 

1266 tag_to_bioes=tag_to_bioes, 

1267 column_delimiter='\t', 

1268 document_separator_token='-DOCSTART-', 

1269 banned_sentences=banned_sentences, 

1270 autofind_splits=False, 

1271 sample_missing_splits=sample_missing_splits_in_each_corpus, 

1272 ) 

1273 corpora.append(corpus) 

1274 

1275 for filename in filenames: 

1276 # make column file and save to data_folder 

1277 

1278 new_filename = determine_tsv_file(filename=filename, data_folder=data_folder, cut_multisense=cut_multisense) 

1279 

1280 corpus = ColumnCorpus(data_folder=data_folder, 

1281 column_format=columns, 

1282 train_file=new_filename, 

1283 in_memory=in_memory, 

1284 tag_to_bioes=tag_to_bioes, 

1285 column_delimiter='\t', 

1286 document_separator_token='-DOCSTART-', 

1287 banned_sentences=banned_sentences, 

1288 autofind_splits=False, 

1289 sample_missing_splits=sample_missing_splits_in_each_corpus, 

1290 ) 

1291 corpora.append(corpus) 

1292 print('...done!') 

1293 

1294 super(WSD_UFSAC, self).__init__( 

1295 corpora, 

1296 sample_missing_splits=sample_missing_splits_in_multicorpus, 

1297 name=name 

1298 ) 

1299 

1300 
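# Usage sketch (not part of the module): combining several UFSAC datasets into
# one MultiCorpus and reserving raganato_ALL as the shared test set. The chosen
# dataset names and the extra POS column are illustrative assumptions.
from flair.datasets.entity_linking import WSD_UFSAC

wsd = WSD_UFSAC(
    filenames=['masc', 'semcor'],
    columns={0: 'text', 2: 'pos', 3: 'wn30_key'},   # also load the POS column
    use_raganato_ALL_as_test_data=True,
)
print(wsd)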

1301class WSD_RAGANATO_ALL(EntityLinkingCorpus): 

1302 def __init__( 

1303 self, 

1304 base_path: Union[str, Path] = None, 

1305 in_memory: bool = True, 

1306 columns={0: "text", 3: "wn30_key"}, 

1307 tag_to_bioes=None, 

1308 label_name_map: Dict[str, str] = None, 

1309 banned_sentences: List[str] = None, 

1310 sample_missing_splits: bool = True, 

1311 cut_multisense: bool = True 

1312 ): 

1313 """ 

1314 Initialize raganato_ALL (the concatenation of all SensEval and SemEval all-words tasks) provided in UFSAC (https://github.com/getalp/UFSAC).

1315 When first initializing the corpus, the whole UFSAC data is downloaded.

1316 """ 

1317 if type(base_path) == str: 

1318 base_path: Path = Path(base_path) 

1319 

1320 dataset_name = 'wsd_ufsac' 

1321 

1322 # default dataset folder is the cache root 

1323 if not base_path: 

1324 base_path = flair.cache_root / "datasets" 

1325 data_folder = base_path / dataset_name 

1326 original_data_folder = data_folder / 'original_data' 

1327 

1328 # We check whether the UFSAC data has already been downloaded. If not, we download it.

1329 # Note that this downloads more datasets than just raganato_ALL. But the size of the download is only around 190 MB (around 4.5 GB unpacked)

1330 if not original_data_folder.exists(): 

1331 # create folder 

1332 data_folder.mkdir(parents=True) 

1333 

1334 # download data 

1335 import gdown 

1336 

1337 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1338 

1339 output = data_folder / (dataset_name + '.tar') 

1340 

1341 gdown.download(url, str(output), quiet=False) 

1342 

1343 output = data_folder / (dataset_name + '.tar') 

1344 unpack_file(file=output, 

1345 unpack_to=data_folder, 

1346 mode='tar', keep=False) 

1347 

1348 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1349 

1350 train_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=cut_multisense) 

1351 

1352 super(WSD_RAGANATO_ALL, self).__init__( 

1353 data_folder=data_folder, 

1354 columns=columns, 

1355 train_file=train_file, 

1356 in_memory=in_memory, 

1357 document_separator_token='-DOCSTART-', 

1358 column_delimiter='\t', 

1359 autofind_splits=False, 

1360 tag_to_bioes=tag_to_bioes, 

1361 label_name_map=label_name_map, 

1362 banned_sentences=banned_sentences, 

1363 sample_missing_splits=sample_missing_splits, 

1364 ) 

1365 

1366 
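# Usage sketch (not part of the module): raganato_ALL is loaded as a single
# train file; dev/test splits are sampled from it unless sample_missing_splits
# is disabled.
from flair.datasets.entity_linking import WSD_RAGANATO_ALL

raganato = WSD_RAGANATO_ALL(cut_multisense=True)
print(raganato)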

1367class WSD_SEMCOR(EntityLinkingCorpus): 

1368 def __init__( 

1369 self, 

1370 base_path: Union[str, Path] = None, 

1371 in_memory: bool = True, 

1372 columns={0: "text", 3: "wn30_key"}, 

1373 tag_to_bioes=None, 

1374 label_name_map: Dict[str, str] = None, 

1375 banned_sentences: List[str] = None, 

1376 sample_missing_splits: bool = True, 

1377 cut_multisense: bool = True, 

1378 use_raganato_ALL_as_test_data: bool = False, 

1379 ): 

1380 """ 

1381 Initialize SemCor provided in UFSAC (https://github.com/getalp/UFSAC).

1382 When first initializing the corpus, the whole UFSAC data is downloaded.

1383 """ 

1384 if type(base_path) == str: 

1385 base_path: Path = Path(base_path) 

1386 

1387 dataset_name = 'wsd_ufsac' 

1388 

1389 # default dataset folder is the cache root 

1390 if not base_path: 

1391 base_path = flair.cache_root / "datasets" 

1392 data_folder = base_path / dataset_name 

1393 original_data_folder = data_folder / 'original_data' 

1394 

1395 # We check whether the UFSAC data has already been downloaded. If not, we download it.

1396 # Note that this downloads more datasets than just SemCor. But the size of the download is only around 190 MB (around 4.5 GB unpacked)

1397 if not original_data_folder.exists(): 

1398 # create folder 

1399 data_folder.mkdir(parents=True) 

1400 

1401 # download data 

1402 import gdown 

1403 

1404 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1405 

1406 output = data_folder / (dataset_name + '.tar') 

1407 

1408 gdown.download(url, str(output), quiet=False) 

1409 

1410 output = data_folder / (dataset_name + '.tar') 

1411 unpack_file(file=output, 

1412 unpack_to=data_folder, 

1413 mode='tar', keep=False) 

1414 

1415 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1416 

1417 if use_raganato_ALL_as_test_data: 

1418 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled. 

1419 if sample_missing_splits: 

1420 sample_missing_splits = 'only_dev' 

1421 

1422 # generate the test file 

1423 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1424 cut_multisense=cut_multisense) 

1425 else: 

1426 test_file = None 

1427 

1428 train_file = determine_tsv_file(filename='semcor', data_folder=data_folder, cut_multisense=cut_multisense) 

1429 

1430 super(WSD_SEMCOR, self).__init__( 

1431 data_folder=data_folder, 

1432 columns=columns, 

1433 train_file=train_file, 

1434 test_file=test_file, 

1435 in_memory=in_memory, 

1436 document_separator_token='-DOCSTART-', 

1437 column_delimiter='\t', 

1438 autofind_splits=False, 

1439 tag_to_bioes=tag_to_bioes, 

1440 label_name_map=label_name_map, 

1441 banned_sentences=banned_sentences, 

1442 sample_missing_splits=sample_missing_splits, 

1443 ) 

1444 

1445 
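# A sketch of the split behaviour documented above: with
# use_raganato_ALL_as_test_data=True the Raganato ALL data serves as the test
# split, so only a dev split is sampled from SemCor. The helper name is
# hypothetical; constructor defaults and network access are assumed.
def _example_semcor_with_raganato_test():
    corpus = WSD_SEMCOR(
        use_raganato_ALL_as_test_data=True,  # use Raganato ALL as the test split
        sample_missing_splits=True,          # internally narrowed to 'only_dev'
    )
    print(len(corpus.train), len(corpus.dev), len(corpus.test))
    return corpus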

1446class WSD_WORDNET_GLOSS_TAGGED(EntityLinkingCorpus): 

1447 def __init__( 

1448 self, 

1449 base_path: Union[str, Path] = None, 

1450 in_memory: bool = True, 

1451 columns={0: "text", 3: "wn30_key"}, 

1452 tag_to_bioes=None, 

1453 label_name_map: Dict[str, str] = None, 

1454 banned_sentences: List[str] = None, 

1455 sample_missing_splits: bool = True, 

1456 use_raganato_ALL_as_test_data: bool = False, 

1457 ): 

1458 """ 

1459 Initialize the Princeton WordNet Gloss Corpus, as provided in UFSAC (https://github.com/getalp/UFSAC).

1460 When the corpus is first initialized, the complete UFSAC data is downloaded.

1461 """ 

1462 if isinstance(base_path, str):

1463 base_path: Path = Path(base_path) 

1464 

1465 dataset_name = 'wsd_ufsac' 

1466 

1467 # default dataset folder is the cache root 

1468 if not base_path: 

1469 base_path = flair.cache_root / "datasets" 

1470 data_folder = base_path / dataset_name 

1471 original_data_folder = data_folder / 'original_data' 

1472 

1473 # We check if the UFSAC data has already been downloaded. If not, we download it.

1474 # Note that this downloads more datasets than just WordNet Gloss Tagged, but the download is only around 190 MB (around 4.5 GB unpacked).

1475 if not original_data_folder.exists(): 

1476 # create folder 

1477 data_folder.mkdir(parents=True) 

1478 

1479 # download data 

1480 import gdown 

1481 

1482 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1483 

1484 output = data_folder / (dataset_name + '.tar') 

1485 

1486 gdown.download(url, str(output), quiet=False) 

1487 

1489 unpack_file(file=output, 

1490 unpack_to=data_folder, 

1491 mode='tar', keep=False) 

1492 

1493 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1494 

1495 if use_raganato_ALL_as_test_data: 

1496 # In this case no test split should be sampled from the training data, but if sample_missing_splits is True, a dev split will still be sampled.

1497 if sample_missing_splits: 

1498 sample_missing_splits = 'only_dev' 

1499 

1500 # generate the test file 

1501 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True) 

1502 else: 

1503 test_file = None 

1504 

1505 train_file = determine_tsv_file(filename='wngt', data_folder=data_folder, 

1506 cut_multisense=False) # does not have multisense! 

1507 

1508 super(WSD_WORDNET_GLOSS_TAGGED, self).__init__( 

1509 data_folder=data_folder, 

1510 columns=columns, 

1511 train_file=train_file, 

1512 test_file=test_file, 

1513 in_memory=in_memory, 

1514 document_separator_token='-DOCSTART-', 

1515 column_delimiter='\t', 

1516 autofind_splits=False, 

1517 tag_to_bioes=tag_to_bioes, 

1518 label_name_map=label_name_map, 

1519 banned_sentences=banned_sentences, 

1520 sample_missing_splits=sample_missing_splits, 

1521 ) 

1522 

1523 
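# The download-and-unpack block above is repeated verbatim in every UFSAC
# corpus class in this module. A sketch of how it could be factored into a
# single helper; the function name is hypothetical and simply mirrors the
# existing code, it is not part of the module's API.
def _download_ufsac_if_missing(data_folder: Path, dataset_name: str = 'wsd_ufsac') -> Path:
    original_data_folder = data_folder / 'original_data'
    if not original_data_folder.exists():
        data_folder.mkdir(parents=True)
        import gdown
        url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
        output = data_folder / (dataset_name + '.tar')
        gdown.download(url, str(output), quiet=False)
        unpack_file(file=output, unpack_to=data_folder, mode='tar', keep=False)
        os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
    return original_data_folder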

1524class WSD_MASC(EntityLinkingCorpus): 

1525 def __init__( 

1526 self, 

1527 base_path: Union[str, Path] = None, 

1528 in_memory: bool = True, 

1529 columns={0: "text", 3: "wn30_key"}, 

1530 tag_to_bioes=None, 

1531 label_name_map: Dict[str, str] = None, 

1532 banned_sentences: List[str] = None, 

1533 sample_missing_splits: bool = True, 

1534 cut_multisense: bool = True, 

1535 use_raganato_ALL_as_test_data: bool = False, 

1536 ): 

1537 """ 

1538 Initialize MASC (Manually Annotated Sub-Corpus), as provided in UFSAC (https://github.com/getalp/UFSAC).

1539 When the corpus is first initialized, the complete UFSAC data is downloaded.

1540 """ 

1541 if isinstance(base_path, str):

1542 base_path: Path = Path(base_path) 

1543 

1544 dataset_name = 'wsd_ufsac' 

1545 

1546 # default dataset folder is the cache root 

1547 if not base_path: 

1548 base_path = flair.cache_root / "datasets" 

1549 data_folder = base_path / dataset_name 

1550 original_data_folder = data_folder / 'original_data' 

1551 

1552 # We check if the UFSAC data has already been downloaded. If not, we download it.

1553 # Note that this downloads more datasets than just MASC, but the download is only around 190 MB (around 4.5 GB unpacked).

1554 if not original_data_folder.exists(): 

1555 # create folder 

1556 data_folder.mkdir(parents=True) 

1557 

1558 # download data 

1559 import gdown 

1560 

1561 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1562 

1563 output = data_folder / (dataset_name + '.tar') 

1564 

1565 gdown.download(url, str(output), quiet=False) 

1566 

1568 unpack_file(file=output, 

1569 unpack_to=data_folder, 

1570 mode='tar', keep=False) 

1571 

1572 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1573 

1574 if use_raganato_ALL_as_test_data: 

1575 # In this case no test split should be sampled from the training data, but if sample_missing_splits is True, a dev split will still be sampled.

1576 if sample_missing_splits: 

1577 sample_missing_splits = 'only_dev' 

1578 

1579 # generate the test file 

1580 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1581 cut_multisense=cut_multisense) 

1582 else: 

1583 test_file = None 

1584 

1585 train_file = determine_tsv_file(filename='masc', data_folder=data_folder, cut_multisense=cut_multisense) 

1586 

1587 super(WSD_MASC, self).__init__( 

1588 data_folder=data_folder, 

1589 columns=columns, 

1590 train_file=train_file, 

1591 test_file=test_file, 

1592 in_memory=in_memory, 

1593 document_separator_token='-DOCSTART-', 

1594 column_delimiter='\t', 

1595 autofind_splits=False, 

1596 tag_to_bioes=tag_to_bioes, 

1597 label_name_map=label_name_map, 

1598 banned_sentences=banned_sentences, 

1599 sample_missing_splits=sample_missing_splits, 

1600 ) 

1601 

1602 
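# A sketch of turning the 'wn30_key' annotations of the corpus above into an
# ID dictionary via make_entity_dict, which WSD_MASC inherits from
# EntityLinkingCorpus. The threshold value is an arbitrary illustration and
# the helper name is hypothetical.
def _example_masc_sense_dictionary():
    corpus = WSD_MASC()
    sense_dict = corpus.make_entity_dict(label_type='wn30_key', threshold=2)  # keep keys seen at least twice
    print(len(sense_dict))  # size of the resulting sense dictionary
    return sense_dict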

1603class WSD_OMSTI(EntityLinkingCorpus): 

1604 def __init__( 

1605 self, 

1606 base_path: Union[str, Path] = None, 

1607 in_memory: bool = True, 

1608 columns={0: "text", 3: "wn30_key"}, 

1609 tag_to_bioes=None, 

1610 label_name_map: Dict[str, str] = None, 

1611 banned_sentences: List[str] = None, 

1612 sample_missing_splits: bool = True, 

1613 cut_multisense: bool = True, 

1614 use_raganato_ALL_as_test_data: bool = False, 

1615 ): 

1616 """ 

1617 Initialize OMSTI (One Million Sense-Tagged Instances), as provided in UFSAC (https://github.com/getalp/UFSAC).

1618 When the corpus is first initialized, the complete UFSAC data is downloaded.

1619 """ 

1620 if isinstance(base_path, str):

1621 base_path: Path = Path(base_path) 

1622 

1623 dataset_name = 'wsd_ufsac' 

1624 

1625 # default dataset folder is the cache root 

1626 if not base_path: 

1627 base_path = flair.cache_root / "datasets" 

1628 data_folder = base_path / dataset_name 

1629 original_data_folder = data_folder / 'original_data' 

1630 

1631 # We check if the UFSAC data has already been downloaded. If not, we download it.

1632 # Note that this downloads more datasets than just OMSTI, but the download is only around 190 MB (around 4.5 GB unpacked).

1633 if not original_data_folder.exists(): 

1634 # create folder 

1635 data_folder.mkdir(parents=True) 

1636 

1637 # download data 

1638 import gdown 

1639 

1640 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1641 

1642 output = data_folder / (dataset_name + '.tar') 

1643 

1644 gdown.download(url, str(output), quiet=False) 

1645 

1647 unpack_file(file=output, 

1648 unpack_to=data_folder, 

1649 mode='tar', keep=False) 

1650 

1651 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1652 

1653 if use_raganato_ALL_as_test_data: 

1654 # In this case no test split should be sampled from the training data, but if sample_missing_splits is True, a dev split will still be sampled.

1655 if sample_missing_splits: 

1656 sample_missing_splits = 'only_dev' 

1657 

1658 # generate the test file 

1659 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, 

1660 cut_multisense=cut_multisense) 

1661 else: 

1662 test_file = None 

1663 

1664 train_file = determine_tsv_file(filename='omsti', data_folder=data_folder, cut_multisense=cut_multisense) 

1665 

1666 super(WSD_OMSTI, self).__init__( 

1667 data_folder=data_folder, 

1668 columns=columns, 

1669 train_file=train_file, 

1670 test_file=test_file, 

1671 in_memory=in_memory, 

1672 document_separator_token='-DOCSTART-', 

1673 column_delimiter='\t', 

1674 autofind_splits=False, 

1675 tag_to_bioes=tag_to_bioes, 

1676 label_name_map=label_name_map, 

1677 banned_sentences=banned_sentences, 

1678 sample_missing_splits=sample_missing_splits, 

1679 ) 

1680 

1681 
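# An illustration of what the cut_multisense option is meant to address: some
# UFSAC annotations attach several WordNet sense keys to a single word. The
# ';' separator and this helper are assumptions for illustration only; they do
# not reproduce determine_tsv_file's actual implementation.
def _example_cut_multisense(annotation: str) -> str:
    # e.g. 'apple%1:13:00::;apple%1:20:00::' -> 'apple%1:13:00::'
    return annotation.split(';')[0]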

1682class WSD_TRAINOMATIC(EntityLinkingCorpus): 

1683 def __init__( 

1684 self, 

1685 base_path: Union[str, Path] = None, 

1686 in_memory: bool = True, 

1687 columns={0: "text", 3: "wn30_key"}, 

1688 tag_to_bioes=None, 

1689 label_name_map: Dict[str, str] = None, 

1690 banned_sentences: List[str] = None, 

1691 sample_missing_splits: bool = True, 

1692 use_raganato_ALL_as_test_data: bool = False, 

1693 ): 

1694 """ 

1695 Initialize the Train-O-Matic corpus, as provided in UFSAC (https://github.com/getalp/UFSAC).

1696 When the corpus is first initialized, the complete UFSAC data is downloaded.

1697 """ 

1698 if isinstance(base_path, str):

1699 base_path: Path = Path(base_path) 

1700 

1701 dataset_name = 'wsd_ufsac' 

1702 

1703 # default dataset folder is the cache root 

1704 if not base_path: 

1705 base_path = flair.cache_root / "datasets" 

1706 data_folder = base_path / dataset_name 

1707 original_data_folder = data_folder / 'original_data' 

1708 

1709 # We check if the UFSAC data has already been downloaded. If not, we download it.

1710 # Note that this downloads more datasets than just Train-O-Matic, but the download is only around 190 MB (around 4.5 GB unpacked).

1711 if not original_data_folder.exists(): 

1712 # create folder 

1713 data_folder.mkdir(parents=True) 

1714 

1715 # download data 

1716 import gdown 

1717 

1718 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO' 

1719 

1720 output = data_folder / (dataset_name + '.tar') 

1721 

1722 gdown.download(url, str(output), quiet=False) 

1723 

1725 unpack_file(file=output, 

1726 unpack_to=data_folder, 

1727 mode='tar', keep=False) 

1728 

1729 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder) 

1730 

1731 if use_raganato_ALL_as_test_data: 

1732 # In this case no test split should be sampled from the training data, but if sample_missing_splits is True, a dev split will still be sampled.

1733 if sample_missing_splits: 

1734 sample_missing_splits = 'only_dev' 

1735 

1736 # generate the test file 

1737 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True) 

1738 else: 

1739 test_file = None 

1740 

1741 train_file = determine_tsv_file(filename='trainomatic', data_folder=data_folder, 

1742 cut_multisense=False) # no multisenses 

1743 

1744 super(WSD_TRAINOMATIC, self).__init__( 

1745 data_folder=data_folder, 

1746 columns=columns, 

1747 train_file=train_file, 

1748 test_file=test_file, 

1749 in_memory=in_memory, 

1750 document_separator_token='-DOCSTART-', 

1751 column_delimiter='\t', 

1752 autofind_splits=False, 

1753 tag_to_bioes=tag_to_bioes, 

1754 label_name_map=label_name_map, 

1755 banned_sentences=banned_sentences, 

1756 sample_missing_splits=sample_missing_splits, 

1757 )
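
# A sketch of combining several of the UFSAC corpora defined in this module
# into one training corpus via flair's MultiCorpus. The particular selection
# and the use of Raganato ALL as a shared test set are illustrative choices,
# not a recommendation from the original authors.
def _example_combined_wsd_corpus():
    from flair.data import MultiCorpus
    corpora = [
        WSD_SEMCOR(use_raganato_ALL_as_test_data=True),
        WSD_WORDNET_GLOSS_TAGGED(),
        WSD_TRAINOMATIC(),
    ]
    return MultiCorpus(corpora)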