Coverage for flair/flair/datasets/entity_linking.py: 7%
1import csv
2import logging
3import os
4from pathlib import Path
5from typing import Union, List, Dict
7import requests
9import flair
10from flair.data import Dictionary, Sentence, MultiCorpus
11from flair.datasets import ColumnCorpus
12from flair.file_utils import cached_path, unpack_file
13from flair.tokenization import SentenceSplitter, SegtokSentenceSplitter
15log = logging.getLogger("flair")
18class EntityLinkingCorpus(ColumnCorpus):
19 def __init__(
20 self,
21 data_folder,
22 train_file,
23 columns={0: "text", 1: "nel"},
24 column_delimiter="\t",
25 in_memory=True,
26 document_separator_token='-DOCSTART-',
27 **corpusargs,
28 ):
29 """
30 Super class for all entity linking corpora. Expects the data to be in column format with one column for words and another one for the BIO tag combined with
31 the Wikipedia page name, e.g. B-Brad_Pitt.
32 The class provides the function make_entity_dict to create an entity dictionary suited for entity linking.
33 """
34 # TODO: Add a routine, that checks annotations for some widespread errors/inconsistencies??? (e.g. in AQUAINT corpus Iran-Iraq_War vs. Iran-Iraq_war)
36 super(EntityLinkingCorpus, self).__init__(
37 data_folder,
38 columns,
39 train_file=train_file,
40 column_delimiter=column_delimiter,
41 in_memory=in_memory,
42 document_separator_token=document_separator_token,
43 **corpusargs,
44 )
46 def make_entity_dict(self, label_type='nel', threshold: int = 1) -> Dictionary:
47 """
48 Create ID-dictionary for the wikipedia-page names.
49 param threshold: Ignore links that occur less than threshold value
51 In entity_occurences all wikinames and their number of occurence is saved.
52 ent_dictionary contains all wikinames that occure at least threshold times and gives each name an ID
53 """
54 self.threshold = threshold
55 self.entity_occurences = {}
56 self.total_number_of_entity_mentions = 0
58 for sentence in self.get_all_sentences():
59 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences
61 spans = sentence.get_spans(label_type)
62 for span in spans:
63 annotation = span.tag
64 self.total_number_of_entity_mentions += 1
65 if annotation in self.entity_occurences:
66 self.entity_occurences[annotation] += 1
67 else:
68 self.entity_occurences[annotation] = 1
70 self.number_of_entities = len(self.entity_occurences)
72 # Create the annotation dictionary
73 self.ent_dictionary: Dictionary = Dictionary(add_unk=True)
75 for x in self.entity_occurences:
76 if self.entity_occurences[x] >= threshold:
77 self.ent_dictionary.add_item(x)
79 return self.ent_dictionary
81 # this function removes every second unknown label
82 def remove_unknowns(self):
83 remove = True
84 for sentence in self.get_all_sentences():
85 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences
87 spans = sentence.get_spans('nel')
88 for span in spans:
89 annotation = span.tag
90 if self.ent_dictionary.get_idx_for_item(annotation) == 0: # unknown label
91 if remove:
92 for token in span:
93 token.remove_labels('nel')
94 remove = False
95 else:
96 remove = True
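
# A minimal usage sketch (not part of the original module): build the entity dictionary for an
# EntityLinkingCorpus subclass and inspect the statistics that make_entity_dict collects.
# The corpus class and the 'nel' label type come from this file; the threshold of 3 is an
# arbitrary example value.
def _example_build_entity_dictionary():
    corpus = NEL_ENGLISH_AQUAINT()  # defined further down in this module
    entity_dict = corpus.make_entity_dict(label_type='nel', threshold=3)
    print(corpus.total_number_of_entity_mentions)   # all linked mentions in the corpus
    print(corpus.number_of_entities)                # distinct Wikipedia page names
    print(len(entity_dict.get_items()))             # names occurring at least 3 times (plus <unk>)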
99class NEL_ENGLISH_AQUAINT(EntityLinkingCorpus):
100 def __init__(
101 self,
102 base_path: Union[str, Path] = None,
103 in_memory: bool = True,
104 agreement_threshold: float = 0.5,
105 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
106 **corpusargs,
107 ):
108 """
109 Initialize Aquaint Entity Linking corpus introduced in: D. Milne and I. H. Witten.
110 Learning to link with wikipedia
111 (https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf).
112 The first time you call the constructor, the dataset is automatically downloaded and transformed into
113 tab-separated column format (aquaint.txt).
115 Parameters
116 ----------
117 base_path : Union[str, Path], optional
118 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
119 to point to a different folder but typically this should not be necessary.
120 in_memory: If True, keeps dataset in memory giving speedups in training.
121 agreement_threshold: Some link annotations come with an agreement_score representing the agreement between the human annotators. The score ranges from 0.2 (lowest)
122 to 1.0 (highest). The lower the score, the less "important" the entity is, because fewer annotators thought it was worth linking.
123 Default is 0.5, which means the majority of annotators must have annotated the respective entity mention.
124 """
125 if type(base_path) == str:
126 base_path: Path = Path(base_path)
128 self.agreement_threshold = agreement_threshold
130 # this dataset name
131 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__
133 # default dataset folder is the cache root
134 if not base_path:
135 base_path = flair.cache_root / "datasets"
136 data_folder = base_path / dataset_name
138 aquaint_el_path = "https://www.nzdl.org/wikification/data/wikifiedStories.zip"
139 corpus_file_name = "aquaint.txt"
140 parsed_dataset = data_folder / corpus_file_name
142 # download and parse data if necessary
143 if not parsed_dataset.exists():
144 aquaint_el_zip = cached_path(f"{aquaint_el_path}", Path("datasets") / dataset_name)
145 unpack_file(aquaint_el_zip, data_folder, "zip", False)
147 try:
148 with open(parsed_dataset, "w", encoding='utf-8') as txt_out:
150 # iterate over all html files
151 for file in os.listdir(data_folder):
153 if not file.endswith(".htm"):
154 continue
156 with open(str(data_folder / file), "r", encoding='utf-8') as txt_in:
157 text = txt_in.read()
159 # get rid of html syntax, we only need the text
160 strings = text.split("<p> ")
161 strings[0] = strings[0].split('<h1 id="header">')[1][:-7]
163 for i in range(1, len(strings) - 1):
164 strings[i] = strings[i][:-7]
166 strings[-1] = strings[-1][:-23]
168 # between all documents we write a separator symbol
169 txt_out.write('-DOCSTART-\n\n')
171 for string in strings:
173 # skip empty strings
174 if not string: continue
176 # process the annotation format in the text and collect triples (begin_mention, length_mention, wikiname)
177 indices = []
178 lengths = []
179 wikinames = []
181 current_entity = string.find('[[') # each annotation starts with '[['
182 while current_entity != -1:
183 wikiname = ''
184 surface_form = ''
185 j = current_entity + 2
187 while string[j] not in [']', '|']:
188 wikiname += string[j]
189 j += 1
191 if string[j] == ']': # entity mention ends, i.e. looks like this [[wikiname]]
192 surface_form = wikiname # in this case entity mention = wiki-page name
193 else: # string[j] == '|'
194 j += 1
195 while string[j] not in [']', '|']:
196 surface_form += string[j]
197 j += 1
199 if string[j] == '|':  # entity has a score, i.e. looks like this [[wikiname|surface_form|agreement_score]]
201 agreement_score = float(string[j + 1:j + 4])
202 j += 4 # points to first ']' of entity now
203 if agreement_score < self.agreement_threshold: # discard entity
204 string = string[:current_entity] + surface_form + string[j + 2:]
205 current_entity = string.find('[[')
206 continue
208 # replace [[wikiname|surface_form|score]] by surface_form and save index, length and wikiname of mention
209 indices.append(current_entity)
210 lengths.append(len(surface_form))
211 wikinames.append(wikiname[0].upper() + wikiname.replace(' ', '_')[1:])
213 string = string[:current_entity] + surface_form + string[j + 2:]
215 current_entity = string.find('[[')
217 # sentence splitting and tokenization
218 sentences = sentence_splitter.split(string)
219 sentence_offsets = [sentence.start_pos for sentence in sentences]
221 # iterate through all annotations and add to corresponding tokens
222 for mention_start, mention_length, wikiname in zip(indices, lengths, wikinames):
224 # find sentence to which annotation belongs
225 sentence_index = 0
226 for i in range(1, len(sentences)):
227 if mention_start < sentence_offsets[i]:
228 break
229 else:
230 sentence_index += 1
232 # position within corresponding sentence
233 mention_start -= sentence_offsets[sentence_index]
234 mention_end = mention_start + mention_length
236 # set annotation for tokens of entity mention
237 first = True
238 for token in sentences[sentence_index].tokens:
239 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention
240 if first:
241 token.set_label(typename='nel', value='B-' + wikiname)
242 first = False
243 else:
244 token.set_label(typename='nel', value='I-' + wikiname)
246 # write to out-file in column format
247 for sentence in sentences:
249 for token in sentence.tokens:
251 labels = token.get_labels('nel')
253 if len(labels) == 0: # no entity
254 txt_out.write(token.text + '\tO\n')
256 else: # annotation
257 txt_out.write(token.text + '\t' + labels[0].value + '\n')
259 txt_out.write('\n') # empty line after each sentence
261 except:
262 # in case something goes wrong, delete the dataset and raise error
263 os.remove(parsed_dataset)
264 raise
266 super(NEL_ENGLISH_AQUAINT, self).__init__(
267 data_folder,
268 train_file=corpus_file_name,
269 in_memory=in_memory,
270 **corpusargs,
271 )
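
# The parsing loop above maps character-offset annotations onto the split sentences and their
# tokens. The helper below is a small standalone sketch of that alignment step (assuming a
# flair Sentence produced by a SentenceSplitter and 'nel' as the label type); it mirrors the
# logic used in __init__ and is meant only as an illustration.
def _tag_mention_in_sentence(sentence: Sentence, mention_start: int, mention_end: int, wikiname: str):
    """Assign B-/I- 'nel' labels to all tokens covered by the character span [mention_start, mention_end)."""
    first = True
    for token in sentence.tokens:
        if token.start_pos >= mention_start and token.end_pos <= mention_end:  # token lies inside the mention
            token.set_label(typename='nel', value=('B-' if first else 'I-') + wikiname)
            first = False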
274class NEL_GERMAN_HIPE(EntityLinkingCorpus):
275 def __init__(
276 self,
277 base_path: Union[str, Path] = None,
278 in_memory: bool = True,
279 wiki_language: str = 'dewiki',
280 **corpusargs
281 ):
282 """
283 Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German (see description
284 of HIPE at https://impresso.github.io/CLEF-HIPE-2020/). This version was segmented by @stefan-it and is hosted
285 at https://github.com/stefan-it/clef-hipe.
286 The first time you call the constructor, the dataset is automatically downloaded and transformed into
287 tab-separated column format.
289 Parameters
290 ----------
291 base_path : Union[str, Path], optional
292 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
293 to point to a different folder but typically this should not be necessary.
294 in_memory: If True, keeps dataset in memory giving speedups in training.
295 wiki_language : specifies the language of the Wikipedia page names, i.e. which language version of
296 Wikipedia to link against. Since the text is in German, the default is 'dewiki'.
297 """
298 self.wiki_language = wiki_language
299 if type(base_path) == str:
300 base_path: Path = Path(base_path)
302 # this dataset name
303 dataset_name = self.__class__.__name__.lower()
305 # default dataset folder is the cache root
306 if not base_path:
307 base_path = flair.cache_root / "datasets"
308 data_folder = base_path / dataset_name
310 dev_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/dev-v1.2/de/HIPE-data-v1.2-dev-de-normalized-manual-eos.tsv"
311 test_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/test-v1.3/de/HIPE-data-v1.3-test-de-normalized-manual-eos.tsv"
312 train_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/training-v1.2/de/HIPE-data-v1.2-train-de-normalized-manual-eos.tsv"
313 train_file_name = wiki_language + "_train.tsv"
314 parsed_dataset = data_folder / train_file_name
316 # download and parse data if necessary
317 if not parsed_dataset.exists():
319 # from qwikidata.linked_data_interface import get_entity_dict_from_api
321 original_train_path = cached_path(f"{train_raw_url}", Path("datasets") / dataset_name)
322 original_test_path = cached_path(f"{test_raw_url}", Path("datasets") / dataset_name)
323 original_dev_path = cached_path(f"{dev_raw_url}", Path("datasets") / dataset_name)
325 # generate qid wikiname dictionaries
326 log.info('Get wikinames from wikidata...')
327 train_dict = self._get_qid_wikiname_dict(path=original_train_path)
328 test_dict = self._get_qid_wikiname_dict(original_test_path)
329 dev_dict = self._get_qid_wikiname_dict(original_dev_path)
330 log.info('...done!')
332 # merge dictionaries
333 qid_wikiname_dict = {**train_dict, **test_dict, **dev_dict}
335 for doc_path, file_name in zip([original_train_path, original_test_path, original_dev_path],
336 [train_file_name, wiki_language + '_test.tsv', wiki_language + '_dev.tsv']):
337 with open(doc_path, 'r', encoding='utf-8') as read, open(data_folder / file_name, 'w',
338 encoding='utf-8') as write:
340 # ignore first line
341 read.readline()
342 line = read.readline()
343 last_eos = True
345 while line:
346 # commented and empty lines
347 if line[0] == '#' or line == '\n':
348 if line[2:13] == 'document_id': # beginning of new document
350 if last_eos:
351 write.write('-DOCSTART-\n\n')
352 last_eos = False
353 else:
354 write.write('\n-DOCSTART-\n\n')
356 else:
357 line_list = line.split('\t')
358 if not line_list[7] in ['_', 'NIL']: # line has wikidata link
360 wikiname = qid_wikiname_dict[line_list[7]]
362 if wikiname != 'O':
363 annotation = line_list[1][:2] + wikiname
364 else: # no entry in chosen language
365 annotation = 'O'
367 else:
369 annotation = 'O'
371 write.write(line_list[0] + '\t' + annotation + '\n')
373 if line_list[-1][-4:-1] == 'EOS': # end of sentence
374 write.write('\n')
375 last_eos = True
376 else:
377 last_eos = False
379 line = read.readline()
381 super(NEL_GERMAN_HIPE, self).__init__(
382 data_folder,
383 train_file=train_file_name,
384 dev_file=wiki_language + '_dev.tsv',
385 test_file=wiki_language + '_test.tsv',
386 in_memory=in_memory,
387 **corpusargs,
388 )
390 def _get_qid_wikiname_dict(self, path):
392 qid_set = set()
393 with open(path, mode='r', encoding='utf-8') as read:
394 # read all Q-IDs
396 # ignore first line
397 read.readline()
398 line = read.readline()
400 while line:
402 if not (line[0] == '#' or line == '\n'): # commented or empty lines
403 line_list = line.split('\t')
404 if not line_list[7] in ['_', 'NIL']: # line has wikidata link
406 qid_set.add(line_list[7])
408 line = read.readline()
410 base_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&sitefilter=' + self.wiki_language + '&ids='
412 qid_list = list(qid_set)
413 ids = ''
414 length = len(qid_list)
415 qid_wikiname_dict = {}
416 for i in range(length):
417 if (i + 1) % 50 == 0 or i == length - 1:  # there is a limit to the number of ids in one request in the wikidata api
420 ids += qid_list[i]
421 # request
422 response_json = requests.get(base_url + ids).json()
424 for qid in response_json['entities']:
426 try:
427 wikiname = response_json['entities'][qid]['sitelinks'][self.wiki_language]['title'].replace(' ', '_')
429 except KeyError:  # language not available for this Wikidata item
430 wikiname = 'O'
432 qid_wikiname_dict[qid] = wikiname
434 ids = ''
436 else:
437 ids += qid_list[i]
438 ids += '|'
440 return qid_wikiname_dict
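
# _get_qid_wikiname_dict above resolves Wikidata Q-IDs to Wikipedia page titles in batches,
# because the wbgetentities endpoint only accepts a limited number of ids per request. The
# function below is a compact standalone sketch of the same batching pattern (the chunk size
# of 50 and the 'dewiki' site filter follow the values used above; error handling is minimal).
def _resolve_qids_to_wikinames(qids: List[str], wiki_language: str = 'dewiki') -> Dict[str, str]:
    base_url = ('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json'
                '&props=sitelinks&sitefilter=' + wiki_language + '&ids=')
    qid_wikiname_dict: Dict[str, str] = {}
    for start in range(0, len(qids), 50):  # at most 50 ids per request
        chunk = qids[start:start + 50]
        response_json = requests.get(base_url + '|'.join(chunk)).json()
        for qid, entity in response_json.get('entities', {}).items():
            try:
                qid_wikiname_dict[qid] = entity['sitelinks'][wiki_language]['title'].replace(' ', '_')
            except KeyError:  # no sitelink in the chosen language
                qid_wikiname_dict[qid] = 'O'
    return qid_wikiname_dict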
443class NEL_ENGLISH_AIDA(EntityLinkingCorpus):
444 def __init__(
445 self,
446 base_path: Union[str, Path] = None,
447 in_memory: bool = True,
448 check_existence: bool = False,
449 **corpusargs
450 ):
451 """
452 Initialize AIDA CoNLL-YAGO Entity Linking corpus introduced here https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads.
453 License: https://creativecommons.org/licenses/by-sa/3.0/deed.en_US
454 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
456 Parameters
457 ----------
458 base_path : Union[str, Path], optional
459 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
460 to point to a different folder but typically this should not be necessary.
461 in_memory: If True, keeps dataset in memory giving speedups in training.
462 check_existence: If True, the existence of the given Wikipedia ids/page names is checked and non-existent ids/names will be ignored.
463 """
464 if type(base_path) == str:
465 base_path: Path = Path(base_path)
467 # this dataset name
468 dataset_name = self.__class__.__name__.lower()
470 # default dataset folder is the cache root
471 if not base_path:
472 base_path = flair.cache_root / "datasets"
473 data_folder = base_path / dataset_name
475 conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/"
476 corpus_file_name = "train"
477 parsed_dataset = data_folder / corpus_file_name
479 if not parsed_dataset.exists():
481 import wikipediaapi
483 wiki_wiki = wikipediaapi.Wikipedia(language='en')
485 testa_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testa", Path("datasets") / dataset_name)
486 testb_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testb", Path("datasets") / dataset_name)
487 train_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_train", Path("datasets") / dataset_name)
489 # we use the wikiids in the data instead of directly utilizing the wikipedia urls.
490 # like this we can quickly check if the corresponding page exists
491 wikiid_wikiname_dict = self._get_wikiid_wikiname_dict(data_folder)
493 for name, path in zip(['train', 'testa', 'testb'],
494 [train_unprocessed_path, testa_unprocessed_path, testb_unprocessed_path]):
495 with open(data_folder / name, 'w', encoding='utf-8') as write, open(path, 'r',
496 encoding='utf-8') as read:
498 for line in read:
500 line_list = line.split('\t')
501 if len(line_list) <= 4:
502 if line_list[0][:10] == '-DOCSTART-': # Docstart
503 write.write('-DOCSTART-\n\n')
504 elif line_list[0] == '\n': # empty line
505 write.write('\n')
506 else: # text without annotation or marked '--NME--' (no matching entity)
507 if len(line_list) == 1:
508 write.write(line_list[0][:-1] + '\tO\n')
509 else:
510 write.write(line_list[0] + '\tO\n')
511 else: # line with annotation
512 wikiname = wikiid_wikiname_dict[line_list[5].strip()]
513 if wikiname != 'O':
514 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n')
515 else:
516 # if there is a bad wikiid we can check if the given url in the data exists using wikipediaapi
517 wikiname = line_list[4].split('/')[-1]
518 if check_existence:
519 page = wiki_wiki.page(wikiname)
520 if page.exists():
521 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n')
522 else: # neither the wikiid nor the url exist
523 write.write(line_list[0] + '\tO\n')
524 else:
525 write.write(line_list[0] + '\t' + line_list[4] + '-' + wikiname + '\n')
527 # delete unprocessed file
528 os.remove(path)
530 super(NEL_ENGLISH_AIDA, self).__init__(
531 data_folder,
532 train_file=corpus_file_name,
533 dev_file='testa',
534 test_file='testb',
535 in_memory=in_memory,
536 **corpusargs,
537 )
539 def _get_wikiid_wikiname_dict(self, base_folder):
541 # collect all wikiids
542 wikiid_set = set()
543 for data_file in ['aida_conll_testa', 'aida_conll_testb', 'aida_conll_train']:
544 with open(base_folder / data_file, mode='r', encoding='utf-8') as read:
545 line = read.readline()
546 while line:
547 row = line.split('\t')
548 if len(row) > 4: # line has a wiki annotation
549 wikiid_set.add(row[5].strip())
550 line = read.readline()
552 # create the dictionary
553 wikiid_wikiname_dict = {}
554 wikiid_list = list(wikiid_set)
555 ids = ''
556 length = len(wikiid_list)
558 for i in range(length):
559 if (i + 1) % 50 == 0 or i == length - 1:  # there is a limit to the number of ids in one request in the wikimedia api
562 ids += wikiid_list[i]
563 # request
564 resp = requests.get(
565 'https://en.wikipedia.org/w/api.php',
566 params={
567 'action': 'query',
568 'prop': 'info',
569 'pageids': ids,
570 'format': 'json'
571 }
572 ).json()
574 for wikiid in resp['query']['pages']:
575 try:
576 wikiname = resp['query']['pages'][wikiid]['title'].replace(' ', '_')
577 except KeyError: # bad wikiid
578 wikiname = 'O'
579 wikiid_wikiname_dict[wikiid] = wikiname
580 ids = ''
582 else:
583 ids += wikiid_list[i]
584 ids += '|'
586 return wikiid_wikiname_dict
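
# When a wiki id cannot be resolved, the constructor above optionally falls back to checking
# whether the page name taken from the URL exists, using the wikipediaapi package (the same
# call pattern as in __init__). A minimal sketch of that existence check:
def _wikipedia_page_exists(page_name: str) -> bool:
    import wikipediaapi
    wiki_wiki = wikipediaapi.Wikipedia(language='en')
    return wiki_wiki.page(page_name).exists()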
589class NEL_ENGLISH_IITB(EntityLinkingCorpus):
590 def __init__(
591 self,
592 base_path: Union[str, Path] = None,
593 in_memory: bool = True,
594 ignore_disagreements: bool = False,
595 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
596 **corpusargs
597 ):
598 """
599 Initialize the IITB Entity Linking corpus introduced in "Collective Annotation of Wikipedia Entities in Web Text" by Sayali Kulkarni, Amit Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti.
600 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
602 Parameters
603 ----------
604 base_path : Union[str, Path], optional
605 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
606 to point to a different folder but typically this should not be necessary.
607 in_memory: If True, keeps dataset in memory giving speedups in training.
608 ignore_disagreements: If True, annotations with annotator disagreement will be ignored.
609 """
610 if type(base_path) == str:
611 base_path: Path = Path(base_path)
613 # this dataset name
614 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__
616 # default dataset folder is the cache root
617 if not base_path:
618 base_path = flair.cache_root / "datasets"
619 data_folder = base_path / dataset_name
621 iitb_el_docs_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_crawledDocs.tar.gz"
622 iitb_el_annotations_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_Annotations.xml"
623 corpus_file_name = "iitb.txt"
624 parsed_dataset = data_folder / corpus_file_name
626 label_type = 'nel'
628 if not parsed_dataset.exists():
630 docs_zip_path = cached_path(f"{iitb_el_docs_path}", Path("datasets") / dataset_name)
631 annotations_xml_path = cached_path(f"{iitb_el_annotations_path}", Path("datasets") / dataset_name)
633 unpack_file(docs_zip_path, data_folder, "tar", False)
635 import xml.etree.ElementTree as ET
636 tree = ET.parse(annotations_xml_path)
637 root = tree.getroot()
639 # names of raw text documents
640 doc_names = set()
641 for elem in root:
642 doc_names.add(elem[0].text)
644 # open output_file
645 with open(parsed_dataset, 'w', encoding='utf-8') as write:
646 # iterate through all documents
647 for doc_name in doc_names:
648 with open(data_folder / 'crawledDocs' / doc_name, 'r', encoding='utf-8') as read:
649 text = read.read()
651 # split sentences and tokenize
652 sentences = sentence_splitter.split(text)
653 sentence_offsets = [sentence.start_pos for sentence in sentences]
655 # iterate through all annotations and add to corresponding tokens
656 for elem in root:
658 if elem[0].text == doc_name and elem[2].text: # annotation belongs to current document
660 wikiname = elem[2].text.replace(' ', '_')
661 mention_start = int(elem[3].text)
662 mention_length = int(elem[4].text)
664 # find sentence to which annotation belongs
665 sentence_index = 0
666 for i in range(1, len(sentences)):
667 if mention_start < sentence_offsets[i]:
668 break
669 else:
670 sentence_index += 1
672 # position within corresponding sentence
673 mention_start -= sentence_offsets[sentence_index]
674 mention_end = mention_start + mention_length
676 # set annotation for tokens of entity mention
677 first = True
678 for token in sentences[sentence_index].tokens:
679 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention
680 if first:
681 token.set_label(typename=elem[1].text, value='B-' + wikiname)
682 first = False
683 else:
684 token.set_label(typename=elem[1].text, value='I-' + wikiname)
686 # write to out file
687 write.write('-DOCSTART-\n\n') # each file is one document
689 for sentence in sentences:
691 for token in sentence.tokens:
693 labels = token.labels
695 if len(labels) == 0: # no entity
696 write.write(token.text + '\tO\n')
698 elif len(labels) == 1: # annotation from one annotator
699 write.write(token.text + '\t' + labels[0].value + '\n')
701 else: # annotations from two annotators
703 if labels[0].value == labels[1].value: # annotators agree
704 write.write(token.text + '\t' + labels[0].value + '\n')
706 else: # annotators disagree: ignore or arbitrarily take first annotation
708 if ignore_disagreements:
709 write.write(token.text + '\tO\n')
711 else:
712 write.write(token.text + '\t' + labels[0].value + '\n')
714 write.write('\n') # empty line after each sentence
716 super(NEL_ENGLISH_IITB, self).__init__(
717 data_folder,
718 train_file=corpus_file_name,
719 in_memory=in_memory,
720 **corpusargs,
721 )
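
# A minimal usage sketch (illustrative only): load the IITB corpus while ignoring mentions on
# which the two annotators disagree, and print the linked spans of the first few training
# sentences. The 'nel' label type comes from the EntityLinkingCorpus column defaults used above.
def _example_load_iitb():
    corpus = NEL_ENGLISH_IITB(ignore_disagreements=True)
    for i in range(10):
        for span in corpus.train[i].get_spans('nel'):
            print(span, span.tag)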
724class NEL_ENGLISH_TWEEKI(EntityLinkingCorpus):
725 def __init__(
726 self,
727 base_path: Union[str, Path] = None,
728 in_memory: bool = True,
729 **corpusargs,
730 ):
731 """
732 Initialize the Tweeki Entity Linking corpus introduced in "Tweeki: Linking Named Entities on Twitter to a Knowledge Graph" by Harandizadeh and Singh.
733 The data consists of tweets with manually annotated Wikipedia links.
734 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
736 Parameters
737 ----------
738 base_path : Union[str, Path], optional
739 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
740 to point to a different folder but typically this should not be necessary.
741 in_memory: If True, keeps dataset in memory giving speedups in training.
742 """
743 if type(base_path) == str:
744 base_path: Path = Path(base_path)
746 # this dataset name
747 dataset_name = self.__class__.__name__.lower()
749 # default dataset folder is the cache root
750 if not base_path:
751 base_path = flair.cache_root / "datasets"
752 data_folder = base_path / dataset_name
754 tweeki_gold_el_path = "https://raw.githubusercontent.com/ucinlp/tweeki/main/data/Tweeki_gold/Tweeki_gold"
755 corpus_file_name = "tweeki_gold.txt"
756 parsed_dataset = data_folder / corpus_file_name
758 # download and parse data if necessary
759 if not parsed_dataset.exists():
761 original_file_path = cached_path(f"{tweeki_gold_el_path}", Path("datasets") / dataset_name)
763 with open(original_file_path, 'r', encoding='utf-8') as read, open(parsed_dataset, 'w',
764 encoding='utf-8') as write:
765 line = read.readline()
766 while line:
767 if line.startswith('#'):
768 out_line = ''
769 elif line == '\n': # tweet ends
770 out_line = '\n-DOCSTART-\n\n'
771 else:
772 line_list = line.split('\t')
773 out_line = line_list[1] + '\t'
774 if line_list[3] == '-\n': # no wiki name
775 out_line += 'O\n'
776 else:
777 out_line += line_list[2][:2] + line_list[3].split('|')[0].replace(' ', '_') + '\n'
778 write.write(out_line)
779 line = read.readline()
781 os.rename(original_file_path, str(original_file_path) + '_original')
783 super(NEL_ENGLISH_TWEEKI, self).__init__(
784 data_folder,
785 train_file=corpus_file_name,
786 in_memory=in_memory,
787 **corpusargs,
788 )
791class NEL_ENGLISH_REDDIT(EntityLinkingCorpus):
792 def __init__(
793 self,
794 base_path: Union[str, Path] = None,
795 in_memory: bool = True,
796 **corpusargs,
797 ):
798 """
799 Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format.
800 The first time you call this constructor it will automatically download the dataset.
801 :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
802 to point to a different folder but typically this should not be necessary.
803 :param in_memory: If True, keeps dataset in memory giving speedups in training.
805 """
806 if type(base_path) == str:
807 base_path: Path = Path(base_path)
809 # this dataset name
810 dataset_name = self.__class__.__name__.lower()
812 # default dataset folder is the cache root
813 if not base_path:
814 base_path = flair.cache_root / "datasets"
815 data_folder = base_path / dataset_name
817 # download and parse data if necessary
818 reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip"
819 corpus_file_name = "reddit_el_gold.txt"
820 parsed_dataset = data_folder / corpus_file_name
822 if not parsed_dataset.exists():
823 reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name)
824 unpack_file(reddit_el_zip, data_folder, "zip", False)
826 with open(data_folder / corpus_file_name, "w", encoding='utf-8') as txtout:
828 # First parse the post titles
829 with open(data_folder / "posts.tsv", "r", encoding='utf-8') as tsvin1, open(
830 data_folder / "gold_post_annotations.tsv", "r", encoding='utf-8') as tsvin2:
832 posts = csv.reader(tsvin1, delimiter="\t")
833 self.post_annotations = csv.reader(tsvin2, delimiter="\t")
834 self.curr_annot = next(self.post_annotations)
836 for row in posts: # Go through all the post titles
838 txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token
840 # Keep track of how many and which entity mentions a given post title has
841 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention
843 # Check if the current post title has an entity link and parse accordingly
844 if row[0] == self.curr_annot[0]:
846 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3]))
847 link_annots = self._fill_annot_array(link_annots, row[0], post_flag=True)
849 # Post titles with entity mentions (if any) are handled via this function
850 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout)
851 else:
852 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout)
854 # Then parse the comments
855 with open(data_folder / "comments.tsv", "r", encoding='utf-8') as tsvin3, open(
856 data_folder / "gold_comment_annotations.tsv", "r", encoding='utf-8') as tsvin4:
858 self.comments = csv.reader(tsvin3, delimiter="\t")
859 self.comment_annotations = csv.reader(tsvin4, delimiter="\t")
860 self.curr_annot = next(self.comment_annotations)
861 self.curr_row = next(self.comments)
862 self.stop_iter = False
864 # Iterate over the comments.tsv file, until the end is reached
865 while not self.stop_iter:
867 txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token
869 # Keep track of the current comment thread and its corresponding key, on which the annotations are matched.
870 # Each comment thread is handled as one 'document'.
871 self.curr_comm = self.curr_row[4]
872 comm_key = self.curr_row[0]
874 # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file.
875 # This if-condition is needed to handle this problem.
876 if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}:
877 if comm_key == "en5rf4c":
878 self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n"))
879 self.curr_comm = next(self.parsed_row)
880 self._fill_curr_comment(fix_flag=True)
881 # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure
882 else:
883 self._fill_curr_comment(fix_flag=False)
885 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention
887 # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above
888 if comm_key == self.curr_annot[0]:
889 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3]))
890 link_annots = self._fill_annot_array(link_annots, comm_key, post_flag=False)
891 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout)
892 else:
893 # In two of the comment threads a case of capital letter spacing occurs, which the SegtokTokenizer cannot properly handle.
894 # The following if-elif condition handles these two cases and as a result writes fully capitalized words into each corresponding row,
895 # and not just single letters into single rows.
896 if comm_key == "dv74ybb":
897 self.curr_comm = " ".join(
898 [word.replace(" ", "") for word in self.curr_comm.split("  ")])
899 elif comm_key == "eci2lut":
900 self.curr_comm = (self.curr_comm[:18] + self.curr_comm[18:27].replace(" ", "") +
901 self.curr_comm[27:55] + self.curr_comm[55:68].replace(" ", "") +
902 self.curr_comm[68:85] + self.curr_comm[85:92].replace(" ", "") +
903 self.curr_comm[92:])
909 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout)
911 super(NEL_ENGLISH_REDDIT, self).__init__(
912 data_folder,
913 train_file=corpus_file_name,
914 in_memory=in_memory,
915 **corpusargs,
916 )
918 def _text_to_cols(self, sentence: Sentence, links: list, outfile):
919 """
920 Convert a tokenized sentence into column format
921 :param sentence: Flair Sentence object containing a tokenized post title or comment thread
922 :param links: array containing information about the starting and ending position of an entity mention, as well
923 as its corresponding wiki tag
924 :param outfile: file, to which the output is written
925 """
926 for i in range(0, len(sentence)):
927 # If there are annotated entity mentions for given post title or a comment thread
928 if links:
929 # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence
930 link_index = [j for j, v in enumerate(links) if
931 (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])]
932 # Write the token with a corresponding tag to file
933 try:
934 if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j, v in enumerate(links)):
935 outfile.writelines(sentence[i].text + "\tS-" + links[link_index[0]][2] + "\n")
936 elif any(
937 sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j, v in enumerate(links)):
938 outfile.writelines(sentence[i].text + "\tB-" + links[link_index[0]][2] + "\n")
939 elif any(
940 sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j, v in enumerate(links)):
941 outfile.writelines(sentence[i].text + "\tI-" + links[link_index[0]][2] + "\n")
942 else:
943 outfile.writelines(sentence[i].text + "\tO\n")
944 # IndexError is raised in cases when there is exactly one link in a sentence, therefore can be dismissed
945 except IndexError:
946 pass
948 # If a comment thread or a post title has no entity link, all tokens are assigned the O tag
949 else:
950 outfile.writelines(sentence[i].text + "\tO\n")
952 # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized
953 # incorrectly, in order to keep the desired format (empty line as a sentence separator).
954 try:
955 if ((sentence[i].text in {".", "!", "?", "!*"}) and
956 (sentence[i + 1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and
957 ("." not in sentence[i - 1].text)):
958 outfile.writelines("\n")
959 except IndexError:
960 # Thrown when the second check above happens, but the last token of a sentence is reached.
961 # Indicates that the EOS punctuation mark is present, therefore an empty line needs to be written below.
962 outfile.writelines("\n")
964 # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS
965 if sentence[-1].text not in {".", "!", "?"}:
966 outfile.writelines("\n")
968 def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list:
969 """
970 Fills the array containing information about the entity mention annotations, used in the _text_to_cols method
971 :param annot_array: array to be filled
972 :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation
973 :param post_flag: flag indicating whether the annotations are collected for the post titles (=True)
974 or comment threads (=False)
975 """
976 next_annot = None
977 while True:
978 # Check if further annotations belong to the current post title or comment thread as well
979 try:
980 next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations)
981 if next_annot[0] == key:
982 annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3]))
983 else:
984 self.curr_annot = next_annot
985 break
986 # Stop when the end of an annotation file is reached
987 except StopIteration:
988 break
989 return annot_array
991 def _fill_curr_comment(self, fix_flag: bool):
992 """
993 Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the
994 comments are parsed.
995 :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True)
996 or regular rows (=False)
997 """
998 next_row = None
999 while True:
1000 # Check if further annotations belong to the current sentence as well
1001 try:
1002 next_row = next(self.comments) if not fix_flag else next(self.parsed_row)
1003 if len(next_row) < 2:
1004 # 'else " "' is needed to keep the proper token positions (for accordance with annotations)
1005 self.curr_comm += next_row[0] if any(next_row) else " "
1006 else:
1007 self.curr_row = next_row
1008 break
1009 except StopIteration: # When the end of the comments.tsv file is reached
1010 self.curr_row = next_row
1011 self.stop_iter = True if not fix_flag else False
1012 break
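
# _text_to_cols above chooses an NER-style prefix from the way a token overlaps an annotated
# character span: a token covering the whole mention gets "S-", the first token of a longer
# mention "B-", every further covered token "I-", and anything else "O". A small standalone
# sketch of that decision (character offsets, as in the (start, end, wiki_title) tuples above):
def _prefix_for_token(token_start: int, token_end: int, span_start: int, span_end: int) -> str:
    if token_start == span_start and token_end == span_end:
        return 'S-'
    if token_start == span_start and token_end != span_end:
        return 'B-'
    if token_start >= span_start and token_end <= span_end:
        return 'I-'
    return 'O'  # token is not part of the annotated span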
1015def from_ufsac_to_tsv(xml_file: Union[str, Path], conll_file: Union[str, Path], datasetname: str,
1016 encoding: str = "utf8",
1017 cut_multisense: bool = True):
1018 """
1019 Function that converts the UFSAC format into tab separated column format in a new file.
1020 Parameters
1021 ----------
1022 xml_file : Union[str, Path]
1023 Path to the xml file.
1024 conll_file : Union[str, Path]
1025 Path for the new conll file.
1026 datasetname: str
1027 Name of the dataset from UFSAC, needed because of different handling of multi-word-spans in the datasets
1028 encoding : str, optional
1029 Encoding used in open function. The default is "utf8".
1030 cut_multisense : bool, optional
1031 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses.
1032 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected
1033 as one new sense. The default is True.
1035 """
1037 def make_line(word, begin_or_inside, attributes):
1038 """
1039 Function that creates an output line from a word.
1040 Parameters
1041 ----------
1042 word :
1043 String of the actual word.
1044 begin_or_inside:
1045 Either 'B-' or 'I-'
1046 attributes:
1047 List of attributes of the word (pos, lemma, wn30_key)
1048 """
1049 line = word
1050 if cut_multisense:
1051 attributes[-1] = attributes[-1].split(';')[0] # take only first sense
1053 for attrib in attributes:
1054 if attrib != 'O':
1055 line = line + '\t' + begin_or_inside + attrib
1056 else:
1057 line = line + '\tO'
1058 line += '\n'
1060 return line
1062 def split_span(word_fields: List[str], datasetname: str):
1063 """
1064 Function that splits a word if necessary, i.e. if it is a multiple-word-span.
1065 Parameters
1066 ----------
1067 word_fields :
1068 list ['surface_form', 'lemma', 'pos', 'wn30_key'] of a word
1069 datasetname:
1070 name of corresponding dataset
1071 """
1073 span = word_fields[0]
1075 if datasetname in ['trainomatic', 'masc']: # splitting not sensible for these datasets
1076 return [span]
1077 elif datasetname == 'omsti':
1078 if word_fields[3] != 'O' and span != '_' and '__' not in span:  # has annotation and does not consist only of '_' (still not 100% clean)
1080 return span.split('_')
1081 else:
1082 return [span]
1083 else: # for all other datasets splitting at '_' is always sensible
1084 return span.split('_')
1086 txt_out = open(file=conll_file, mode='w', encoding=encoding)
1087 import xml.etree.ElementTree as ET
1088 tree = ET.parse(xml_file)
1089 corpus = tree.getroot()
1091 number_of_docs = len(corpus.findall('document'))
1093 fields = ['surface_form', 'lemma', 'pos', 'wn30_key']
1094 for document in corpus:
1095 # Docstart
1096 if number_of_docs > 1:
1097 txt_out.write('-DOCSTART-\n\n')
1099 for paragraph in document:
1101 for sentence in paragraph:
1103 for word in sentence:
1105 dictionary = word.attrib
1106 fields_of_word = [word.attrib[field] if (field in dictionary) else 'O' for field in fields]
1108 chunks = split_span(fields_of_word, datasetname)
1110 txt_out.write(make_line(chunks[0], 'B-', fields_of_word[1:]))
1112 # if there is more than one word in the chunk we write each in a separate line
1113 for chunk in chunks[1:]:
1114 # print(chunks)
1115 txt_out.write(make_line(chunk, 'I-', fields_of_word[1:]))
1117 # empty line after each sentence
1118 txt_out.write('\n')
1120 txt_out.close()
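
# A minimal sketch of calling the converter above on a single UFSAC xml file (the paths follow
# the layout used by determine_tsv_file below and are illustrative). Each word is written as
#   surface_form<TAB>B-<lemma><TAB>B-<pos><TAB>B-<wn30_key>
# with follow-up chunks of a multi-word span prefixed by 'I-', an empty line after every
# sentence, and a '-DOCSTART-' line before each document when the file contains more than one.
def _example_convert_semcor():
    data_folder = flair.cache_root / "datasets" / "wsd_ufsac"
    from_ufsac_to_tsv(xml_file=data_folder / 'original_data' / 'semcor.xml',
                      conll_file=data_folder / 'semcor_cut.tsv',
                      datasetname='semcor',
                      cut_multisense=True)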
1123def determine_tsv_file(filename: str, data_folder: str, cut_multisense: bool = True):
1124 """
1125 Checks whether the converted .tsv file already exists and, if not, creates it. Returns the name of the file.
1126 ----------
1127 filename : str
1128 String that contains the name of the dataset file (without extension).
1129 data_folder : str
1130 String that contains the name of the folder in which the converted CoNLL file should reside.
1131 cut_multisense : bool, optional
1132 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses.
1133 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected
1134 as one new sense. The default is True.
1135 """
1137 if cut_multisense is True and filename not in ['semeval2007task17', 'trainomatic',
1138 'wngt']: # these three datasets do not have multiple senses
1140 conll_file_name = filename + '_cut.tsv'
1142 else:
1144 conll_file_name = filename + '.tsv'
1146 path_to_conll_file = data_folder / conll_file_name
1148 if not path_to_conll_file.exists():
1149 # convert the file to CoNLL
1151 from_ufsac_to_tsv(xml_file=Path(data_folder / 'original_data' / (filename + '.xml')),
1152 conll_file=Path(data_folder / conll_file_name),
1153 datasetname=filename,
1154 cut_multisense=cut_multisense)
1156 return conll_file_name
1159class WSD_UFSAC(MultiCorpus):
1160 def __init__(
1161 self,
1162 filenames: Union[str, List[str]] = ['masc', 'semcor'],
1163 base_path: Union[str, Path] = None,
1164 in_memory: bool = True,
1165 cut_multisense: bool = True,
1166 columns={0: "text", 3: "wn30_key"},
1167 tag_to_bioes=None,
1168 banned_sentences: List[str] = None,
1169 sample_missing_splits_in_multicorpus: bool = True,
1170 sample_missing_splits_in_each_corpus: bool = True,
1171 use_raganato_ALL_as_test_data: bool = False,
1172 name: str = 'multicorpus'
1173 ):
1174 """
1175 Initialize a custom corpus with any Word Sense Disambiguation (WSD) datasets in the UFSAC format from https://github.com/getalp/UFSAC.
1176 When the constructor is called for the first time, the data is automatically downloaded and transformed from XML into tab-separated column format.
1177 Since only the WordNet 3.0 version of the senses is consistently available for all provided datasets, we only consider this version.
1178 Also, we ignore the id annotations used in datasets that were originally created for evaluation tasks.
1179 :param filenames: Here you can pass a single dataset name or a list of dataset names. The available names are:
1180 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3',
1181 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test',
1182 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'.
1183 So you can pass for example filenames = ['masc', 'omsti', 'wngt']. By default, the two mid-sized datasets 'masc' and 'semcor' are loaded.
1184 :param base_path: You can override this to point to a specific folder but typically this should not be necessary.
1185 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1187 :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains
1188 multiple possible senses. If True only the first listed sense will be used and the
1189 suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of
1190 senses will be detected as one new sense. The default is True.
1191 :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "wn30_key"}
1192 if you want to use additional pos and/or lemma for the words.
1193 :param tag_to_bioes: whether to convert to BIOES tagging scheme
1194 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true
1195 :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if
1196 sample_missing_splits_in_each_corpus is True)
1197 :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames.
1198 :param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al., "Word Sense Disambiguation: A Unified Evaluation Framework and Empirical Comparison")
1199 will be used as test data. Note that in this case the sample_missing_splits parameters are set to 'only_dev' if they were set to True.
1200 :param name: Name of your (custom) corpus
1201 """
1202 if type(base_path) == str:
1203 base_path: Path = Path(base_path)
1205 # this dataset name
1206 dataset_name = self.__class__.__name__.lower()
1208 # default dataset folder is the cache root
1209 if not base_path:
1210 base_path = flair.cache_root / "datasets"
1211 data_folder = base_path / dataset_name
1212 original_data_folder = data_folder / 'original_data'
1214 # check if data there, if not, download the data
1215 if not original_data_folder.exists():
1216 # create folder
1217 data_folder.mkdir(parents=True)
1219 # download data
1220 import gdown
1222 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1224 output = data_folder / (dataset_name + '.tar')
1226 gdown.download(url, str(output), quiet=False)
1228 output = data_folder / (dataset_name + '.tar')
1229 unpack_file(file=output,
1230 unpack_to=data_folder,
1231 mode='tar', keep=False)
1233 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1235 # transform data into column format if necessary
1237 # if no filenames are specified we use all the data
1238 if not filenames:
1239 filenames = [name[:-4] for name in os.listdir(original_data_folder) if not 'raganato' in name]
1241 if type(filenames) == str:
1242 filenames = [filenames]
1244 corpora = []
1246 print('Transforming data into column format and creating corpora...')
1248 if use_raganato_ALL_as_test_data:
1249 # in this case no test data should be generated by sampling from train data. But if the sample arguments are set to true, the dev set will be sampled
1250 if sample_missing_splits_in_each_corpus:
1251 sample_missing_splits_in_each_corpus = 'only_dev'
1252 if sample_missing_splits_in_multicorpus:
1253 sample_missing_splits_in_multicorpus = 'only_dev'
1255 # also we remove 'raganato_ALL' from filenames in case it's in the list
1256 if 'raganato_ALL' in filenames:
1257 filenames.remove('raganato_ALL')
1259 # generate the test file
1260 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1261 cut_multisense=cut_multisense)
1263 corpus = ColumnCorpus(data_folder=data_folder,
1264 column_format=columns,
1265 test_file=test_file, # corpus only has test data
1266 in_memory=in_memory,
1267 tag_to_bioes=tag_to_bioes,
1268 column_delimiter='\t',
1269 document_separator_token='-DOCSTART-',
1270 banned_sentences=banned_sentences,
1271 autofind_splits=False,
1272 sample_missing_splits=sample_missing_splits_in_each_corpus,
1273 )
1274 corpora.append(corpus)
1276 for filename in filenames:
1277 # make column file and save to data_folder
1279 new_filename = determine_tsv_file(filename=filename, data_folder=data_folder, cut_multisense=cut_multisense)
1281 corpus = ColumnCorpus(data_folder=data_folder,
1282 column_format=columns,
1283 train_file=new_filename,
1284 in_memory=in_memory,
1285 tag_to_bioes=tag_to_bioes,
1286 column_delimiter='\t',
1287 document_separator_token='-DOCSTART-',
1288 banned_sentences=banned_sentences,
1289 autofind_splits=False,
1290 sample_missing_splits=sample_missing_splits_in_each_corpus,
1291 )
1292 corpora.append(corpus)
1293 print('...done!')
1295 super(WSD_UFSAC, self).__init__(
1296 corpora,
1297 sample_missing_splits=sample_missing_splits_in_multicorpus,
1298 name=name
1299 )
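
# A minimal usage sketch (illustrative only): combine two UFSAC datasets into one MultiCorpus
# and use the raganato_ALL concatenation as a fixed test split, as described in the docstring
# above. The filenames and the keyword argument come from the constructor signature.
def _example_load_wsd_ufsac():
    corpus = WSD_UFSAC(filenames=['masc', 'semcor'], use_raganato_ALL_as_test_data=True)
    print(corpus)  # prints the number of train/dev/test sentences of the combined corpus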
1302class WSD_RAGANATO_ALL(EntityLinkingCorpus):
1303 def __init__(
1304 self,
1305 base_path: Union[str, Path] = None,
1306 in_memory: bool = True,
1307 columns={0: "text", 3: "wn30_key"},
1308 tag_to_bioes=None,
1309 label_name_map: Dict[str, str] = None,
1310 banned_sentences: List[str] = None,
1311 sample_missing_splits: bool = True,
1312 cut_multisense: bool = True
1313 ):
1314 """
1315 Initialize raganato_ALL (the concatenation of all SensEval and SemEval all-words tasks) provided in UFSAC https://github.com/getalp/UFSAC
1316 When first initializing the corpus the whole UFSAC data is downloaded.
1317 """
1318 if type(base_path) == str:
1319 base_path: Path = Path(base_path)
1321 dataset_name = 'wsd_ufsac'
1323 # default dataset folder is the cache root
1324 if not base_path:
1325 base_path = flair.cache_root / "datasets"
1326 data_folder = base_path / dataset_name
1327 original_data_folder = data_folder / 'original_data'
1329 # We check if the UFSAC data has already been downloaded. If not, we download it.
1330 # Note that this downloads more datasets than just raganato_ALL. But the size of the download is only around 190 MB (around 4.5 GB unpacked)
1331 if not original_data_folder.exists():
1332 # create folder
1333 data_folder.mkdir(parents=True)
1335 # download data
1336 import gdown
1338 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1340 output = data_folder / (dataset_name + '.tar')
1342 gdown.download(url, str(output), quiet=False)
1344 output = data_folder / (dataset_name + '.tar')
1345 unpack_file(file=output,
1346 unpack_to=data_folder,
1347 mode='tar', keep=False)
1349 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1351 train_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=cut_multisense)
1353 super(WSD_RAGANATO_ALL, self).__init__(
1354 data_folder=data_folder,
1355 columns=columns,
1356 train_file=train_file,
1357 in_memory=in_memory,
1358 document_separator_token='-DOCSTART-',
1359 column_delimiter='\t',
1360 autofind_splits=False,
1361 tag_to_bioes=tag_to_bioes,
1362 label_name_map=label_name_map,
1363 banned_sentences=banned_sentences,
1364 sample_missing_splits=sample_missing_splits,
1365 )
1368class WSD_SEMCOR(EntityLinkingCorpus):
1369 def __init__(
1370 self,
1371 base_path: Union[str, Path] = None,
1372 in_memory: bool = True,
1373 columns={0: "text", 3: "wn30_key"},
1374 tag_to_bioes=None,
1375 label_name_map: Dict[str, str] = None,
1376 banned_sentences: List[str] = None,
1377 sample_missing_splits: bool = True,
1378 cut_multisense: bool = True,
1379 use_raganato_ALL_as_test_data: bool = False,
1380 ):
1381 """
1382 Initialize SemCor provided in UFSAC https://github.com/getalp/UFSAC
1383 When first initializing the corpus the whole UFSAC data is downloaded.
1384 """
1385 if type(base_path) == str:
1386 base_path: Path = Path(base_path)
1388 dataset_name = 'wsd_ufsac'
1390 # default dataset folder is the cache root
1391 if not base_path:
1392 base_path = flair.cache_root / "datasets"
1393 data_folder = base_path / dataset_name
1394 original_data_folder = data_folder / 'original_data'
1396 # We check if the UFSAC data has already been downloaded. If not, we download it.
1397 # Note that this downloads more datasets than just SemCor. But the size of the download is only around 190 MB (around 4.5 GB unpacked)
1398 if not original_data_folder.exists():
1399 # create folder
1400 data_folder.mkdir(parents=True)
1402 # download data
1403 import gdown
1405 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1407 output = data_folder / (dataset_name + '.tar')
1409 gdown.download(url, str(output), quiet=False)
1411 output = data_folder / (dataset_name + '.tar')
1412 unpack_file(file=output,
1413 unpack_to=data_folder,
1414 mode='tar', keep=False)
1416 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1418 if use_raganato_ALL_as_test_data:
1419 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled.
1420 if sample_missing_splits:
1421 sample_missing_splits = 'only_dev'
1423 # generate the test file
1424 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1425 cut_multisense=cut_multisense)
1426 else:
1427 test_file = None
1429 train_file = determine_tsv_file(filename='semcor', data_folder=data_folder, cut_multisense=cut_multisense)
1431 super(WSD_SEMCOR, self).__init__(
1432 data_folder=data_folder,
1433 columns=columns,
1434 train_file=train_file,
1435 test_file=test_file,
1436 in_memory=in_memory,
1437 document_separator_token='-DOCSTART-',
1438 column_delimiter='\t',
1439 autofind_splits=False,
1440 tag_to_bioes=tag_to_bioes,
1441 label_name_map=label_name_map,
1442 banned_sentences=banned_sentences,
1443 sample_missing_splits=sample_missing_splits,
1444 )
1447class WSD_WORDNET_GLOSS_TAGGED(EntityLinkingCorpus):
1448 def __init__(
1449 self,
1450 base_path: Union[str, Path] = None,
1451 in_memory: bool = True,
1452 columns={0: "text", 3: "wn30_key"},
1453 tag_to_bioes=None,
1454 label_name_map: Dict[str, str] = None,
1455 banned_sentences: List[str] = None,
1456 sample_missing_splits: bool = True,
1457 use_raganato_ALL_as_test_data: bool = False,
1458 ):
1459 """
1460 Initialize Princeton WordNet Gloss Corpus provided in UFSAC https://github.com/getalp/UFSAC
1461 When first initializing the corpus the whole UFSAC data is downloaded.
1462 """
1463 if type(base_path) == str:
1464 base_path: Path = Path(base_path)
1466 dataset_name = 'wsd_ufsac'
1468 # default dataset folder is the cache root
1469 if not base_path:
1470 base_path = flair.cache_root / "datasets"
1471 data_folder = base_path / dataset_name
1472 original_data_folder = data_folder / 'original_data'
1474 # We check if the UFSAC data has already been downloaded. If not, we download it.
1475 # Note that this downloads more datasets than just WordNet Gloss Tagged. But the size of the download is only around 190 MB (around 4.5 GB unpacked)
1476 if not original_data_folder.exists():
1477 # create folder
1478 data_folder.mkdir(parents=True)
1480 # download data
1481 import gdown
1483 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1485 output = data_folder / (dataset_name + '.tar')
1487 gdown.download(url, str(output), quiet=False)
1489 output = data_folder / (dataset_name + '.tar')
1490 unpack_file(file=output,
1491 unpack_to=data_folder,
1492 mode='tar', keep=False)
1494 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1496 if use_raganato_ALL_as_test_data:
1497 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled.
1498 if sample_missing_splits:
1499 sample_missing_splits = 'only_dev'
1501 # generate the test file
1502 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True)
1503 else:
1504 test_file = None
1506 train_file = determine_tsv_file(filename='wngt', data_folder=data_folder,
1507 cut_multisense=False) # does not have multisense!
1509 super(WSD_WORDNET_GLOSS_TAGGED, self).__init__(
1510 data_folder=data_folder,
1511 columns=columns,
1512 train_file=train_file,
1513 test_file=test_file,
1514 in_memory=in_memory,
1515 document_separator_token='-DOCSTART-',
1516 column_delimiter='\t',
1517 autofind_splits=False,
1518 tag_to_bioes=tag_to_bioes,
1519 label_name_map=label_name_map,
1520 banned_sentences=banned_sentences,
1521 sample_missing_splits=sample_missing_splits,
1522 )
1525class WSD_MASC(EntityLinkingCorpus):
1526 def __init__(
1527 self,
1528 base_path: Union[str, Path] = None,
1529 in_memory: bool = True,
1530 columns={0: "text", 3: "wn30_key"},
1531 tag_to_bioes=None,
1532 label_name_map: Dict[str, str] = None,
1533 banned_sentences: List[str] = None,
1534 sample_missing_splits: bool = True,
1535 cut_multisense: bool = True,
1536 use_raganato_ALL_as_test_data: bool = False,
1537 ):
1538 """
1539 Initialize MASC (Manually Annotated Sub-Corpus) as provided in UFSAC (https://github.com/getalp/UFSAC).
1540 On first initialization, the complete UFSAC data is downloaded.
1541 """
1542 if isinstance(base_path, str):
1543 base_path: Path = Path(base_path)
1545 dataset_name = 'wsd_ufsac'
1547 # default dataset folder is the cache root
1548 if not base_path:
1549 base_path = flair.cache_root / "datasets"
1550 data_folder = base_path / dataset_name
1551 original_data_folder = data_folder / 'original_data'
1553 # We check whether the UFSAC data has already been downloaded. If not, we download it.
1554 # Note that this downloads more datasets than just MASC, but the archive is only around 190 MB (around 4.5 GB unpacked).
1555 if not original_data_folder.exists():
1556 # create folder
1557 data_folder.mkdir(parents=True)
1559 # download data
1560 import gdown
1562 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1564 output = data_folder / (dataset_name + '.tar')
1566 gdown.download(url, str(output), quiet=False)
1569 unpack_file(file=output,
1570 unpack_to=data_folder,
1571 mode='tar', keep=False)
1573 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1575 if use_raganato_ALL_as_test_data:
1576 # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split is sampled
1577 if sample_missing_splits:
1578 sample_missing_splits = 'only_dev'
1580 # generate the test file
1581 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1582 cut_multisense=cut_multisense)
1583 else:
1584 test_file = None
1586 train_file = determine_tsv_file(filename='masc', data_folder=data_folder, cut_multisense=cut_multisense)
1588 super(WSD_MASC, self).__init__(
1589 data_folder=data_folder,
1590 columns=columns,
1591 train_file=train_file,
1592 test_file=test_file,
1593 in_memory=in_memory,
1594 document_separator_token='-DOCSTART-',
1595 column_delimiter='\t',
1596 autofind_splits=False,
1597 tag_to_bioes=tag_to_bioes,
1598 label_name_map=label_name_map,
1599 banned_sentences=banned_sentences,
1600 sample_missing_splits=sample_missing_splits,
1601 )
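A sketch of loading MASC with the raganato_ALL benchmark as the test split, again assuming the class is re-exported through flair.datasets; the split sizes printed at the end rely on the standard train/dev/test attributes of a flair corpus.

from flair.datasets import WSD_MASC

# use the raganato_ALL file as test data; with sample_missing_splits=True only a dev split is sampled from train
corpus = WSD_MASC(cut_multisense=True, use_raganato_ALL_as_test_data=True)

# cut_multisense is forwarded to determine_tsv_file (defined earlier in this module) when the TSV files are generated
print(len(corpus.train), len(corpus.dev), len(corpus.test))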
1604class WSD_OMSTI(EntityLinkingCorpus):
1605 def __init__(
1606 self,
1607 base_path: Union[str, Path] = None,
1608 in_memory: bool = True,
1609 columns={0: "text", 3: "wn30_key"},
1610 tag_to_bioes=None,
1611 label_name_map: Dict[str, str] = None,
1612 banned_sentences: List[str] = None,
1613 sample_missing_splits: bool = True,
1614 cut_multisense: bool = True,
1615 use_raganato_ALL_as_test_data: bool = False,
1616 ):
1617 """
1618 Initialize OMSTI (One Million Sense-Tagged Instances) as provided in UFSAC (https://github.com/getalp/UFSAC).
1619 On first initialization, the complete UFSAC data is downloaded.
1620 """
1621 if isinstance(base_path, str):
1622 base_path: Path = Path(base_path)
1624 dataset_name = 'wsd_ufsac'
1626 # default dataset folder is the cache root
1627 if not base_path:
1628 base_path = flair.cache_root / "datasets"
1629 data_folder = base_path / dataset_name
1630 original_data_folder = data_folder / 'original_data'
1632 # We check whether the UFSAC data has already been downloaded. If not, we download it.
1633 # Note that this downloads more datasets than just OMSTI, but the archive is only around 190 MB (around 4.5 GB unpacked).
1634 if not original_data_folder.exists():
1635 # create folder
1636 data_folder.mkdir(parents=True)
1638 # download data
1639 import gdown
1641 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1643 output = data_folder / (dataset_name + '.tar')
1645 gdown.download(url, str(output), quiet=False)
1648 unpack_file(file=output,
1649 unpack_to=data_folder,
1650 mode='tar', keep=False)
1652 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1654 if use_raganato_ALL_as_test_data:
1655 # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split is sampled
1656 if sample_missing_splits:
1657 sample_missing_splits = 'only_dev'
1659 # generate the test file
1660 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1661 cut_multisense=cut_multisense)
1662 else:
1663 test_file = None
1665 train_file = determine_tsv_file(filename='omsti', data_folder=data_folder, cut_multisense=cut_multisense)
1667 super(WSD_OMSTI, self).__init__(
1668 data_folder=data_folder,
1669 columns=columns,
1670 train_file=train_file,
1671 test_file=test_file,
1672 in_memory=in_memory,
1673 document_separator_token='-DOCSTART-',
1674 column_delimiter='\t',
1675 autofind_splits=False,
1676 tag_to_bioes=tag_to_bioes,
1677 label_name_map=label_name_map,
1678 banned_sentences=banned_sentences,
1679 sample_missing_splits=sample_missing_splits,
1680 )
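Since all UFSAC-based corpora in this file share one download into the wsd_ufsac cache folder, several of them can be loaded together cheaply. A sketch under the same flair.datasets re-export assumption, combining OMSTI with MASC via the MultiCorpus class imported at the top of the module:

from flair.data import MultiCorpus
from flair.datasets import WSD_MASC, WSD_OMSTI

# the UFSAC archive is downloaded only once; both corpora read from the same original_data folder
omsti = WSD_OMSTI()
masc = WSD_MASC()

# train on the union of both sense-tagged corpora
multi_corpus = MultiCorpus([omsti, masc])
print(multi_corpus)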
1683class WSD_TRAINOMATIC(EntityLinkingCorpus):
1684 def __init__(
1685 self,
1686 base_path: Union[str, Path] = None,
1687 in_memory: bool = True,
1688 columns={0: "text", 3: "wn30_key"},
1689 tag_to_bioes=None,
1690 label_name_map: Dict[str, str] = None,
1691 banned_sentences: List[str] = None,
1692 sample_missing_splits: bool = True,
1693 use_raganato_ALL_as_test_data: bool = False,
1694 ):
1695 """
1696 Initialize the Train-O-Matic corpus as provided in UFSAC (https://github.com/getalp/UFSAC).
1697 On first initialization, the complete UFSAC data is downloaded.
1698 """
1699 if isinstance(base_path, str):
1700 base_path: Path = Path(base_path)
1702 dataset_name = 'wsd_ufsac'
1704 # default dataset folder is the cache root
1705 if not base_path:
1706 base_path = flair.cache_root / "datasets"
1707 data_folder = base_path / dataset_name
1708 original_data_folder = data_folder / 'original_data'
1710 # We check whether the UFSAC data has already been downloaded. If not, we download it.
1711 # Note that this downloads more datasets than just Train-O-Matic, but the archive is only around 190 MB (around 4.5 GB unpacked).
1712 if not original_data_folder.exists():
1713 # create folder
1714 data_folder.mkdir(parents=True)
1716 # download data
1717 import gdown
1719 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1721 output = data_folder / (dataset_name + '.tar')
1723 gdown.download(url, str(output), quiet=False)
1726 unpack_file(file=output,
1727 unpack_to=data_folder,
1728 mode='tar', keep=False)
1730 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1732 if use_raganato_ALL_as_test_data:
1733 # in this case, no test split should be sampled from the training data; if sample_missing_splits is True, only the dev split is sampled
1734 if sample_missing_splits:
1735 sample_missing_splits = 'only_dev'
1737 # generate the test file
1738 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True)
1739 else:
1740 test_file = None
1742 train_file = determine_tsv_file(filename='trainomatic', data_folder=data_folder,
1743 cut_multisense=False) # Train-O-Matic contains no multisense annotations
1745 super(WSD_TRAINOMATIC, self).__init__(
1746 data_folder=data_folder,
1747 columns=columns,
1748 train_file=train_file,
1749 test_file=test_file,
1750 in_memory=in_memory,
1751 document_separator_token='-DOCSTART-',
1752 column_delimiter='\t',
1753 autofind_splits=False,
1754 tag_to_bioes=tag_to_bioes,
1755 label_name_map=label_name_map,
1756 banned_sentences=banned_sentences,
1757 sample_missing_splits=sample_missing_splits,
1758 )
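A sketch showing the base_path override, again assuming the flair.datasets re-export; '/tmp/flair_datasets' is only an example directory. With the defaults shown, test_file stays None and both dev and test splits are sampled from the generated train file.

from flair.datasets import WSD_TRAINOMATIC

corpus = WSD_TRAINOMATIC(
    base_path='/tmp/flair_datasets',  # data lands in /tmp/flair_datasets/wsd_ufsac
    sample_missing_splits=True,       # dev and test are sampled from the train split
)
print(corpus)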