Coverage for flair/flair/datasets/entity_linking.py: 7%
1import csv
2import logging
3import os
4from pathlib import Path
5from typing import Union, List, Dict
7import requests
9import flair
10from flair.data import Dictionary, Sentence, MultiCorpus
11from flair.datasets import ColumnCorpus
12from flair.file_utils import cached_path, unpack_file
13from flair.tokenization import SentenceSplitter, SegtokSentenceSplitter
15log = logging.getLogger("flair")
18class EntityLinkingCorpus(ColumnCorpus):
19 def __init__(
20 self,
21 data_folder,
22 train_file,
23 columns={0: "text", 1: "nel"},
24 column_delimiter="\t",
25 in_memory=True,
26 document_separator_token='-DOCSTART-',
27 **corpusargs,
28 ):
29 """
30 Super class for all entity linking corpora. Expects the data to be in column format with one column for words and another one for BIO-tags and wikipedia-page
31 name, e.g. B-Brad_Pitt.
32 The class provides the function make_entity_dict to create an entity dictionary suited for entity linking.
33 """
34 # TODO: Add a routine, that checks annotations for some widespread errors/inconsistencies??? (e.g. in AQUAINT corpus Iran-Iraq_War vs. Iran-Iraq_war)
36 super(EntityLinkingCorpus, self).__init__(
37 data_folder,
38 columns,
39 train_file=train_file,
40 column_delimiter=column_delimiter,
41 in_memory=in_memory,
42 document_separator_token=document_separator_token,
43 **corpusargs,
44 )
46 def make_entity_dict(self, label_type='nel', threshold: int = 1) -> Dictionary:
47 """
48 Create ID-dictionary for the wikipedia-page names.
49 param threshold: Ignore links that occur less than threshold value
51 In entity_occurences all wikinames and their number of occurence is saved.
52 ent_dictionary contains all wikinames that occure at least threshold times and gives each name an ID
53 """
54 self.threshold = threshold
55 self.entity_occurences = {}
56 self.total_number_of_entity_mentions = 0
58 for sentence in self.get_all_sentences():
59 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences
61 spans = sentence.get_spans(label_type)
62 for span in spans:
63 annotation = span.tag
64 self.total_number_of_entity_mentions += 1
65 if annotation in self.entity_occurences:
66 self.entity_occurences[annotation] += 1
67 else:
68 self.entity_occurences[annotation] = 1
70 self.number_of_entities = len(self.entity_occurences)
72 # Create the annotation dictionary
73 self.ent_dictionary: Dictionary = Dictionary(add_unk=True)
75 for x in self.entity_occurences:
76 if self.entity_occurences[x] >= threshold:
77 self.ent_dictionary.add_item(x)
79 return self.ent_dictionary
81 # this function removes every second unknown label (i.e. every second label that is not in ent_dictionary)
82 def remove_unknowns(self):
83 remove = True
84 for sentence in self.get_all_sentences():
85 if not sentence.is_document_boundary: # exclude "-DOCSTART-"-sentences
87 spans = sentence.get_spans('nel')
88 for span in spans:
89 annotation = span.tag
90 if self.ent_dictionary.get_idx_for_item(annotation) == 0: # unknown label
91 if remove:
92 for token in span:
93 token.remove_labels('nel')
94 remove = False
95 else:
96 remove = True
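# Usage sketch (illustrative; the corpus class and threshold below are example choices):
# every corpus defined in this module inherits make_entity_dict, so an entity dictionary
# can be built like this:
#
#     corpus = NEL_ENGLISH_AQUAINT()
#     entity_dictionary = corpus.make_entity_dict(label_type='nel', threshold=2)
#     print(corpus.total_number_of_entity_mentions, len(entity_dictionary))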
99class NEL_ENGLISH_AQUAINT(EntityLinkingCorpus):
100 def __init__(
101 self,
102 base_path: Union[str, Path] = None,
103 in_memory: bool = True,
104 agreement_threshold: float = 0.5,
105 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
106 **corpusargs,
107 ):
108 """
109 Initialize Aquaint Entity Linking corpus introduced in: D. Milne and I. H. Witten.
110 Learning to link with wikipedia
111 (https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf).
112 If you call the constructor the first time the dataset gets automatically downloaded and transformed in
113 tab-separated column format (aquaint.txt).
115 Parameters
116 ----------
117 base_path : Union[str, Path], optional
118 Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
119 to point to a different folder but typically this should not be necessary.
120 in_memory: If True, keeps dataset in memory giving speedups in training.
121 agreement_threshold: Some link annotations come with an agreement_score representing the agreement from the human annotators. The score ranges from lowest 0.2
122 to highest 1.0. The lower the score, the less "important" is the entity because fewer annotators thought it was worth linking.
123 Default is 0.5 which means the majority of annotators must have annoteted the respective entity mention.
124 """
125 if type(base_path) == str:
126 base_path: Path = Path(base_path)
128 self.agreement_threshold = agreement_threshold
130 # this dataset name
131 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__
133 # default dataset folder is the cache root
134 if not base_path:
135 base_path = flair.cache_root / "datasets"
136 data_folder = base_path / dataset_name
138 aquaint_el_path = "https://www.nzdl.org/wikification/data/wikifiedStories.zip"
139 corpus_file_name = "aquaint.txt"
140 parsed_dataset = data_folder / corpus_file_name
142 # download and parse data if necessary
143 if not parsed_dataset.exists():
144 aquaint_el_zip = cached_path(f"{aquaint_el_path}", Path("datasets") / dataset_name)
145 unpack_file(aquaint_el_zip, data_folder, "zip", False)
147 try:
148 with open(parsed_dataset, "w", encoding='utf-8') as txt_out:
150 # iterate over all html files
151 for file in os.listdir(data_folder):
153 if not file.endswith(".htm"):
154 continue
156 with open(str(data_folder / file), "r", encoding='utf-8') as txt_in:
157 text = txt_in.read()
159 # get rid of html syntax, we only need the text
160 strings = text.split("<p> ")
161 strings[0] = strings[0].split('<h1 id="header">')[1][:-7]
163 for i in range(1, len(strings) - 1):
164 strings[i] = strings[i][:-7]
166 strings[-1] = strings[-1][:-23]
168 # between all documents we write a separator symbol
169 txt_out.write('-DOCSTART-\n\n')
171 for string in strings:
173 # skip empty strings
174 if not string: continue
176 # process the annotation format in the text and collect triples (begin_mention, length_mention, wikiname)
177 indices = []
178 lengths = []
179 wikinames = []
181 current_entity = string.find('[[') # each annotation starts with '[['
182 while current_entity != -1:
183 wikiname = ''
184 surface_form = ''
185 j = current_entity + 2
187 while string[j] not in [']', '|']:
188 wikiname += string[j]
189 j += 1
191 if string[j] == ']': # entity mention ends, i.e. looks like this [[wikiname]]
192 surface_form = wikiname # in this case entity mention = wiki-page name
193 else: # string[j] == '|'
194 j += 1
195 while string[j] not in [']', '|']:
196 surface_form += string[j]
197 j += 1
199 if string[j] == '|': # entity has a score, i.e. looks like this [[wikiname|surface_form|agreement_score]]
201 agreement_score = float(string[j + 1:j + 4])
202 j += 4 # points to first ']' of entity now
203 if agreement_score < self.agreement_threshold: # discard entity
204 string = string[:current_entity] + surface_form + string[j + 2:]
205 current_entity = string.find('[[')
206 continue
208 # replace [[wikiname|surface_form|score]] by surface_form and save index, length and wikiname of mention
209 indices.append(current_entity)
210 lengths.append(len(surface_form))
211 wikinames.append(wikiname[0].upper() + wikiname.replace(' ', '_')[1:])
213 string = string[:current_entity] + surface_form + string[j + 2:]
215 current_entity = string.find('[[')
217 # sentence splitting and tokenization
218 sentences = sentence_splitter.split(string)
219 sentence_offsets = [sentence.start_pos for sentence in sentences]
221 # iterate through all annotations and add to corresponding tokens
222 for mention_start, mention_length, wikiname in zip(indices, lengths, wikinames):
224 # find sentence to which annotation belongs
225 sentence_index = 0
226 for i in range(1, len(sentences)):
227 if mention_start < sentence_offsets[i]:
228 break
229 else:
230 sentence_index += 1
232 # position within corresponding sentence
233 mention_start -= sentence_offsets[sentence_index]
234 mention_end = mention_start + mention_length
236 # set annotation for tokens of entity mention
237 first = True
238 for token in sentences[sentence_index].tokens:
239 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention
240 if first:
241 token.set_label(typename='nel', value='B-' + wikiname)
242 first = False
243 else:
244 token.set_label(typename='nel', value='I-' + wikiname)
246 # write to out-file in column format
247 for sentence in sentences:
249 for token in sentence.tokens:
251 labels = token.get_labels('nel')
253 if len(labels) == 0: # no entity
254 txt_out.write(token.text + '\tO\n')
256 else: # annotation
257 txt_out.write(token.text + '\t' + labels[0].value + '\n')
259 txt_out.write('\n') # empty line after each sentence
261 except:
262 # in case something goes wrong, delete the dataset and raise error
263 os.remove(parsed_dataset)
264 raise
266 super(NEL_ENGLISH_AQUAINT, self).__init__(
267 data_folder,
268 train_file=corpus_file_name,
269 in_memory=in_memory,
270 **corpusargs,
271 )
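# Usage sketch (illustrative; assumes the default cache location): keep only mentions
# that at least 80% of the annotators agreed on.
#
#     from flair.tokenization import SegtokSentenceSplitter
#     corpus = NEL_ENGLISH_AQUAINT(agreement_threshold=0.8,
#                                  sentence_splitter=SegtokSentenceSplitter())
#     print(corpus)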
274class NEL_GERMAN_HIPE(EntityLinkingCorpus):
275 def __init__(
276 self,
277 base_path: Union[str, Path] = None,
278 in_memory: bool = True,
279 wiki_language: str = 'dewiki',
280 **corpusargs
281 ):
282 """
283 Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German (see description
284 of HIPE at https://impresso.github.io/CLEF-HIPE-2020/). This version was segmented by @stefan-it and is hosted
285 at https://github.com/stefan-it/clef-hipe.
286 The first time you call the constructor, the dataset is automatically downloaded and transformed into
287 tab-separated column format.
289 Parameters
290 ----------
291 base_path : Union[str, Path], optional
292 Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
293 to point to a different folder, but typically this should not be necessary.
294 in_memory: If True, keeps the dataset in memory, giving speedups in training.
295 wiki_language : specifies the language of the Wikipedia page names, i.e. which language version of
296 Wikipedia to use. Since the text is in German, the default is 'dewiki'.
297 """
298 self.wiki_language = wiki_language
299 if type(base_path) == str:
300 base_path: Path = Path(base_path)
302 # this dataset name
303 dataset_name = self.__class__.__name__.lower()
305 # default dataset folder is the cache root
306 if not base_path:
307 base_path = flair.cache_root / "datasets"
308 data_folder = base_path / dataset_name
310 dev_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/dev-v1.2/de/HIPE-data-v1.2-dev-de-normalized-manual-eos.tsv"
311 test_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/test-v1.3/de/HIPE-data-v1.3-test-de-normalized-manual-eos.tsv"
312 train_raw_url = "https://raw.githubusercontent.com/stefan-it/clef-hipe/main/data/future/training-v1.2/de/HIPE-data-v1.2-train-de-normalized-manual-eos.tsv"
313 train_file_name = wiki_language + "_train.tsv"
314 parsed_dataset = data_folder / train_file_name
316 # download and parse data if necessary
317 if not parsed_dataset.exists():
319 # from qwikidata.linked_data_interface import get_entity_dict_from_api
321 original_train_path = cached_path(f"{train_raw_url}", Path("datasets") / dataset_name)
322 original_test_path = cached_path(f"{test_raw_url}", Path("datasets") / dataset_name)
323 original_dev_path = cached_path(f"{dev_raw_url}", Path("datasets") / dataset_name)
325 # generate qid wikiname dictionaries
326 log.info('Get wikinames from wikidata...')
327 train_dict = self._get_qid_wikiname_dict(path=original_train_path)
328 test_dict = self._get_qid_wikiname_dict(original_test_path)
329 dev_dict = self._get_qid_wikiname_dict(original_dev_path)
330 log.info('...done!')
332 # merge dictionaries
333 qid_wikiname_dict = {**train_dict, **test_dict, **dev_dict}
335 for doc_path, file_name in zip([original_train_path, original_test_path, original_dev_path],
336 [train_file_name, wiki_language + '_test.tsv', wiki_language + '_dev.tsv']):
337 with open(doc_path, 'r', encoding='utf-8') as read, open(data_folder / file_name, 'w', encoding='utf-8') as write:
340 # ignore first line
341 read.readline()
342 line = read.readline()
343 last_eos = True
345 while line:
346 # commented and empty lines
347 if line[0] == '#' or line == '\n':
348 if line[2:13] == 'document_id': # beginning of new document
350 if last_eos:
351 write.write('-DOCSTART-\n\n')
352 last_eos = False
353 else:
354 write.write('\n-DOCSTART-\n\n')
356 else:
357 line_list = line.split('\t')
358 if not line_list[7] in ['_', 'NIL']: # line has wikidata link
360 wikiname = qid_wikiname_dict[line_list[7]]
362 if wikiname != 'O':
363 annotation = line_list[1][:2] + wikiname
364 else: # no entry in chosen language
365 annotation = 'O'
367 else:
369 annotation = 'O'
371 write.write(line_list[0] + '\t' + annotation + '\n')
373 if line_list[-1][-4:-1] == 'EOS': # end of sentence
374 write.write('\n')
375 last_eos = True
376 else:
377 last_eos = False
379 line = read.readline()
381 super(NEL_GERMAN_HIPE, self).__init__(
382 data_folder,
383 train_file=train_file_name,
384 dev_file=wiki_language + '_dev.tsv',
385 test_file=wiki_language + '_test.tsv',
386 in_memory=in_memory,
387 **corpusargs,
388 )
390 def _get_qid_wikiname_dict(self, path):
392 qid_set = set()
393 with open(path, mode='r', encoding='utf-8') as read:
394 # read all Q-IDs
396 # ignore first line
397 read.readline()
398 line = read.readline()
400 while line:
402 if not (line[0] == '#' or line == '\n'): # commented or empty lines
403 line_list = line.split('\t')
404 if not line_list[7] in ['_', 'NIL']: # line has wikidata link
406 qid_set.add(line_list[7])
408 line = read.readline()
410 base_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&sitefilter=' + self.wiki_language + '&ids='
412 qid_list = list(qid_set)
413 ids = ''
414 length = len(qid_list)
415 qid_wikiname_dict = {}
416 for i in range(length):
417 if (i + 1) % 50 == 0 or i == length - 1: # there is a limit to the number of ids in one request in the wikidata api
420 ids += qid_list[i]
421 # request
422 response_json = requests.get(base_url + ids).json()
424 for qid in response_json['entities']:
426 try:
427 wikiname = response_json['entities'][qid]['sitelinks'][self.wiki_language]['title'].replace(' ', '_')
429 except KeyError: # language not available for specific wikiitem
430 wikiname = 'O'
432 qid_wikiname_dict[qid] = wikiname
434 ids = ''
436 else:
437 ids += qid_list[i]
438 ids += '|'
440 return qid_wikiname_dict
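# Minimal sketch of the Wikidata lookup performed by _get_qid_wikiname_dict above for a
# single batch of Q-IDs (the ids and the 'dewiki' language are illustrative):
#
#     import requests
#     ids = 'Q64|Q1055'  # up to 50 ids per request, separated by '|'
#     url = ('https://www.wikidata.org/w/api.php?action=wbgetentities&format=json'
#            '&props=sitelinks&sitefilter=dewiki&ids=' + ids)
#     response = requests.get(url).json()
#     for qid, entity in response['entities'].items():
#         title = entity.get('sitelinks', {}).get('dewiki', {}).get('title', 'O')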
443class NEL_ENGLISH_AIDA(EntityLinkingCorpus):
444 def __init__(
445 self,
446 base_path: Union[str, Path] = None,
447 in_memory: bool = True,
448 check_existence: bool = False,
449 **corpusargs
450 ):
451 """
452 Initialize the AIDA CoNLL-YAGO Entity Linking corpus introduced here: https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads.
453 License: https://creativecommons.org/licenses/by-sa/3.0/deed.en_US
454 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
456 Parameters
457 ----------
458 base_path : Union[str, Path], optional
459 Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
460 to point to a different folder, but typically this should not be necessary.
461 in_memory: If True, keeps the dataset in memory, giving speedups in training.
462 check_existence: If True, the existence of the given Wikipedia ids/page names is checked and non-existent ids/names will be ignored.
463 """
464 if type(base_path) == str:
465 base_path: Path = Path(base_path)
467 # this dataset name
468 dataset_name = self.__class__.__name__.lower()
470 # default dataset folder is the cache root
471 if not base_path:
472 base_path = flair.cache_root / "datasets"
473 data_folder = base_path / dataset_name
475 conll_yago_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/conll_entity_linking/"
476 corpus_file_name = "train"
477 parsed_dataset = data_folder / corpus_file_name
479 if not parsed_dataset.exists():
481 import wikipediaapi
483 wiki_wiki = wikipediaapi.Wikipedia(language='en')
485 testa_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testa", Path("datasets") / dataset_name)
486 testb_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_testb", Path("datasets") / dataset_name)
487 train_unprocessed_path = cached_path(f"{conll_yago_path}aida_conll_train", Path("datasets") / dataset_name)
489 # we use the wikiids in the data instead of directly utilizing the wikipedia urls.
490 # like this we can quickly check if the corresponding page exists
491 wikiid_wikiname_dict = self._get_wikiid_wikiname_dict(data_folder)
493 for name, path in zip(['train', 'testa', 'testb'],
494 [train_unprocessed_path, testa_unprocessed_path, testb_unprocessed_path]):
495 with open(data_folder / name, 'w', encoding='utf-8') as write, open(path, 'r', encoding='utf-8') as read:
498 for line in read:
500 line_list = line.split('\t')
501 if len(line_list) <= 4:
502 if line_list[0][:10] == '-DOCSTART-': # Docstart
503 write.write('-DOCSTART-\n\n')
504 elif line_list[0] == '\n': # empty line
505 write.write('\n')
506 else: # text without annotation or marked '--NME--' (no matching entity)
507 if len(line_list) == 1:
508 write.write(line_list[0][:-1] + '\tO\n')
509 else:
510 write.write(line_list[0] + '\tO\n')
511 else: # line with annotation
512 wikiname = wikiid_wikiname_dict[line_list[5].strip()]
513 if wikiname != 'O':
514 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n')
515 else:
516 # if there is a bad wikiid we can check if the given url in the data exists using wikipediaapi
517 wikiname = line_list[4].split('/')[-1]
518 if check_existence:
519 page = wiki_wiki.page(wikiname)
520 if page.exists():
521 write.write(line_list[0] + '\t' + line_list[1] + '-' + wikiname + '\n')
522 else: # neither the wikiid nor the url exist
523 write.write(line_list[0] + '\tO\n')
524 else:
525 write.write(line_list[0] + '\t' + line_list[4] + '-' + wikiname + '\n')
527 # delete unprocessed file
528 os.remove(path)
530 super(NEL_ENGLISH_AIDA, self).__init__(
531 data_folder,
532 train_file=corpus_file_name,
533 dev_file='testa',
534 test_file='testb',
535 in_memory=in_memory,
536 **corpusargs,
537 )
539 def _get_wikiid_wikiname_dict(self, base_folder):
541 # collect all wikiids
542 wikiid_set = set()
543 for data_file in ['aida_conll_testa', 'aida_conll_testb', 'aida_conll_train']:
544 with open(base_folder / data_file, mode='r', encoding='utf-8') as read:
545 line = read.readline()
546 while line:
547 row = line.split('\t')
548 if len(row) > 4: # line has a wiki annotation
549 wikiid_set.add(row[5].strip())
550 line = read.readline()
552 # create the dictionary
553 wikiid_wikiname_dict = {}
554 wikiid_list = list(wikiid_set)
555 ids = ''
556 length = len(wikiid_list)
558 for i in range(length):
559 if (i + 1) % 50 == 0 or i == length - 1: # there is a limit to the number of ids in one request in the wikimedia api
562 ids += wikiid_list[i]
563 # request
564 resp = requests.get(
565 'https://en.wikipedia.org/w/api.php',
566 params={
567 'action': 'query',
568 'prop': 'info',
569 'pageids': ids,
570 'format': 'json'
571 }
572 ).json()
574 for wikiid in resp['query']['pages']:
575 try:
576 wikiname = resp['query']['pages'][wikiid]['title'].replace(' ', '_')
577 except KeyError: # bad wikiid
578 wikiname = 'O'
579 wikiid_wikiname_dict[wikiid] = wikiname
580 ids = ''
582 else:
583 ids += wikiid_list[i]
584 ids += '|'
586 return wikiid_wikiname_dict
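# Minimal sketch of the Wikipedia page-id lookup performed by _get_wikiid_wikiname_dict
# above (the page ids are placeholders):
#
#     import requests
#     resp = requests.get('https://en.wikipedia.org/w/api.php',
#                         params={'action': 'query', 'prop': 'info',
#                                 'pageids': '736|5405092', 'format': 'json'}).json()
#     for pageid, page in resp['query']['pages'].items():
#         title = page.get('title', 'O').replace(' ', '_')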
589class NEL_ENGLISH_IITB(EntityLinkingCorpus):
590 def __init__(
591 self,
592 base_path: Union[str, Path] = None,
593 in_memory: bool = True,
594 ignore_disagreements: bool = False,
595 sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
596 **corpusargs
597 ):
598 """
599 Initialize the IITB Entity Linking corpus introduced in "Collective Annotation of Wikipedia Entities in Web Text" by Sayali Kulkarni, Amit Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti.
600 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
602 Parameters
603 ----------
604 base_path : Union[str, Path], optional
605 Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
606 to point to a different folder, but typically this should not be necessary.
607 in_memory: If True, keeps the dataset in memory, giving speedups in training.
608 ignore_disagreements: If True, annotations with annotator disagreement will be ignored.
609 """
610 if type(base_path) == str:
611 base_path: Path = Path(base_path)
613 # this dataset name
614 dataset_name = self.__class__.__name__.lower() + "_" + type(sentence_splitter).__name__
616 # default dataset folder is the cache root
617 if not base_path:
618 base_path = flair.cache_root / "datasets"
619 data_folder = base_path / dataset_name
621 iitb_el_docs_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_crawledDocs.tar.gz"
622 iitb_el_annotations_path = "https://www.cse.iitb.ac.in/~soumen/doc/CSAW/Annot/CSAW_Annotations.xml"
623 corpus_file_name = "iitb.txt"
624 parsed_dataset = data_folder / corpus_file_name
626 label_type = 'nel'
628 if not parsed_dataset.exists():
630 docs_zip_path = cached_path(f"{iitb_el_docs_path}", Path("datasets") / dataset_name)
631 annotations_xml_path = cached_path(f"{iitb_el_annotations_path}", Path("datasets") / dataset_name)
633 unpack_file(docs_zip_path, data_folder, "tar", False)
635 import xml.etree.ElementTree as ET
636 tree = ET.parse(annotations_xml_path)
637 root = tree.getroot()
639 # names of raw text documents
640 doc_names = set()
641 for elem in root:
642 doc_names.add(elem[0].text)
644 # open output_file
645 with open(parsed_dataset, 'w', encoding='utf-8') as write:
646 # iterate through all documents
647 for doc_name in doc_names:
648 with open(data_folder / 'crawledDocs' / doc_name, 'r', encoding='utf-8') as read:
649 text = read.read()
651 # split sentences and tokenize
652 sentences = sentence_splitter.split(text)
653 sentence_offsets = [sentence.start_pos for sentence in sentences]
655 # iterate through all annotations and add to corresponding tokens
656 for elem in root:
658 if elem[0].text == doc_name and elem[2].text: # annotation belongs to current document
660 wikiname = elem[2].text.replace(' ', '_')
661 mention_start = int(elem[3].text)
662 mention_length = int(elem[4].text)
664 # find sentence to which annotation belongs
665 sentence_index = 0
666 for i in range(1, len(sentences)):
667 if mention_start < sentence_offsets[i]:
668 break
669 else:
670 sentence_index += 1
672 # position within corresponding sentence
673 mention_start -= sentence_offsets[sentence_index]
674 mention_end = mention_start + mention_length
676 # set annotation for tokens of entity mention
677 first = True
678 for token in sentences[sentence_index].tokens:
679 if token.start_pos >= mention_start and token.end_pos <= mention_end: # token belongs to entity mention
680 if first:
681 token.set_label(typename=elem[1].text, value='B-' + wikiname)
682 first = False
683 else:
684 token.set_label(typename=elem[1].text, value='I-' + wikiname)
686 # write to out file
687 write.write('-DOCSTART-\n\n') # each file is one document
689 for sentence in sentences:
691 for token in sentence.tokens:
693 labels = token.labels
695 if len(labels) == 0: # no entity
696 write.write(token.text + '\tO\n')
698 elif len(labels) == 1: # annotation from one annotator
699 write.write(token.text + '\t' + labels[0].value + '\n')
701 else: # annotations from two annotators
703 if labels[0].value == labels[1].value: # annotators agree
704 write.write(token.text + '\t' + labels[0].value + '\n')
706 else: # annotators disagree: ignore or arbitrarily take first annotation
708 if ignore_disagreements:
709 write.write(token.text + '\tO\n')
711 else:
712 write.write(token.text + '\t' + labels[0].value + '\n')
714 write.write('\n') # empty line after each sentence
716 super(NEL_ENGLISH_IITB, self).__init__(
717 data_folder,
718 train_file=corpus_file_name,
719 in_memory=in_memory,
720 **corpusargs,
721 )
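# Sketch of the character-offset alignment used above (and in NEL_ENGLISH_AQUAINT):
# given the start offsets of the split sentences and a mention's document-level start
# offset, find the sentence containing the mention and convert the span to
# sentence-local offsets. Variable names are illustrative.
#
#     sentence_index = 0
#     for i in range(1, len(sentence_offsets)):
#         if mention_start < sentence_offsets[i]:
#             break
#         sentence_index += 1
#     local_start = mention_start - sentence_offsets[sentence_index]
#     local_end = local_start + mention_length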
724class NEL_ENGLISH_TWEEKI(EntityLinkingCorpus):
725 def __init__(
726 self,
727 base_path: Union[str, Path] = None,
728 in_memory: bool = True,
729 **corpusargs,
730 ):
731 """
732 Initialize the Tweeki Entity Linking corpus introduced in "Tweeki: Linking Named Entities on Twitter to a Knowledge Graph" by Harandizadeh and Singh.
733 The data consists of tweets with manually annotated Wikipedia links.
734 The first time you call the constructor, the dataset is automatically downloaded and transformed into tab-separated column format.
736 Parameters
737 ----------
738 base_path : Union[str, Path], optional
739 Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
740 to point to a different folder, but typically this should not be necessary.
741 in_memory: If True, keeps the dataset in memory, giving speedups in training.
742 """
743 if type(base_path) == str:
744 base_path: Path = Path(base_path)
746 # this dataset name
747 dataset_name = self.__class__.__name__.lower()
749 # default dataset folder is the cache root
750 if not base_path:
751 base_path = flair.cache_root / "datasets"
752 data_folder = base_path / dataset_name
754 tweeki_gold_el_path = "https://raw.githubusercontent.com/ucinlp/tweeki/main/data/Tweeki_gold/Tweeki_gold"
755 corpus_file_name = "tweeki_gold.txt"
756 parsed_dataset = data_folder / corpus_file_name
758 # download and parse data if necessary
759 if not parsed_dataset.exists():
761 original_file_path = cached_path(f"{tweeki_gold_el_path}", Path("datasets") / dataset_name)
763 with open(original_file_path, 'r', encoding='utf-8') as read, open(parsed_dataset, 'w', encoding='utf-8') as write:
765 line = read.readline()
766 while line:
767 if line.startswith('#'):
768 out_line = ''
769 elif line == '\n': # tweet ends
770 out_line = '\n-DOCSTART-\n\n'
771 else:
772 line_list = line.split('\t')
773 out_line = line_list[1] + '\t'
774 if line_list[3] == '-\n': # no wiki name
775 out_line += 'O\n'
776 else:
777 out_line += line_list[2][:2] + line_list[3].split('|')[0].replace(' ', '_') + '\n'
778 write.write(out_line)
779 line = read.readline()
781 os.rename(original_file_path, str(original_file_path) + '_original')
783 super(NEL_ENGLISH_TWEEKI, self).__init__(
784 data_folder,
785 train_file=corpus_file_name,
786 in_memory=in_memory,
787 **corpusargs,
788 )
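# Distilled sketch of the per-line conversion performed above (the helper name is
# hypothetical): a Tweeki gold line "<idx>\t<token>\t<B-/I- tag>\t<wiki name>|..."
# becomes "<token>\t<B-/I-><wiki name, spaces replaced by underscores>", while unlinked
# tokens get "O".
#
#     def convert_tweeki_line(line: str) -> str:
#         cols = line.rstrip('\n').split('\t')
#         if cols[3] == '-':
#             return cols[1] + '\tO'
#         return cols[1] + '\t' + cols[2][:2] + cols[3].split('|')[0].replace(' ', '_')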
791class NEL_ENGLISH_REDDIT(EntityLinkingCorpus):
792 def __init__(
793 self,
794 base_path: Union[str, Path] = None,
795 in_memory: bool = True,
796 **corpusargs,
797 ):
798 """
799 Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format.
800 The first time you call this constructor, it will automatically download the dataset.
801 :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
802 to point to a different folder, but typically this should not be necessary.
803 :param in_memory: If True, keeps the dataset in memory, giving speedups in training.
805 """
806 if type(base_path) == str:
807 base_path: Path = Path(base_path)
809 # this dataset name
810 dataset_name = self.__class__.__name__.lower()
812 # default dataset folder is the cache root
813 if not base_path:
814 base_path = flair.cache_root / "datasets"
815 data_folder = base_path / dataset_name
817 # download and parse data if necessary
818 reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip"
819 corpus_file_name = "reddit_el_gold.txt"
820 parsed_dataset = data_folder / corpus_file_name
822 if not parsed_dataset.exists():
823 reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name)
824 unpack_file(reddit_el_zip, data_folder, "zip", False)
826 with open(data_folder / corpus_file_name, "w", encoding='utf-8') as txtout:
828 # First parse the post titles
829 with open(data_folder / "posts.tsv", "r", encoding='utf-8') as tsvin1, open(
830 data_folder / "gold_post_annotations.tsv", "r", encoding='utf-8') as tsvin2:
832 posts = csv.reader(tsvin1, delimiter="\t")
833 self.post_annotations = csv.reader(tsvin2, delimiter="\t")
834 self.curr_annot = next(self.post_annotations)
836 for row in posts: # Go through all the post titles
838 txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token
840 # Keep track of how many and which entity mentions a given post title has
841 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention
843 # Check if the current post title has an entity link and parse accordingly
844 if row[0] == self.curr_annot[0]:
846 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3]))
847 link_annots = self._fill_annot_array(link_annots, row[0], post_flag=True)
849 # Post titles with entity mentions (if any) are handled via this function
850 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout)
851 else:
852 self._text_to_cols(Sentence(row[2], use_tokenizer=True), link_annots, txtout)
854 # Then parse the comments
855 with open(data_folder / "comments.tsv", "r", encoding='utf-8') as tsvin3, open(
856 data_folder / "gold_comment_annotations.tsv", "r", encoding='utf-8') as tsvin4:
858 self.comments = csv.reader(tsvin3, delimiter="\t")
859 self.comment_annotations = csv.reader(tsvin4, delimiter="\t")
860 self.curr_annot = next(self.comment_annotations)
861 self.curr_row = next(self.comments)
862 self.stop_iter = False
864 # Iterate over the comments.tsv file, until the end is reached
865 while not self.stop_iter:
867 txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token
869 # Keep track of the current comment thread and its corresponding key, on which the annotations are matched.
870 # Each comment thread is handled as one 'document'.
871 self.curr_comm = self.curr_row[4]
872 comm_key = self.curr_row[0]
874 # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file.
875 # This if-condition is needed to handle this problem.
876 if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}:
877 if comm_key == "en5rf4c":
878 self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n"))
879 self.curr_comm = next(self.parsed_row)
880 self._fill_curr_comment(fix_flag=True)
881 # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure
882 else:
883 self._fill_curr_comment(fix_flag=False)
885 link_annots = [] # [start pos, end pos, wiki page title] of an entity mention
887 # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above
888 if comm_key == self.curr_annot[0]:
889 link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3]))
890 link_annots = self._fill_annot_array(link_annots, comm_key, post_flag=False)
891 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout)
892 else:
893 # In two of the comment threads a case of capital letter spacing occurs, which the SegtokTokenizer cannot properly handle.
894 # The following if-elif condition handles these two cases and, as a result, writes fully capitalized words into each corresponding row,
895 # instead of just single letters into single rows.
896 if comm_key == "dv74ybb":
897 self.curr_comm = " ".join(
898 [word.replace(" ", "") for word in self.curr_comm.split(" ")])
899 elif comm_key == "eci2lut":
900 self.curr_comm = (self.curr_comm[:18]
901 + self.curr_comm[18:27].replace(" ", "")
902 + self.curr_comm[27:55]
903 + self.curr_comm[55:68].replace(" ", "")
904 + self.curr_comm[68:85]
905 + self.curr_comm[85:92].replace(" ", "")
906 + self.curr_comm[92:])
909 self._text_to_cols(Sentence(self.curr_comm, use_tokenizer=True), link_annots, txtout)
911 super(NEL_ENGLISH_REDDIT, self).__init__(
912 data_folder,
913 train_file=corpus_file_name,
914 in_memory=in_memory,
915 **corpusargs,
916 )
918 def _text_to_cols(self, sentence: Sentence, links: list, outfile):
919 """
920 Convert a tokenized sentence into column format
921 :param sentence: Flair Sentence object containing a tokenized post title or comment thread
922 :param links: array containing information about the starting and ending position of an entity mention, as well
923 as its corresponding wiki tag
924 :param outfile: file, to which the output is written
925 """
926 for i in range(0, len(sentence)):
927 # If there are annotated entity mentions for given post title or a comment thread
928 if links:
929 # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence
930 link_index = [j for j, v in enumerate(links) if
931 (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])]
932 # Write the token with a corresponding tag to file
933 try:
934 if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j, v in enumerate(links)):
935 outfile.writelines(sentence[i].text + "\tS-" + links[link_index[0]][2] + "\n")
936 elif any(
937 sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j, v in enumerate(links)):
938 outfile.writelines(sentence[i].text + "\tB-" + links[link_index[0]][2] + "\n")
939 elif any(
940 sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j, v in enumerate(links)):
941 outfile.writelines(sentence[i].text + "\tI-" + links[link_index[0]][2] + "\n")
942 else:
943 outfile.writelines(sentence[i].text + "\tO\n")
944 # IndexError is raised in cases when there is exactly one link in a sentence, therefore can be dismissed
945 except IndexError:
946 pass
948 # If a comment thread or a post title has no entity link, all tokens are assigned the O tag
949 else:
950 outfile.writelines(sentence[i].text + "\tO\n")
952 # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized
953 # incorrectly, in order to keep the desired format (empty line as a sentence separator).
954 try:
955 if ((sentence[i].text in {".", "!", "?", "!*"}) and
956 (sentence[i + 1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and
957 ("." not in sentence[i - 1].text)):
958 outfile.writelines("\n")
959 except IndexError:
960 # Thrown when the second check above happens, but the last token of a sentence is reached.
961 # Indicates that the EOS punctuation mark is present, therefore an empty line needs to be written below.
962 outfile.writelines("\n")
964 # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS
965 if sentence[-1].text not in {".", "!", "?"}:
966 outfile.writelines("\n")
968 def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list:
969 """
970 Fills the array containing information about the entity mention annotations, used in the _text_to_cols method
971 :param annot_array: array to be filled
972 :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation
973 :param post_flag: flag indicating whether the annotations are collected for the post titles (=True)
974 or comment threads (=False)
975 """
976 next_annot = None
977 while True:
978 # Check if further annotations belong to the current post title or comment thread as well
979 try:
980 next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations)
981 if next_annot[0] == key:
982 annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3]))
983 else:
984 self.curr_annot = next_annot
985 break
986 # Stop when the end of an annotation file is reached
987 except StopIteration:
988 break
989 return annot_array
991 def _fill_curr_comment(self, fix_flag: bool):
992 """
993 Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the
994 comments are parsed.
995 :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True)
996 or regular rows (=False)
997 """
998 next_row = None
999 while True:
1000 # Check if further annotations belong to the current sentence as well
1001 try:
1002 next_row = next(self.comments) if not fix_flag else next(self.parsed_row)
1003 if len(next_row) < 2:
1004 # 'else " "' is needed to keep the proper token positions (for accordance with annotations)
1005 self.curr_comm += next_row[0] if any(next_row) else " "
1006 else:
1007 self.curr_row = next_row
1008 break
1009 except StopIteration: # When the end of the comments.tsv file is reached
1010 self.curr_row = next_row
1011 self.stop_iter = True if not fix_flag else False
1012 break
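# Illustrative output of the corpus file written above (tokens and link are made up):
# one token per line, tab-separated from an S-/B-/I-/O tag, an empty line after each
# sentence, and a -DOCSTART- line before each post title or comment thread.
#
#     -DOCSTART-
#
#     I        O
#     visited  O
#     New      B-New_York_City
#     York     I-New_York_City
#     .        O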
1015def from_ufsac_to_tsv(xml_file: Union[str, Path], conll_file: Union[str, Path], datasetname: str,
1016 encoding: str = "utf8",
1017 cut_multisense: bool = True):
1018 """
1019 Function that converts the UFSAC format into tab separated column format in a new file.
1020 Parameters
1021 ----------
1022 xml_file : Union[str, Path]
1023 Path to the xml file.
1024 conll_file : Union[str, Path]
1025 Path for the new conll file.
1026 datasetname: str
1027 Name of the dataset from UFSAC, needed because of different handling of multi-word-spans in the datasets
1028 encoding : str, optional
1029 Encoding used in open function. The default is "utf8".
1030 cut_multisense : bool, optional
1031 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses.
1032 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected
1033 as one new sense. The default is True.
1035 """
1037 def make_line(word, begin_or_inside, attributes):
1038 """
1039 Function that creates an output line from a word.
1040 Parameters
1041 ----------
1042 word :
1043 String of the actual word.
1044 begin_or_inside:
1045 Either 'B-' or 'I-'
1046 attributes:
1047 List of attributes of the word (pos, lemma, wn30_key)
1048 """
1049 line = word
1050 if cut_multisense:
1051 attributes[-1] = attributes[-1].split(';')[0] # take only first sense
1053 for attrib in attributes:
1054 if attrib != 'O':
1055 line = line + '\t' + begin_or_inside + attrib
1056 else:
1057 line = line + '\tO'
1058 line += '\n'
1060 return line
1062 def split_span(word_fields: List[str], datasetname: str):
1063 """
1064 Function that splits a word if necessary, i.e. if it is a multiple-word-span.
1065 Parameters
1066 ----------
1067 word_fields :
1068 list ['surface_form', 'lemma', 'pos', 'wn30_key'] of a word
1069 datasetname:
1070 name of corresponding dataset
1071 """
1073 span = word_fields[0]
1075 if datasetname in ['trainomatic', 'masc']: # splitting not sensible for these datasets
1076 return [span]
1077 elif datasetname == 'omsti':
1078 if word_fields[3] != 'O' and span != '_' and '__' not in span: # has annotation and does not consist only of '_' (still not 100% clean)
1080 return span.split('_')
1081 else:
1082 return [span]
1083 else: # for all other datasets splitting at '_' is always sensible
1084 return span.split('_')
1086 txt_out = open(file=conll_file, mode='w', encoding=encoding)
1087 import xml.etree.ElementTree as ET
1088 tree = ET.parse(xml_file)
1089 corpus = tree.getroot()
1091 number_of_docs = len(corpus.findall('document'))
1093 fields = ['surface_form', 'lemma', 'pos', 'wn30_key']
1094 for document in corpus:
1095 # Docstart
1096 if number_of_docs > 1:
1097 txt_out.write('-DOCSTART-\n\n')
1099 for paragraph in document:
1101 for sentence in paragraph:
1103 for word in sentence:
1105 dictionary = word.attrib
1106 fields_of_word = [word.attrib[field] if (field in dictionary) else 'O' for field in fields]
1108 chunks = split_span(fields_of_word, datasetname)
1110 txt_out.write(make_line(chunks[0], 'B-', fields_of_word[1:]))
1112 # if there is more than one word in the chunk we write each in a separate line
1113 for chunk in chunks[1:]:
1114 # print(chunks)
1115 txt_out.write(make_line(chunk, 'I-', fields_of_word[1:]))
1117 # empty line after each sentence
1118 txt_out.write('\n')
1120 txt_out.close()
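# Illustrative example of the conversion above (attribute values are made up but follow
# the UFSAC fields used here): a word element such as
#
#     <word surface_form="long_run" lemma="long_run" pos="NN" wn30_key="long_run%1:28:00::"/>
#
# is split on '_' and written as two tab-separated lines:
#
#     long    B-long_run    B-NN    B-long_run%1:28:00::
#     run     I-long_run    I-NN    I-long_run%1:28:00::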
1123def determine_tsv_file(filename: str, data_folder: str, cut_multisense: bool = True):
1124 """
1125 Checks if the converted .tsv file already exists and, if not, creates it. Returns the name of the file.
1126 ----------
1127 filename : str
1128 String that contains the name of the file.
1129 data_folder : str
1130 String that contains the name of the folder in which the CoNLL file should reside.
1131 cut_multisense : bool, optional
1132 Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses.
1133 If True only the first listed sense will be used. Otherwise the whole list of senses will be detected
1134 as one new sense. The default is True.
1135 """
1137 if cut_multisense is True and filename not in ['semeval2007task17', 'trainomatic', 'wngt']: # these three datasets do not have multiple senses
1140 conll_file_name = filename + '_cut.tsv'
1142 else:
1144 conll_file_name = filename + '.tsv'
1146 path_to_conll_file = data_folder / conll_file_name
1148 if not path_to_conll_file.exists():
1149 # convert the file to CoNLL
1151 from_ufsac_to_tsv(xml_file=Path(data_folder / 'original_data' / (filename + '.xml')),
1152 conll_file=Path(data_folder / conll_file_name),
1153 datasetname=filename,
1154 cut_multisense=cut_multisense)
1156 return conll_file_name
1159class WSD_UFSAC(MultiCorpus):
1160 def __init__(
1161 self,
1162 filenames: Union[str, List[str]] = ['masc', 'semcor'],
1163 base_path: Union[str, Path] = None,
1164 in_memory: bool = True,
1165 cut_multisense: bool = True,
1166 columns={0: "text", 3: "wn30_key"},
1167 tag_to_bioes=None,
1168 banned_sentences: List[str] = None,
1169 sample_missing_splits_in_multicorpus: bool = True,
1170 sample_missing_splits_in_each_corpus: bool = True,
1171 use_raganato_ALL_as_test_data: bool = False,
1172 name: str = 'multicorpus'
1173 ):
1174 """
1175 Initialize a custom corpus with any Word Sense Disambiguation (WSD) datasets in the UFSAC format from https://github.com/getalp/UFSAC.
1176 If the constructor is called for the first time, the data is automatically downloaded and transformed from XML into a tab-separated column format.
1177 Since only the WordNet 3.0 version of the senses is consistently available for all provided datasets, we will only consider this version.
1178 Also, we ignore the id annotations used in datasets that were originally created for evaluation tasks.
1179 :param filenames: Here you can pass a single dataset name or a list of dataset names. The available names are:
1180 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3',
1181 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test',
1182 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'.
1183 So you can pass, for example, filenames = ['masc', 'omsti', 'wngt']. By default, the two mid-sized datasets 'masc' and 'semcor' are loaded.
1184 :param base_path: You can override this to point to a specific folder but typically this should not be necessary.
1185 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1187 :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains
1188 multiple possible senses. If True only the first listed sense will be used and the
1189 suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of
1190 senses will be detected as one new sense. The default is True.
1191 :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "wn30_key"}
1192 if you want to use additional pos and/or lemma for the words.
1193 :param tag_to_bioes: whether to convert to BIOES tagging scheme
1194 :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true
1195 :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if
1196 sample_missing_splits_in_each_corpus is True)
1197 :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames.
1198 :param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al., "Word Sense Disambiguation: A Unified Evaluation Framework and Empirical Comparison")
1199 will be used as test data. Note that the sample_missing_splits parameters are set to 'only_dev' in this case if set to True.
1200 :param name: Name of your (custom) corpus
1201 """
1202 if type(base_path) == str:
1203 base_path: Path = Path(base_path)
1205 # this dataset name
1206 dataset_name = self.__class__.__name__.lower()
1208 # default dataset folder is the cache root
1209 if not base_path:
1210 base_path = flair.cache_root / "datasets"
1211 data_folder = base_path / dataset_name
1212 original_data_folder = data_folder / 'original_data'
1214 # check if data there, if not, download the data
1215 if not original_data_folder.exists():
1216 # create folder
1217 data_folder.mkdir(parents=True)
1219 # download data
1220 import gdown
1222 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1224 output = data_folder / (dataset_name + '.tar')
1226 gdown.download(url, str(output), quiet=False)
1228 output = data_folder / (dataset_name + '.tar')
1229 unpack_file(file=output,
1230 unpack_to=data_folder,
1231 mode='tar', keep=False)
1233 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1235 # transform data into column format if necessary
1237 # if no filenames are specified we use all the data
1238 if not filenames:
1239 filenames = [name[:-4] for name in os.listdir(original_data_folder) if not 'raganato' in name]
1241 if type(filenames) == str:
1242 filenames = [filenames]
1244 corpora = []
1246 print('Transforming data into column format and creating corpora...')
1248 if use_raganato_ALL_as_test_data:
1249 # in this case no test data should be generated by sampling from train data. But if the sample arguments are set to true, the dev set will be sampled
1250 if sample_missing_splits_in_each_corpus:
1251 sample_missing_splits_in_each_corpus = 'only_dev'
1252 if sample_missing_splits_in_multicorpus:
1253 sample_missing_splits_in_multicorpus = 'only_dev'
1255 # also we remove 'raganato_ALL' from filenames in case it's in the list
1256 filenames.remove('raganato_ALL')
1258 # generate the test file
1259 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1260 cut_multisense=cut_multisense)
1262 corpus = ColumnCorpus(data_folder=data_folder,
1263 column_format=columns,
1264 test_file=test_file, # corpus only has test data
1265 in_memory=in_memory,
1266 tag_to_bioes=tag_to_bioes,
1267 column_delimiter='\t',
1268 document_separator_token='-DOCSTART-',
1269 banned_sentences=banned_sentences,
1270 autofind_splits=False,
1271 sample_missing_splits=sample_missing_splits_in_each_corpus,
1272 )
1273 corpora.append(corpus)
1275 for filename in filenames:
1276 # make column file and save to data_folder
1278 new_filename = determine_tsv_file(filename=filename, data_folder=data_folder, cut_multisense=cut_multisense)
1280 corpus = ColumnCorpus(data_folder=data_folder,
1281 column_format=columns,
1282 train_file=new_filename,
1283 in_memory=in_memory,
1284 tag_to_bioes=tag_to_bioes,
1285 column_delimiter='\t',
1286 document_separator_token='-DOCSTART-',
1287 banned_sentences=banned_sentences,
1288 autofind_splits=False,
1289 sample_missing_splits=sample_missing_splits_in_each_corpus,
1290 )
1291 corpora.append(corpus)
1292 print('...done!')
1294 super(WSD_UFSAC, self).__init__(
1295 corpora,
1296 sample_missing_splits=sample_missing_splits_in_multicorpus,
1297 name=name
1298 )
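# Usage sketch (illustrative; assumes the default cache location): combine several UFSAC
# datasets into one MultiCorpus; missing dev/test splits are sampled from the training
# data by default.
#
#     corpus = WSD_UFSAC(filenames=['masc', 'semcor', 'wngt'])
#     print(corpus)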
1301class WSD_RAGANATO_ALL(EntityLinkingCorpus):
1302 def __init__(
1303 self,
1304 base_path: Union[str, Path] = None,
1305 in_memory: bool = True,
1306 columns={0: "text", 3: "wn30_key"},
1307 tag_to_bioes=None,
1308 label_name_map: Dict[str, str] = None,
1309 banned_sentences: List[str] = None,
1310 sample_missing_splits: bool = True,
1311 cut_multisense: bool = True
1312 ):
1313 """
1314 Initialize raganato_ALL (the concatenation of all SensEval and SemEval all-words tasks) provided in UFSAC: https://github.com/getalp/UFSAC
1315 When first initializing the corpus, the whole UFSAC data is downloaded.
1316 """
1317 if type(base_path) == str:
1318 base_path: Path = Path(base_path)
1320 dataset_name = 'wsd_ufsac'
1322 # default dataset folder is the cache root
1323 if not base_path:
1324 base_path = flair.cache_root / "datasets"
1325 data_folder = base_path / dataset_name
1326 original_data_folder = data_folder / 'original_data'
1328 # We check if the UFSAC data has already been downloaded. If not, we download it.
1329 # Note that this downloads more datasets than just raganato_ALL. But the size of the download is only around 190 MB (around 4.5 GB unpacked).
1330 if not original_data_folder.exists():
1331 # create folder
1332 data_folder.mkdir(parents=True)
1334 # download data
1335 import gdown
1337 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1339 output = data_folder / (dataset_name + '.tar')
1341 gdown.download(url, str(output), quiet=False)
1343 output = data_folder / (dataset_name + '.tar')
1344 unpack_file(file=output,
1345 unpack_to=data_folder,
1346 mode='tar', keep=False)
1348 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1350 train_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=cut_multisense)
1352 super(WSD_RAGANATO_ALL, self).__init__(
1353 data_folder=data_folder,
1354 columns=columns,
1355 train_file=train_file,
1356 in_memory=in_memory,
1357 document_separator_token='-DOCSTART-',
1358 column_delimiter='\t',
1359 autofind_splits=False,
1360 tag_to_bioes=tag_to_bioes,
1361 label_name_map=label_name_map,
1362 banned_sentences=banned_sentences,
1363 sample_missing_splits=sample_missing_splits,
1364 )
1367class WSD_SEMCOR(EntityLinkingCorpus):
1368 def __init__(
1369 self,
1370 base_path: Union[str, Path] = None,
1371 in_memory: bool = True,
1372 columns={0: "text", 3: "wn30_key"},
1373 tag_to_bioes=None,
1374 label_name_map: Dict[str, str] = None,
1375 banned_sentences: List[str] = None,
1376 sample_missing_splits: bool = True,
1377 cut_multisense: bool = True,
1378 use_raganato_ALL_as_test_data: bool = False,
1379 ):
1380 """
1381 Initialize SemCor provided in UFSAC: https://github.com/getalp/UFSAC
1382 When first initializing the corpus, the whole UFSAC data is downloaded.
1383 """
1384 if type(base_path) == str:
1385 base_path: Path = Path(base_path)
1387 dataset_name = 'wsd_ufsac'
1389 # default dataset folder is the cache root
1390 if not base_path:
1391 base_path = flair.cache_root / "datasets"
1392 data_folder = base_path / dataset_name
1393 original_data_folder = data_folder / 'original_data'
1395 # We check if the UFSAC data has already been downloaded. If not, we download it.
1396 # Note that this downloads more datasets than just SemCor. But the size of the download is only around 190 MB (around 4.5 GB unpacked).
1397 if not original_data_folder.exists():
1398 # create folder
1399 data_folder.mkdir(parents=True)
1401 # download data
1402 import gdown
1404 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1406 output = data_folder / (dataset_name + '.tar')
1408 gdown.download(url, str(output), quiet=False)
1410 output = data_folder / (dataset_name + '.tar')
1411 unpack_file(file=output,
1412 unpack_to=data_folder,
1413 mode='tar', keep=False)
1415 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1417 if use_raganato_ALL_as_test_data:
1418 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled.
1419 if sample_missing_splits:
1420 sample_missing_splits = 'only_dev'
1422 # generate the test file
1423 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1424 cut_multisense=cut_multisense)
1425 else:
1426 test_file = None
1428 train_file = determine_tsv_file(filename='semcor', data_folder=data_folder, cut_multisense=cut_multisense)
1430 super(WSD_SEMCOR, self).__init__(
1431 data_folder=data_folder,
1432 columns=columns,
1433 train_file=train_file,
1434 test_file=test_file,
1435 in_memory=in_memory,
1436 document_separator_token='-DOCSTART-',
1437 column_delimiter='\t',
1438 autofind_splits=False,
1439 tag_to_bioes=tag_to_bioes,
1440 label_name_map=label_name_map,
1441 banned_sentences=banned_sentences,
1442 sample_missing_splits=sample_missing_splits,
1443 )
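# Usage sketch (illustrative): load SemCor with the raganato_ALL data as test split and
# build a sense dictionary via the inherited make_entity_dict helper.
#
#     corpus = WSD_SEMCOR(use_raganato_ALL_as_test_data=True)
#     sense_dictionary = corpus.make_entity_dict(label_type='wn30_key', threshold=1)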
1446class WSD_WORDNET_GLOSS_TAGGED(EntityLinkingCorpus):
1447 def __init__(
1448 self,
1449 base_path: Union[str, Path] = None,
1450 in_memory: bool = True,
1451 columns={0: "text", 3: "wn30_key"},
1452 tag_to_bioes=None,
1453 label_name_map: Dict[str, str] = None,
1454 banned_sentences: List[str] = None,
1455 sample_missing_splits: bool = True,
1456 use_raganato_ALL_as_test_data: bool = False,
1457 ):
1458 """
1459 Initialize the Princeton WordNet Gloss Corpus provided in UFSAC: https://github.com/getalp/UFSAC
1460 When first initializing the corpus, the whole UFSAC data is downloaded.
1461 """
1462 if type(base_path) == str:
1463 base_path: Path = Path(base_path)
1465 dataset_name = 'wsd_ufsac'
1467 # default dataset folder is the cache root
1468 if not base_path:
1469 base_path = flair.cache_root / "datasets"
1470 data_folder = base_path / dataset_name
1471 original_data_folder = data_folder / 'original_data'
1473 # We check if the UFSAC data has already been downloaded. If not, we download it.
1474 # Note that this downloads more datasets than just WordNet Gloss Tagged. But the size of the download is only around 190 MB (around 4.5 GB unpacked).
1475 if not original_data_folder.exists():
1476 # create folder
1477 data_folder.mkdir(parents=True)
1479 # download data
1480 import gdown
1482 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1484 output = data_folder / (dataset_name + '.tar')
1486 gdown.download(url, str(output), quiet=False)
1488 output = data_folder / (dataset_name + '.tar')
1489 unpack_file(file=output,
1490 unpack_to=data_folder,
1491 mode='tar', keep=False)
1493 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1495 if use_raganato_ALL_as_test_data:
1496 # in this case no test data should be generated by sampling from train data. But if sample_missing_splits is true, the dev set will be sampled.
1497 if sample_missing_splits:
1498 sample_missing_splits = 'only_dev'
1500 # generate the test file
1501 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True)
1502 else:
1503 test_file = None
1505 train_file = determine_tsv_file(filename='wngt', data_folder=data_folder,
1506 cut_multisense=False) # does not have multisense!
1508 super(WSD_WORDNET_GLOSS_TAGGED, self).__init__(
1509 data_folder=data_folder,
1510 columns=columns,
1511 train_file=train_file,
1512 test_file=test_file,
1513 in_memory=in_memory,
1514 document_separator_token='-DOCSTART-',
1515 column_delimiter='\t',
1516 autofind_splits=False,
1517 tag_to_bioes=tag_to_bioes,
1518 label_name_map=label_name_map,
1519 banned_sentences=banned_sentences,
1520 sample_missing_splits=sample_missing_splits,
1521 )
1524class WSD_MASC(EntityLinkingCorpus):
1525 def __init__(
1526 self,
1527 base_path: Union[str, Path] = None,
1528 in_memory: bool = True,
1529 columns={0: "text", 3: "wn30_key"},
1530 tag_to_bioes=None,
1531 label_name_map: Dict[str, str] = None,
1532 banned_sentences: List[str] = None,
1533 sample_missing_splits: bool = True,
1534 cut_multisense: bool = True,
1535 use_raganato_ALL_as_test_data: bool = False,
1536 ):
1537 """
1538 Initialize MASC (Manually Annotated Sub-Corpus) as provided in UFSAC (https://github.com/getalp/UFSAC).
1539 The first initialization downloads the complete UFSAC data.
1540 """
1541 if isinstance(base_path, str):
1542 base_path = Path(base_path)
1544 dataset_name = 'wsd_ufsac'
1546 # default dataset folder is the cache root
1547 if not base_path:
1548 base_path = flair.cache_root / "datasets"
1549 data_folder = base_path / dataset_name
1550 original_data_folder = data_folder / 'original_data'
1552 # Check whether the UFSAC data has already been downloaded; if not, download it.
1553 # Note that this fetches more datasets than just MASC, but the download is only around 190 MB (around 4.5 GB unpacked).
1554 if not original_data_folder.exists():
1555 # create folder
1556 data_folder.mkdir(parents=True)
1558 # download data
1559 import gdown
1561 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1563 output = data_folder / (dataset_name + '.tar')
1565 gdown.download(url, str(output), quiet=False)
1568 unpack_file(file=output,
1569 unpack_to=data_folder,
1570 mode='tar', keep=False)
1572 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1574 if use_raganato_ALL_as_test_data:
1575 # In this case no test split should be sampled from the train data; if sample_missing_splits is True, only the dev split is sampled.
1576 if sample_missing_splits:
1577 sample_missing_splits = 'only_dev'
1579 # generate the test file
1580 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1581 cut_multisense=cut_multisense)
1582 else:
1583 test_file = None
1585 train_file = determine_tsv_file(filename='masc', data_folder=data_folder, cut_multisense=cut_multisense)
1587 super(WSD_MASC, self).__init__(
1588 data_folder=data_folder,
1589 columns=columns,
1590 train_file=train_file,
1591 test_file=test_file,
1592 in_memory=in_memory,
1593 document_separator_token='-DOCSTART-',
1594 column_delimiter='\t',
1595 autofind_splits=False,
1596 tag_to_bioes=tag_to_bioes,
1597 label_name_map=label_name_map,
1598 banned_sentences=banned_sentences,
1599 sample_missing_splits=sample_missing_splits,
1600 )
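# Usage sketch (illustrative; the effect described here assumes the behaviour of
# determine_tsv_file defined elsewhere in this module, i.e. that cut_multisense=True
# keeps only the first sense of a multi-sense annotation):
#
#     corpus_full = WSD_MASC(cut_multisense=False)  # keep full multi-sense annotations
#     corpus_single = WSD_MASC()                    # default: reduce to a single sense key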
1603class WSD_OMSTI(EntityLinkingCorpus):
1604 def __init__(
1605 self,
1606 base_path: Union[str, Path] = None,
1607 in_memory: bool = True,
1608 columns={0: "text", 3: "wn30_key"},
1609 tag_to_bioes=None,
1610 label_name_map: Dict[str, str] = None,
1611 banned_sentences: List[str] = None,
1612 sample_missing_splits: bool = True,
1613 cut_multisense: bool = True,
1614 use_raganato_ALL_as_test_data: bool = False,
1615 ):
1616 """
1617 Initialize OMSTI (One Million Sense-Tagged Instances) as provided in UFSAC (https://github.com/getalp/UFSAC).
1618 The first initialization downloads the complete UFSAC data.
1619 """
1620 if isinstance(base_path, str):
1621 base_path = Path(base_path)
1623 dataset_name = 'wsd_ufsac'
1625 # default dataset folder is the cache root
1626 if not base_path:
1627 base_path = flair.cache_root / "datasets"
1628 data_folder = base_path / dataset_name
1629 original_data_folder = data_folder / 'original_data'
1631 # Check whether the UFSAC data has already been downloaded; if not, download it.
1632 # Note that this fetches more datasets than just OMSTI, but the download is only around 190 MB (around 4.5 GB unpacked).
1633 if not original_data_folder.exists():
1634 # create folder
1635 data_folder.mkdir(parents=True)
1637 # download data
1638 import gdown
1640 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1642 output = data_folder / (dataset_name + '.tar')
1644 gdown.download(url, str(output), quiet=False)
1647 unpack_file(file=output,
1648 unpack_to=data_folder,
1649 mode='tar', keep=False)
1651 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1653 if use_raganato_ALL_as_test_data:
1654 # In this case no test split should be sampled from the train data; if sample_missing_splits is True, only the dev split is sampled.
1655 if sample_missing_splits:
1656 sample_missing_splits = 'only_dev'
1658 # generate the test file
1659 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder,
1660 cut_multisense=cut_multisense)
1661 else:
1662 test_file = None
1664 train_file = determine_tsv_file(filename='omsti', data_folder=data_folder, cut_multisense=cut_multisense)
1666 super(WSD_OMSTI, self).__init__(
1667 data_folder=data_folder,
1668 columns=columns,
1669 train_file=train_file,
1670 test_file=test_file,
1671 in_memory=in_memory,
1672 document_separator_token='-DOCSTART-',
1673 column_delimiter='\t',
1674 autofind_splits=False,
1675 tag_to_bioes=tag_to_bioes,
1676 label_name_map=label_name_map,
1677 banned_sentences=banned_sentences,
1678 sample_missing_splits=sample_missing_splits,
1679 )
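# Usage sketch (illustrative): with use_raganato_ALL_as_test_data=True the
# Raganato ALL file serves as the test split, and because sample_missing_splits
# is switched to 'only_dev' above, only a dev split is sampled from the OMSTI
# training data.
#
#     corpus = WSD_OMSTI(use_raganato_ALL_as_test_data=True)
#     print(len(corpus.train), len(corpus.dev), len(corpus.test))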
1682class WSD_TRAINOMATIC(EntityLinkingCorpus):
1683 def __init__(
1684 self,
1685 base_path: Union[str, Path] = None,
1686 in_memory: bool = True,
1687 columns={0: "text", 3: "wn30_key"},
1688 tag_to_bioes=None,
1689 label_name_map: Dict[str, str] = None,
1690 banned_sentences: List[str] = None,
1691 sample_missing_splits: bool = True,
1692 use_raganato_ALL_as_test_data: bool = False,
1693 ):
1694 """
1695 Initialize the Train-O-Matic corpus as provided in UFSAC (https://github.com/getalp/UFSAC).
1696 The first initialization downloads the complete UFSAC data.
1697 """
1698 if isinstance(base_path, str):
1699 base_path = Path(base_path)
1701 dataset_name = 'wsd_ufsac'
1703 # default dataset folder is the cache root
1704 if not base_path:
1705 base_path = flair.cache_root / "datasets"
1706 data_folder = base_path / dataset_name
1707 original_data_folder = data_folder / 'original_data'
1709 # Check whether the UFSAC data has already been downloaded; if not, download it.
1710 # Note that this fetches more datasets than just Train-O-Matic, but the download is only around 190 MB (around 4.5 GB unpacked).
1711 if not original_data_folder.exists():
1712 # create folder
1713 data_folder.mkdir(parents=True)
1715 # download data
1716 import gdown
1718 url = 'https://drive.google.com/uc?id=1Oigo3kzRosz2VjyA44vpJZ58tDFyLRMO'
1720 output = data_folder / (dataset_name + '.tar')
1722 gdown.download(url, str(output), quiet=False)
1725 unpack_file(file=output,
1726 unpack_to=data_folder,
1727 mode='tar', keep=False)
1729 os.rename(data_folder / 'ufsac-public-2.1', original_data_folder)
1731 if use_raganato_ALL_as_test_data:
1732 # In this case no test split should be sampled from the train data; if sample_missing_splits is True, only the dev split is sampled.
1733 if sample_missing_splits:
1734 sample_missing_splits = 'only_dev'
1736 # generate the test file
1737 test_file = determine_tsv_file(filename='raganato_ALL', data_folder=data_folder, cut_multisense=True)
1738 else:
1739 test_file = None
1741 train_file = determine_tsv_file(filename='trainomatic', data_folder=data_folder,
1742 cut_multisense=False) # Train-O-Matic contains no multi-sense annotations
1744 super(WSD_TRAINOMATIC, self).__init__(
1745 data_folder=data_folder,
1746 columns=columns,
1747 train_file=train_file,
1748 test_file=test_file,
1749 in_memory=in_memory,
1750 document_separator_token='-DOCSTART-',
1751 column_delimiter='\t',
1752 autofind_splits=False,
1753 tag_to_bioes=tag_to_bioes,
1754 label_name_map=label_name_map,
1755 banned_sentences=banned_sentences,
1756 sample_missing_splits=sample_missing_splits,
1757 )
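# Usage sketch (illustrative): the UFSAC corpora above share the same column
# layout and cache folder, so they can be combined, e.g. with MultiCorpus
# (already imported at the top of this module); only the first instantiation
# triggers the UFSAC download.
#
#     masc = WSD_MASC()
#     trainomatic = WSD_TRAINOMATIC()
#     combined = MultiCorpus([masc, trainomatic])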