Coverage for flair/flair/datasets/relation_extraction.py: 10%
import bisect
import io
import json
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import List, Union, Sequence, Dict, Any, Tuple, Set

import conllu
import gdown

import flair
from flair.data import Sentence
from flair.datasets.conllu import CoNLLUCorpus
from flair.file_utils import cached_path
from flair.tokenization import (
    SentenceSplitter,
    SegtokSentenceSplitter,
)

log = logging.getLogger("flair")


def convert_ptb_token(token: str) -> str:
    """Convert PTB tokens to normal tokens"""
    return {
        "-lrb-": "(",
        "-rrb-": ")",
        "-lsb-": "[",
        "-rsb-": "]",
        "-lcb-": "{",
        "-rcb-": "}",
    }.get(token.lower(), token)
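
# For illustration: convert_ptb_token("-LRB-") returns "(", while tokens without a PTB bracket
# mapping are passed through unchanged.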


class RE_ENGLISH_SEMEVAL2010(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, augment_train: bool = False):
        """
        SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of
        Nominals: https://aclanthology.org/S10-1006.pdf
        :param base_path: folder in which to store the dataset; defaults to the flair cache root
        :param in_memory: if True, keep the converted corpus in memory
        :param augment_train: if True, additionally add an inverted copy of every training relation
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        semeval_2010_task_8_url = (
            "https://drive.google.com/uc?id=0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk"
        )
        train_file_name = "semeval2010-task8-train-aug.conllu" if augment_train else "semeval2010-task8-train.conllu"
        data_file = data_folder / train_file_name

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            source_data_file = source_data_folder / "SemEval2010_task8_all_data.zip"
            os.makedirs(source_data_folder, exist_ok=True)
            gdown.download(semeval_2010_task_8_url, str(source_data_file))
            self.extract_and_convert_to_conllu(
                data_file=source_data_file,
                data_folder=data_folder,
                augment_train=augment_train,
            )

        super(RE_ENGLISH_SEMEVAL2010, self).__init__(
            data_folder,
            train_file=train_file_name,
            test_file="semeval2010-task8-test.conllu",
            token_annotation_fields=['ner'],
            in_memory=in_memory,
        )
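
    # Usage sketch: the first instantiation downloads the archive via gdown and converts it to
    # CoNLL-U in the flair cache; subsequent calls reuse the cached files.
    #
    #     corpus = RE_ENGLISH_SEMEVAL2010()
    #     print(corpus)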

    def extract_and_convert_to_conllu(self, data_file, data_folder, augment_train):
        import zipfile

        source_file_paths = [
            "SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT",
            "SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT",
        ]
        train_filename = "semeval2010-task8-train-aug.conllu" if augment_train else "semeval2010-task8-train.conllu"
        target_filenames = [train_filename, "semeval2010-task8-test.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:

            for source_file_path, target_filename in zip(source_file_paths, target_filenames):
                with zip_file.open(source_file_path, mode="r") as source_file:

                    target_file_path = Path(data_folder) / target_filename
                    with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                        # write CoNLL-U Plus header
                        target_file.write("# global.columns = id form ner\n")

                        raw_lines = []
                        for line in io.TextIOWrapper(source_file, encoding="utf-8"):
                            line = line.strip()

                            if not line:
                                token_list = self._semeval_lines_to_token_list(
                                    raw_lines,
                                    augment_relations=augment_train if "train" in target_filename else False,
                                )
                                target_file.write(token_list.serialize())

                                raw_lines = []
                                continue

                            raw_lines.append(line)

    def _semeval_lines_to_token_list(self, raw_lines, augment_relations):
        raw_id, raw_text = raw_lines[0].split("\t")
        label = raw_lines[1]
        id_ = int(raw_id)
        raw_text = raw_text.strip('"')

        # Some special cases (e.g., missing spaces before entity marker)
        if id_ in [213, 4612, 6373, 8411, 9867]:
            raw_text = raw_text.replace("<e2>", " <e2>")
        if id_ in [2740, 4219, 4784]:
            raw_text = raw_text.replace("<e1>", " <e1>")
        if id_ == 9256:
            raw_text = raw_text.replace("log- jam", "log-jam")

        # necessary if text should be whitespace tokenizable
        if id_ in [2609, 7589]:
            raw_text = raw_text.replace("1 1/2", "1-1/2")
        if id_ == 10591:
            raw_text = raw_text.replace("1 1/4", "1-1/4")
        if id_ == 10665:
            raw_text = raw_text.replace("6 1/2", "6-1/2")

        raw_text = re.sub(r"([.,!?()])$", r" \1", raw_text)
        raw_text = re.sub(r"(e[12]>)([',;:\"\(\)])", r"\1 \2", raw_text)
        raw_text = re.sub(r"([',;:\"\(\)])(</?e[12])", r"\1 \2", raw_text)
        raw_text = raw_text.replace("<e1>", "<e1> ")
        raw_text = raw_text.replace("<e2>", "<e2> ")
        raw_text = raw_text.replace("</e1>", " </e1>")
        raw_text = raw_text.replace("</e2>", " </e2>")

        tokens = raw_text.split(" ")

        # Handle case where tail may occur before the head
        subj_start = tokens.index("<e1>")
        obj_start = tokens.index("<e2>")
        if subj_start < obj_start:
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)
            obj_start = tokens.index("<e2>")
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
        else:
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
            subj_start = tokens.index("<e1>")
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)

        relation = ";".join([str(subj_start + 1), str(subj_end), str(obj_start + 1), str(obj_end), label])
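        # After the <e1>/<e2> markers are popped, subj_start/obj_start are 0-based start indices
        # and subj_end/obj_end point one past the last entity token, so the string above encodes
        # both spans as 1-based, inclusive token indices ("start;end;start;end;label").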

        if augment_relations:
            label_inverted = label.replace("e1", "e3")
            label_inverted = label_inverted.replace("e2", "e1")
            label_inverted = label_inverted.replace("e3", "e2")
            relation_inverted = ";".join(
                [str(obj_start + 1), str(obj_end), str(subj_start + 1), str(subj_end), label_inverted]
            )

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": relation + "|" + relation_inverted if augment_relations else relation,
        }

        token_dicts = []
        for idx, token in enumerate(tokens):
            tag = "O"
            prefix = ""

            if subj_start <= idx < subj_end:
                prefix = "B-" if idx == subj_start else "I-"
                tag = "E1"
            elif obj_start <= idx < obj_end:
                prefix = "B-" if idx == obj_start else "I-"
                tag = "E2"

            token_dicts.append(
                {
                    "id": str(idx + 1),
                    "form": token,
                    "ner": prefix + tag,
                }
            )

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)


class RE_ENGLISH_TACRED(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        TAC Relation Extraction Dataset with 41 relations from https://nlp.stanford.edu/projects/tacred/.
        Manual download is required for this dataset: place the LDC archive as "TACRED_LDC.zip"
        in the "original" subfolder of the dataset folder before instantiating this class.
        :param base_path: folder in which to store the dataset; defaults to the flair cache root
        :param in_memory: if True, keep the converted corpus in memory
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "tacred-train.conllu"

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            source_data_file = source_data_folder / "TACRED_LDC.zip"
            os.makedirs(source_data_folder, exist_ok=True)
            self.extract_and_convert_to_conllu(
                data_file=source_data_file,
                data_folder=data_folder,
            )

        super(RE_ENGLISH_TACRED, self).__init__(
            data_folder,
            token_annotation_fields=['ner'],
            in_memory=in_memory,
        )
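
    # Usage sketch (assumes the manually obtained archive already sits at
    # <data_folder>/original/TACRED_LDC.zip, see the docstring above):
    #
    #     corpus = RE_ENGLISH_TACRED()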

    def extract_and_convert_to_conllu(self, data_file, data_folder):
        import zipfile

        source_file_paths = [
            "tacred/data/json/train.json",
            "tacred/data/json/dev.json",
            "tacred/data/json/test.json",
        ]
        target_filenames = ["tacred-train.conllu", "tacred-dev.conllu", "tacred-test.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:

            for source_file_path, target_filename in zip(source_file_paths, target_filenames):
                with zip_file.open(source_file_path, mode="r") as source_file:

                    target_file_path = Path(data_folder) / target_filename
                    with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                        # write CoNLL-U Plus header
                        target_file.write("# global.columns = id form ner\n")

                        for example in json.load(source_file):
                            token_list = self._tacred_example_to_token_list(example)
                            target_file.write(token_list.serialize())

    def _tacred_example_to_token_list(self, example: Dict[str, Any]) -> conllu.TokenList:
        id_ = example["id"]
        tokens = example["token"]
        ner = example["stanford_ner"]

        subj_start = example["subj_start"]
        subj_end = example["subj_end"]
        obj_start = example["obj_start"]
        obj_end = example["obj_end"]

        subj_tag = example["subj_type"]
        obj_tag = example["obj_type"]

        label = example["relation"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": ";".join(
                [str(subj_start + 1), str(subj_end + 1), str(obj_start + 1), str(obj_end + 1), label]
            ),
        }

        prev_tag = None
        token_dicts = []
        for idx, (token, tag) in enumerate(zip(tokens, ner)):
            if subj_start <= idx <= subj_end:
                tag = subj_tag

            if obj_start <= idx <= obj_end:
                tag = obj_tag

            prefix = ""
            if tag != "O":
                if tag != prev_tag:
                    prefix = "B-"
                else:
                    prefix = "I-"

            prev_tag = tag

            token_dicts.append(
                {
                    "id": str(idx + 1),
                    "form": convert_ptb_token(token),
                    "ner": prefix + tag,
                }
            )

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)


class RE_ENGLISH_CONLL04(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        CoNLL04 corpus for joint entity and relation extraction, loaded from the preprocessed
        files of the "multihead_joint_entity_relation_extraction" repository (see URL below).
        :param base_path: folder in which to store the dataset; defaults to the flair cache root
        :param in_memory: if True, keep the converted corpus in memory
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # TODO: change data source to original CoNLL04 -- this dataset has span formatting errors
        # download data if necessary
        conll04_url = (
            "https://raw.githubusercontent.com/bekou/multihead_joint_entity_relation_extraction/master/data/CoNLL04/"
        )
        data_file = data_folder / "conll04-train.conllu"

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            cached_path(f"{conll04_url}train.txt", source_data_folder)
            cached_path(f"{conll04_url}dev.txt", source_data_folder)
            cached_path(f"{conll04_url}test.txt", source_data_folder)

            self.convert_to_conllu(
                source_data_folder=source_data_folder,
                data_folder=data_folder,
            )

        super(RE_ENGLISH_CONLL04, self).__init__(
            data_folder,
            token_annotation_fields=['ner'],
            in_memory=in_memory,
        )
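
    # Usage sketch: the first instantiation fetches train/dev/test from the repository above and
    # writes the three CoNLL-U splits into the flair cache.
    #
    #     corpus = RE_ENGLISH_CONLL04()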

    def _parse_incr(self, source_file) -> Sequence[conllu.TokenList]:
        fields = ["id", "form", "ner", "relations", "relation_heads"]
        field_parsers = {
            "relations": lambda line, i: json.loads(line[i].replace("'", '"')),
            "relation_heads": lambda line, i: json.loads(line[i]),
        }
        metadata_parsers = {"__fallback__": lambda k, v: tuple(k.split())}

        lines = []
        for index, line in enumerate(source_file):
            if index > 0 and line.startswith("#"):
                source_str = "".join(lines)
                src_token_list = conllu.parse(
                    source_str, fields=fields, field_parsers=field_parsers, metadata_parsers=metadata_parsers
                )
                lines = []
                yield src_token_list[0]

            lines.append(line)

        source_str = "".join(lines)
        src_token_list = conllu.parse(
            source_str, fields=fields, field_parsers=field_parsers, metadata_parsers=metadata_parsers
        )
        yield src_token_list[0]

    def convert_to_conllu(self, source_data_folder, data_folder):
        source_filenames = [
            "train.txt",
            "dev.txt",
            "test.txt",
        ]
        target_filenames = ["conll04-train.conllu", "conll04-dev.conllu", "conll04-test.conllu"]

        for source_filename, target_filename in zip(source_filenames, target_filenames):
            with open(source_data_folder / source_filename, mode="r") as source_file:

                with open(data_folder / target_filename, mode="w", encoding="utf-8") as target_file:
                    # write CoNLL-U Plus header
                    target_file.write("# global.columns = id form ner\n")

                    for src_token_list in self._parse_incr(source_file):
                        token_list = self._src_token_list_to_token_list(src_token_list)
                        target_file.write(token_list.serialize())
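
    # The helper below collapses a BIO tag sequence into (start, end) spans with 0-based,
    # inclusive token indices, e.g. ["B-Loc", "I-Loc", "O", "B-Peop"] -> [(0, 1), (3, 3)].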

    def _bio_tags_to_spans(self, tags: List[str]) -> List[Tuple[int, int]]:
        spans = []
        span_start = 0
        span_end = 0
        active_conll_tag = None
        for index, tag in enumerate(tags):
            bio_tag = tag[0]
            conll_tag = tag[2:]
            if bio_tag == "O":
                # The span has ended.
                if active_conll_tag is not None:
                    spans.append((span_start, span_end))
                active_conll_tag = None
                continue
            elif bio_tag == "B" or (bio_tag == "I" and conll_tag != active_conll_tag):
                # We are entering a new span; reset indices
                # and active tag to new span.
                if active_conll_tag is not None:
                    spans.append((span_start, span_end))
                active_conll_tag = conll_tag
                span_start = index
                span_end = index
            elif bio_tag == "I" and conll_tag == active_conll_tag:
                # We're inside a span.
                span_end += 1
            else:
                raise Exception("That should never happen.")

        # Last token might have been a part of a valid span.
        if active_conll_tag is not None:
            spans.append((span_start, span_end))

        return spans

    def _src_token_list_to_token_list(self, src_token_list):
        tokens = []
        token_dicts = []
        ner_tags = []
        for index, token in enumerate(src_token_list, start=1):
            text = token["form"]
            ner_tag = token["ner"]
            tokens.append(text)
            ner_tags.append(ner_tag)

            token_dicts.append(
                {
                    "id": str(index),
                    "form": text,
                    "ner": ner_tag,
                }
            )

        span_end_to_span = {end: (start, end) for start, end in self._bio_tags_to_spans(ner_tags)}

        relations = []
        for index, token in enumerate(src_token_list):
            for relation, head in zip(token["relations"], token["relation_heads"]):
                if relation == "N":
                    continue

                subj_start, subj_end = span_end_to_span[index]
                obj_start, obj_end = span_end_to_span[head]
                relations.append((subj_start, subj_end, obj_start, obj_end, relation))

        doc_id = src_token_list.metadata["doc"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": doc_id,
            "relations": "|".join(
                [
                    ";".join([str(subj_start + 1), str(subj_end + 1), str(obj_start + 1), str(obj_end + 1), relation])
                    for subj_start, subj_end, obj_start, obj_end, relation in relations
                ]
            ),
        }

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)


class RE_ENGLISH_DRUGPROT(CoNLLUCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
    ):
        """
        DrugProt corpus: BioCreative VII Track 1 from https://zenodo.org/record/5119892#.YSdSaVuxU5k/ on
        drug and chemical-protein interactions.
        :param base_path: folder in which to store the dataset; defaults to the flair cache root
        :param in_memory: if True, keep the converted corpus in memory
        :param sentence_splitter: splitter used to segment titles and abstracts into sentences
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        self.sentence_splitter = sentence_splitter

        # this dataset name
        dataset_name = self.__class__.__name__.lower() + "_" + type(self.sentence_splitter).__name__ + "_v3"

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        drugprot_url = (
            "https://zenodo.org/record/5119892/files/drugprot-training-development-test-background.zip"
        )
        data_file = data_folder / "drugprot-train.conllu"

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            cached_path(drugprot_url, source_data_folder)
            self.extract_and_convert_to_conllu(
                data_file=source_data_folder / "drugprot-training-development-test-background.zip",
                data_folder=data_folder,
            )

        super(RE_ENGLISH_DRUGPROT, self).__init__(
            data_folder,
            in_memory=in_memory,
            token_annotation_fields=["ner", "ner-2"],
            sample_missing_splits=False,
        )
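
    # Usage sketch: the first instantiation downloads the archive from zenodo.org; note that the
    # sentence splitter's class name is part of the cache folder, so switching splitters triggers
    # a fresh conversion.
    #
    #     corpus = RE_ENGLISH_DRUGPROT(sentence_splitter=SegtokSentenceSplitter())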

    def extract_and_convert_to_conllu(self, data_file, data_folder):
        import zipfile

        splits = ["training", "development"]
        target_filenames = ["drugprot-train.conllu", "drugprot-dev.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:
            for split, target_filename in zip(splits, target_filenames):
                pmid_to_entities = defaultdict(dict)
                pmid_to_relations = defaultdict(set)

                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_entities.tsv"
                ) as entites_file:
                    for line in io.TextIOWrapper(entites_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, ent_id, ent_type, start, end, mention = fields
                        pmid_to_entities[pmid][ent_id] = (ent_type, int(start), int(end), mention)

                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_relations.tsv"
                ) as relations_file:
                    for line in io.TextIOWrapper(relations_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, rel_type, arg1, arg2 = fields
                        ent1 = arg1.split(":")[1]
                        ent2 = arg2.split(":")[1]
                        pmid_to_relations[pmid].add((rel_type, ent1, ent2))

                tokenlists: List[conllu.TokenList] = []
                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_abstracs.tsv"
                ) as abstracts_file:
                    for line in io.TextIOWrapper(abstracts_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, title, abstract = fields
                        title_sentences = self.sentence_splitter.split(title)
                        abstract_sentences = self.sentence_splitter.split(abstract)

                        tokenlists.extend(
                            self.drugprot_document_to_tokenlists(
                                pmid=pmid,
                                title_sentences=title_sentences,
                                abstract_sentences=abstract_sentences,
                                abstract_offset=len(title) + 1,
                                entities=pmid_to_entities[pmid],
                                relations=pmid_to_relations[pmid],
                            )
                        )

                target_file_path = Path(data_folder) / target_filename
                with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                    # write CoNLL-U Plus header
                    target_file.write("# global.columns = id form ner ner-2\n")

                    for tokenlist in tokenlists:
                        target_file.write(tokenlist.serialize())

    def char_spans_to_token_spans(self, char_spans, token_offsets):
        token_starts = [s[0] for s in token_offsets]
        token_ends = [s[1] for s in token_offsets]

        token_spans = []
        for char_start, char_end in char_spans:
            token_start = bisect.bisect_right(token_ends, char_start)
            token_end = bisect.bisect_left(token_starts, char_end)
            token_spans.append((token_start, token_end))

        return token_spans
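
    # For example, char_spans_to_token_spans([(0, 7)], [(0, 3), (4, 9)]) returns [(0, 2)]:
    # the character span covers both tokens, and the returned token end index is exclusive.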

    def has_overlap(self, a, b):
        if a is None or b is None:
            return False

        return max(0, min(a[1], b[1]) - max(a[0], b[0])) > 0

    def drugprot_document_to_tokenlists(
        self,
        pmid: str,
        title_sentences: List[Sentence],
        abstract_sentences: List[Sentence],
        abstract_offset: int,
        entities: Dict[str, Tuple[str, int, int, str]],
        relations: Set[Tuple[str, str, str]],
    ) -> List[conllu.TokenList]:
        tokenlists: List[conllu.TokenList] = []
        sentence_id = 1
        for offset, sents in [(0, title_sentences), (abstract_offset, abstract_sentences)]:
            for sent in sents:

                sent_char_start = sent.start_pos + offset
                sent_char_end = sent.end_pos + offset

                entities_in_sent = set()
                for entity_id, (_, char_start, char_end, _) in entities.items():
                    if sent_char_start <= char_start and char_end <= sent_char_end:
                        entities_in_sent.add(entity_id)

                entity_char_spans = [(entities[entity_id][1], entities[entity_id][2]) for entity_id in entities_in_sent]

                token_offsets = [
                    (sent.start_pos + token.start_pos + offset, sent.start_pos + token.end_pos + offset)
                    for token in sent.tokens
                ]
                entity_token_spans = self.char_spans_to_token_spans(entity_char_spans, token_offsets)

                tags_1 = ["O"] * len(sent)
                tags_2 = ["O"] * len(sent)
                entity_id_to_token_idx = {}

                # longer entity spans are assigned first, so that a shorter span overlapping an
                # already tagged one is written to the second tag row (ner-2)
                ordered_entities = sorted(
                    zip(entities_in_sent, entity_token_spans), key=lambda x: x[1][1] - x[1][0], reverse=True
                )

                for entity_id, entity_span in ordered_entities:

                    entity_id_to_token_idx[entity_id] = entity_span

                    # check if first tag row is already occupied
                    token_start, token_end = entity_span
                    tag_1_occupied = False
                    for i in range(token_start, token_end):
                        if tags_1[i] != "O":
                            tag_1_occupied = True

                    # if first tag row is occupied, use second tag row
                    tags = tags_2 if tag_1_occupied else tags_1

                    tag = entities[entity_id][0]
                    token_start, token_end = entity_span
                    for i in range(token_start, token_end):
                        if i == token_start:
                            prefix = "B-"
                        else:
                            prefix = "I-"

                        tags[i] = prefix + tag

                token_dicts = []
                for i, (token, tag_1, tag_2) in enumerate(zip(sent, tags_1, tags_2)):
                    # hardcoded mapping TODO: perhaps find nicer solution
                    tag_1 = tag_1.replace("GENE-N", "GENE")
                    tag_1 = tag_1.replace("GENE-Y", "GENE")
                    tag_2 = tag_2.replace("GENE-N", "GENE")
                    tag_2 = tag_2.replace("GENE-Y", "GENE")

                    token_dicts.append(
                        {
                            "id": str(i + 1),
                            "form": token.text,
                            "ner": tag_1,
                            "ner-2": tag_2,
                        }
                    )

                relations_in_sent = []
                for relation, ent1, ent2 in [r for r in relations if {r[1], r[2]} <= entities_in_sent]:
                    subj_start = entity_id_to_token_idx[ent1][0]
                    subj_end = entity_id_to_token_idx[ent1][1]
                    obj_start = entity_id_to_token_idx[ent2][0]
                    obj_end = entity_id_to_token_idx[ent2][1]
                    relations_in_sent.append((subj_start, subj_end, obj_start, obj_end, relation))

                metadata = {
                    "text": sent.to_original_text(),
                    "doc_id": pmid,
                    "sentence_id": str(sentence_id),
                    "relations": "|".join(
                        [
                            ";".join([str(subj_start + 1), str(subj_end), str(obj_start + 1), str(obj_end), relation])
                            for subj_start, subj_end, obj_start, obj_end, relation in relations_in_sent
                        ]
                    ),
                }

                tokenlists.append(conllu.TokenList(tokens=token_dicts, metadata=metadata))

                sentence_id += 1

        return tokenlists