Coverage for /home/ubuntu/Documents/Research/mut_p1/flair/flair/datasets/relation_extraction.py: 10% (375 statements)

import bisect
import io
import json
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import List, Union, Sequence, Dict, Any, Tuple, Set

import conllu
import gdown

import flair
from flair.data import Sentence
from flair.datasets.conllu import CoNLLUCorpus
from flair.file_utils import cached_path
from flair.tokenization import (
    SentenceSplitter,
    SegtokSentenceSplitter,
)

log = logging.getLogger("flair")


def convert_ptb_token(token: str) -> str:
    """Convert PTB tokens to normal tokens"""
    return {
        "-lrb-": "(",
        "-rrb-": ")",
        "-lsb-": "[",
        "-rsb-": "]",
        "-lcb-": "{",
        "-rcb-": "}",
    }.get(token.lower(), token)
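
# Illustration (added for clarity; not part of the original module): the mapping above
# restores literal brackets from their PTB escape codes and leaves all other tokens as-is.
#
#   convert_ptb_token("-LRB-")  # -> "("
#   convert_ptb_token("-RSB-")  # -> "]"
#   convert_ptb_token("dog")    # -> "dog"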



class RE_ENGLISH_SEMEVAL2010(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, augment_train: bool = False):
        """
        SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of
        Nominals: https://aclanthology.org/S10-1006.pdf
        :param base_path: folder in which to store the dataset; defaults to Flair's cache root
        :param in_memory: if True, keep the dataset fully in memory
        :param augment_train: if True, also add the inverted relation for each training example
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        semeval_2010_task_8_url = (
            "https://drive.google.com/uc?id=0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk"
        )
        train_file_name = "semeval2010-task8-train-aug.conllu" if augment_train else "semeval2010-task8-train.conllu"
        data_file = data_folder / train_file_name

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            source_data_file = source_data_folder / "SemEval2010_task8_all_data.zip"
            os.makedirs(source_data_folder, exist_ok=True)
            gdown.download(semeval_2010_task_8_url, str(source_data_file))
            self.extract_and_convert_to_conllu(
                data_file=source_data_file,
                data_folder=data_folder,
                augment_train=augment_train,
            )

        super(RE_ENGLISH_SEMEVAL2010, self).__init__(
            data_folder,
            train_file=train_file_name,
            test_file="semeval2010-task8-test.conllu",
            token_annotation_fields=["ner"],
            in_memory=in_memory,
        )


    def extract_and_convert_to_conllu(self, data_file, data_folder, augment_train):
        import zipfile

        source_file_paths = [
            "SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT",
            "SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT",
        ]
        train_filename = "semeval2010-task8-train-aug.conllu" if augment_train else "semeval2010-task8-train.conllu"
        target_filenames = [train_filename, "semeval2010-task8-test.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:

            for source_file_path, target_filename in zip(source_file_paths, target_filenames):
                with zip_file.open(source_file_path, mode="r") as source_file:

                    target_file_path = Path(data_folder) / target_filename
                    with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                        # write CoNLL-U Plus header
                        target_file.write("# global.columns = id form ner\n")

                        raw_lines = []
                        for line in io.TextIOWrapper(source_file, encoding="utf-8"):
                            line = line.strip()

                            if not line:
                                token_list = self._semeval_lines_to_token_list(
                                    raw_lines,
                                    augment_relations=augment_train if "train" in target_filename else False,
                                )
                                target_file.write(token_list.serialize())

                                raw_lines = []
                                continue

                            raw_lines.append(line)


    def _semeval_lines_to_token_list(self, raw_lines, augment_relations):
        raw_id, raw_text = raw_lines[0].split("\t")
        label = raw_lines[1]
        id_ = int(raw_id)
        raw_text = raw_text.strip('"')

        # Some special cases (e.g., missing spaces before entity marker)
        if id_ in [213, 4612, 6373, 8411, 9867]:
            raw_text = raw_text.replace("<e2>", " <e2>")
        if id_ in [2740, 4219, 4784]:
            raw_text = raw_text.replace("<e1>", " <e1>")
        if id_ == 9256:
            raw_text = raw_text.replace("log- jam", "log-jam")

        # necessary if text should be whitespace tokenizeable
        if id_ in [2609, 7589]:
            raw_text = raw_text.replace("1 1/2", "1-1/2")
        if id_ == 10591:
            raw_text = raw_text.replace("1 1/4", "1-1/4")
        if id_ == 10665:
            raw_text = raw_text.replace("6 1/2", "6-1/2")

        raw_text = re.sub(r"([.,!?()])$", r" \1", raw_text)
        raw_text = re.sub(r"(e[12]>)([',;:\"\(\)])", r"\1 \2", raw_text)
        raw_text = re.sub(r"([',;:\"\(\)])(</?e[12])", r"\1 \2", raw_text)
        raw_text = raw_text.replace("<e1>", "<e1> ")
        raw_text = raw_text.replace("<e2>", "<e2> ")
        raw_text = raw_text.replace("</e1>", " </e1>")
        raw_text = raw_text.replace("</e2>", " </e2>")

        tokens = raw_text.split(" ")

        # Handle case where tail may occur before the head
        subj_start = tokens.index("<e1>")
        obj_start = tokens.index("<e2>")
        if subj_start < obj_start:
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)
            obj_start = tokens.index("<e2>")
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
        else:
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
            subj_start = tokens.index("<e1>")
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)

        relation = ";".join([str(subj_start + 1), str(subj_end), str(obj_start + 1), str(obj_end), label])

        if augment_relations:
            label_inverted = label.replace("e1", "e3")
            label_inverted = label_inverted.replace("e2", "e1")
            label_inverted = label_inverted.replace("e3", "e2")
            relation_inverted = ";".join(
                [str(obj_start + 1), str(obj_end), str(subj_start + 1), str(subj_end), label_inverted]
            )

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": relation + "|" + relation_inverted if augment_relations else relation,
        }

        token_dicts = []
        for idx, token in enumerate(tokens):
            tag = "O"
            prefix = ""

            if subj_start <= idx < subj_end:
                prefix = "B-" if idx == subj_start else "I-"
                tag = "E1"
            elif obj_start <= idx < obj_end:
                prefix = "B-" if idx == obj_start else "I-"
                tag = "E2"

            token_dicts.append(
                {
                    "id": str(idx + 1),
                    "form": token,
                    "ner": prefix + tag,
                }
            )

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)
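
# Usage sketch (added for illustration; assumes the class is re-exported via flair.datasets,
# as is usual for Flair dataset classes). The archive is downloaded from Google Drive on
# first use and converted to CoNLL-U Plus files in Flair's cache folder; each sentence
# carries a "relations" metadata string of the form "subj_start;subj_end;obj_start;obj_end;label",
# with multiple relations joined by "|".
#
#   from flair.datasets import RE_ENGLISH_SEMEVAL2010
#
#   corpus = RE_ENGLISH_SEMEVAL2010(augment_train=True)
#   print(corpus)  # train/dev/test statistics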



class RE_ENGLISH_TACRED(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        TAC Relation Extraction Dataset with 41 relations from https://nlp.stanford.edu/projects/tacred/.
        Manual download is required for this dataset.
        :param base_path: folder in which the manually downloaded data is stored; defaults to Flair's cache root
        :param in_memory: if True, keep the dataset fully in memory
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "tacred-train.conllu"

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            source_data_file = source_data_folder / "TACRED_LDC.zip"
            os.makedirs(source_data_folder, exist_ok=True)
            self.extract_and_convert_to_conllu(
                data_file=source_data_file,
                data_folder=data_folder,
            )

        super(RE_ENGLISH_TACRED, self).__init__(
            data_folder,
            token_annotation_fields=["ner"],
            in_memory=in_memory,
        )


    def extract_and_convert_to_conllu(self, data_file, data_folder):
        import zipfile

        source_file_paths = [
            "tacred/data/json/train.json",
            "tacred/data/json/dev.json",
            "tacred/data/json/test.json",
        ]
        target_filenames = ["tacred-train.conllu", "tacred-dev.conllu", "tacred-test.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:

            for source_file_path, target_filename in zip(source_file_paths, target_filenames):
                with zip_file.open(source_file_path, mode="r") as source_file:

                    target_file_path = Path(data_folder) / target_filename
                    with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                        # write CoNLL-U Plus header
                        target_file.write("# global.columns = id form ner\n")

                        for example in json.load(source_file):
                            token_list = self._tacred_example_to_token_list(example)
                            target_file.write(token_list.serialize())

    def _tacred_example_to_token_list(self, example: Dict[str, Any]) -> conllu.TokenList:
        id_ = example["id"]
        tokens = example["token"]
        ner = example["stanford_ner"]

        subj_start = example["subj_start"]
        subj_end = example["subj_end"]
        obj_start = example["obj_start"]
        obj_end = example["obj_end"]

        subj_tag = example["subj_type"]
        obj_tag = example["obj_type"]

        label = example["relation"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": ";".join(
                [str(subj_start + 1), str(subj_end + 1), str(obj_start + 1), str(obj_end + 1), label]
            ),
        }

        prev_tag = None
        token_dicts = []
        for idx, (token, tag) in enumerate(zip(tokens, ner)):
            if subj_start <= idx <= subj_end:
                tag = subj_tag

            if obj_start <= idx <= obj_end:
                tag = obj_tag

            prefix = ""
            if tag != "O":
                if tag != prev_tag:
                    prefix = "B-"
                else:
                    prefix = "I-"

            prev_tag = tag

            token_dicts.append(
                {
                    "id": str(idx + 1),
                    "form": convert_ptb_token(token),
                    "ner": prefix + tag,
                }
            )

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)
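
    # Illustration (added for clarity; the field names are exactly those read above, the
    # concrete values are made up). _tacred_example_to_token_list expects one TACRED JSON
    # example of roughly this shape:
    #
    #   example = {
    #       "id": "some_id",
    #       "token": ["He", "was", "born", "in", "Boston", "."],
    #       "stanford_ner": ["O", "O", "O", "O", "LOCATION", "O"],
    #       "subj_start": 0, "subj_end": 0, "subj_type": "PERSON",
    #       "obj_start": 4, "obj_end": 4, "obj_type": "CITY",
    #       "relation": "per:city_of_birth",
    #   }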



class RE_ENGLISH_CONLL04(CoNLLUCorpus):
    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # TODO: change data source to original CoNLL04 -- this dataset has span formatting errors
        # download data if necessary
        conll04_url = (
            "https://raw.githubusercontent.com/bekou/multihead_joint_entity_relation_extraction/master/data/CoNLL04/"
        )
        data_file = data_folder / "conll04-train.conllu"


        if not data_file.is_file():

            source_data_folder = data_folder / "original"
            cached_path(f"{conll04_url}train.txt", source_data_folder)
            cached_path(f"{conll04_url}dev.txt", source_data_folder)
            cached_path(f"{conll04_url}test.txt", source_data_folder)

            self.convert_to_conllu(
                source_data_folder=source_data_folder,
                data_folder=data_folder,
            )

        super(RE_ENGLISH_CONLL04, self).__init__(
            data_folder,
            token_annotation_fields=["ner"],
            in_memory=in_memory,
        )

    def _parse_incr(self, source_file) -> Sequence[conllu.TokenList]:
        fields = ["id", "form", "ner", "relations", "relation_heads"]
        field_parsers = {
            "relations": lambda line, i: json.loads(line[i].replace("'", '"')),
            "relation_heads": lambda line, i: json.loads(line[i]),
        }
        metadata_parsers = {"__fallback__": lambda k, v: tuple(k.split())}

        lines = []
        for index, line in enumerate(source_file):
            if index > 0 and line.startswith("#"):
                source_str = "".join(lines)
                src_token_list = conllu.parse(
                    source_str, fields=fields, field_parsers=field_parsers, metadata_parsers=metadata_parsers
                )
                lines = []
                yield src_token_list[0]

            lines.append(line)

        source_str = "".join(lines)
        src_token_list = conllu.parse(
            source_str, fields=fields, field_parsers=field_parsers, metadata_parsers=metadata_parsers
        )
        yield src_token_list[0]


    def convert_to_conllu(self, source_data_folder, data_folder):
        source_filenames = [
            "train.txt",
            "dev.txt",
            "test.txt",
        ]
        target_filenames = ["conll04-train.conllu", "conll04-dev.conllu", "conll04-test.conllu"]

        for source_filename, target_filename in zip(source_filenames, target_filenames):
            with open(source_data_folder / source_filename, mode="r") as source_file:

                with open(data_folder / target_filename, mode="w", encoding="utf-8") as target_file:
                    # write CoNLL-U Plus header
                    target_file.write("# global.columns = id form ner\n")

                    for src_token_list in self._parse_incr(source_file):
                        token_list = self._src_token_list_to_token_list(src_token_list)
                        target_file.write(token_list.serialize())

    def _bio_tags_to_spans(self, tags: List[str]) -> List[Tuple[int, int]]:
        spans = []
        span_start = 0
        span_end = 0
        active_conll_tag = None
        for index, tag in enumerate(tags):
            bio_tag = tag[0]
            conll_tag = tag[2:]
            if bio_tag == "O":
                # The span has ended.
                if active_conll_tag is not None:
                    spans.append((span_start, span_end))
                active_conll_tag = None
                continue
            elif bio_tag == "B" or (bio_tag == "I" and conll_tag != active_conll_tag):
                # We are entering a new span; reset indices
                # and active tag to new span.
                if active_conll_tag is not None:
                    spans.append((span_start, span_end))
                active_conll_tag = conll_tag
                span_start = index
                span_end = index
            elif bio_tag == "I" and conll_tag == active_conll_tag:
                # We're inside a span.
                span_end += 1
            else:
                raise Exception("That should never happen.")

        # Last token might have been a part of a valid span.
        if active_conll_tag is not None:
            spans.append((span_start, span_end))

        return spans
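
    # Illustration (added for clarity): spans are returned as inclusive (start, end)
    # token indices, e.g.
    #   self._bio_tags_to_spans(["B-Loc", "I-Loc", "O", "B-Peop"])  ->  [(0, 1), (3, 3)]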


    def _src_token_list_to_token_list(self, src_token_list):
        tokens = []
        token_dicts = []
        ner_tags = []
        for index, token in enumerate(src_token_list, start=1):
            text = token["form"]
            ner_tag = token["ner"]
            tokens.append(text)
            ner_tags.append(ner_tag)

            token_dicts.append(
                {
                    "id": str(index),
                    "form": text,
                    "ner": ner_tag,
                }
            )

        span_end_to_span = {end: (start, end) for start, end in self._bio_tags_to_spans(ner_tags)}

        relations = []
        for index, token in enumerate(src_token_list):
            for relation, head in zip(token["relations"], token["relation_heads"]):
                if relation == "N":
                    continue

                subj_start, subj_end = span_end_to_span[index]
                obj_start, obj_end = span_end_to_span[head]
                relations.append((subj_start, subj_end, obj_start, obj_end, relation))

        doc_id = src_token_list.metadata["doc"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": doc_id,
            "relations": "|".join(
                [
                    ";".join([str(subj_start + 1), str(subj_end + 1), str(obj_start + 1), str(obj_end + 1), relation])
                    for subj_start, subj_end, obj_start, obj_end, relation in relations
                ]
            ),
        }

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)
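
# Usage sketch for RE_ENGLISH_CONLL04 (added for illustration; assumes the class is
# re-exported via flair.datasets). The CoNLL04 files are fetched from the
# multihead_joint_entity_relation_extraction repository and converted to CoNLL-U Plus,
# with 1-based, inclusive entity spans in the "relations" metadata:
#
#   from flair.datasets import RE_ENGLISH_CONLL04
#
#   corpus = RE_ENGLISH_CONLL04()
#   print(corpus)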



class RE_ENGLISH_DRUGPROT(CoNLLUCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
    ):
        """
        DrugProt corpus: BioCreative VII Track 1 from https://zenodo.org/record/5119892#.YSdSaVuxU5k/ on
        drug and chemical-protein interactions.
        :param base_path: folder in which to store the dataset; defaults to Flair's cache root
        :param in_memory: if True, keep the dataset fully in memory
        :param sentence_splitter: sentence splitter used to segment titles and abstracts
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        self.sentence_splitter = sentence_splitter

        # this dataset name
        dataset_name = self.__class__.__name__.lower() + "_" + type(self.sentence_splitter).__name__ + "_v3"

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        drugprot_url = (
            "https://zenodo.org/record/5119892/files/drugprot-training-development-test-background.zip"
        )
        data_file = data_folder / "drugprot-train.conllu"

        if not data_file.is_file():
            source_data_folder = data_folder / "original"
            cached_path(drugprot_url, source_data_folder)
            self.extract_and_convert_to_conllu(
                data_file=source_data_folder / "drugprot-training-development-test-background.zip",
                data_folder=data_folder,
            )

        super(RE_ENGLISH_DRUGPROT, self).__init__(
            data_folder,
            in_memory=in_memory,
            token_annotation_fields=["ner", "ner-2"],
            sample_missing_splits=False,
        )


    def extract_and_convert_to_conllu(self, data_file, data_folder):
        import zipfile

        splits = ["training", "development"]
        target_filenames = ["drugprot-train.conllu", "drugprot-dev.conllu"]

        with zipfile.ZipFile(data_file) as zip_file:
            for split, target_filename in zip(splits, target_filenames):
                pmid_to_entities = defaultdict(dict)
                pmid_to_relations = defaultdict(set)

                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_entities.tsv"
                ) as entities_file:
                    for line in io.TextIOWrapper(entities_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, ent_id, ent_type, start, end, mention = fields
                        pmid_to_entities[pmid][ent_id] = (ent_type, int(start), int(end), mention)

                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_relations.tsv"
                ) as relations_file:
                    for line in io.TextIOWrapper(relations_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, rel_type, arg1, arg2 = fields
                        ent1 = arg1.split(":")[1]
                        ent2 = arg2.split(":")[1]
                        pmid_to_relations[pmid].add((rel_type, ent1, ent2))

                tokenlists: List[conllu.TokenList] = []
                with zip_file.open(
                    f"drugprot-gs-training-development/{split}/drugprot_{split}_abstracs.tsv"
                ) as abstracts_file:
                    for line in io.TextIOWrapper(abstracts_file, encoding="utf-8"):
                        fields = line.strip().split("\t")
                        pmid, title, abstract = fields
                        title_sentences = self.sentence_splitter.split(title)
                        abstract_sentences = self.sentence_splitter.split(abstract)

                        tokenlists.extend(
                            self.drugprot_document_to_tokenlists(
                                pmid=pmid,
                                title_sentences=title_sentences,
                                abstract_sentences=abstract_sentences,
                                abstract_offset=len(title) + 1,
                                entities=pmid_to_entities[pmid],
                                relations=pmid_to_relations[pmid],
                            )
                        )

                target_file_path = Path(data_folder) / target_filename
                with open(target_file_path, mode="w", encoding="utf-8") as target_file:
                    # write CoNLL-U Plus header
                    target_file.write("# global.columns = id form ner ner-2\n")

                    for tokenlist in tokenlists:
                        target_file.write(tokenlist.serialize())


    def char_spans_to_token_spans(self, char_spans, token_offsets):
        token_starts = [s[0] for s in token_offsets]
        token_ends = [s[1] for s in token_offsets]

        token_spans = []
        for char_start, char_end in char_spans:
            token_start = bisect.bisect_right(token_ends, char_start)
            token_end = bisect.bisect_left(token_starts, char_end)
            token_spans.append((token_start, token_end))

        return token_spans
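
    # Illustration (added for clarity): token_offsets are (start, end) character offsets
    # per token; the returned token spans are half-open [start, end) token indices.
    # For example, with token_offsets [(0, 3), (4, 7), (8, 14)] ("The", "cat", "sleeps"):
    #   char span (4, 7)  -> token span (1, 2)   # just "cat"
    #   char span (0, 7)  -> token span (0, 2)   # "The cat"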


    def has_overlap(self, a, b):
        if a is None or b is None:
            return False

        return max(0, min(a[1], b[1]) - max(a[0], b[0])) > 0
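
    # Illustration (added for clarity): intervals are treated as half-open, so spans that
    # merely touch do not overlap:
    #   self.has_overlap((0, 3), (2, 5))  ->  True
    #   self.has_overlap((0, 2), (2, 5))  ->  False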


    def drugprot_document_to_tokenlists(
        self,
        pmid: str,
        title_sentences: List[Sentence],
        abstract_sentences: List[Sentence],
        abstract_offset: int,
        entities: Dict[str, Tuple[str, int, int, str]],
        relations: Set[Tuple[str, str, str]],
    ) -> List[conllu.TokenList]:
        tokenlists: List[conllu.TokenList] = []
        sentence_id = 1
        for offset, sents in [(0, title_sentences), (abstract_offset, abstract_sentences)]:
            for sent in sents:

                sent_char_start = sent.start_pos + offset
                sent_char_end = sent.end_pos + offset

                entities_in_sent = set()
                for entity_id, (_, char_start, char_end, _) in entities.items():
                    if sent_char_start <= char_start and char_end <= sent_char_end:
                        entities_in_sent.add(entity_id)

                entity_char_spans = [(entities[entity_id][1], entities[entity_id][2]) for entity_id in entities_in_sent]

                token_offsets = [
                    (sent.start_pos + token.start_pos + offset, sent.start_pos + token.end_pos + offset)
                    for token in sent.tokens
                ]
                entity_token_spans = self.char_spans_to_token_spans(entity_char_spans, token_offsets)

                tags_1 = ["O"] * len(sent)
                tags_2 = ["O"] * len(sent)
                entity_id_to_token_idx = {}

                ordered_entities = sorted(
                    zip(entities_in_sent, entity_token_spans),
                    key=lambda x: x[1][1] - x[1][0],
                    reverse=True,
                )

                for entity_id, entity_span in ordered_entities:

                    entity_id_to_token_idx[entity_id] = entity_span

                    # check if first tag row is already occupied
                    token_start, token_end = entity_span
                    tag_1_occupied = False
                    for i in range(token_start, token_end):
                        if tags_1[i] != "O":
                            tag_1_occupied = True

                    # if first tag row is occupied, use second tag row
                    tags = tags_2 if tag_1_occupied else tags_1

                    tag = entities[entity_id][0]
                    token_start, token_end = entity_span
                    for i in range(token_start, token_end):
                        if i == token_start:
                            prefix = "B-"
                        else:
                            prefix = "I-"

                        tags[i] = prefix + tag

                token_dicts = []
                for i, (token, tag_1, tag_2) in enumerate(zip(sent, tags_1, tags_2)):
                    # hardcoded mapping TODO: perhaps find nicer solution
                    tag_1 = tag_1.replace("GENE-N", "GENE")
                    tag_1 = tag_1.replace("GENE-Y", "GENE")
                    tag_2 = tag_2.replace("GENE-N", "GENE")
                    tag_2 = tag_2.replace("GENE-Y", "GENE")

                    token_dicts.append(
                        {
                            "id": str(i + 1),
                            "form": token.text,
                            "ner": tag_1,
                            "ner-2": tag_2,
                        }
                    )

                relations_in_sent = []
                for relation, ent1, ent2 in [r for r in relations if {r[1], r[2]} <= entities_in_sent]:
                    subj_start = entity_id_to_token_idx[ent1][0]
                    subj_end = entity_id_to_token_idx[ent1][1]
                    obj_start = entity_id_to_token_idx[ent2][0]
                    obj_end = entity_id_to_token_idx[ent2][1]
                    relations_in_sent.append((subj_start, subj_end, obj_start, obj_end, relation))

                metadata = {
                    "text": sent.to_original_text(),
                    "doc_id": pmid,
                    "sentence_id": str(sentence_id),
                    "relations": "|".join(
                        [
                            ";".join([str(subj_start + 1), str(subj_end), str(obj_start + 1), str(obj_end), relation])
                            for subj_start, subj_end, obj_start, obj_end, relation in relations_in_sent
                        ]
                    ),
                }

                tokenlists.append(conllu.TokenList(tokens=token_dicts, metadata=metadata))

                sentence_id += 1

        return tokenlists
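
# Usage sketch for RE_ENGLISH_DRUGPROT (added for illustration; assumes the class is
# re-exported via flair.datasets). The archive is downloaded from Zenodo on first use;
# entity tags are written to two NER columns ("ner", "ner-2") so that overlapping entity
# mentions within a sentence can both be kept.
#
#   from flair.datasets import RE_ENGLISH_DRUGPROT
#   from flair.tokenization import SegtokSentenceSplitter
#
#   corpus = RE_ENGLISH_DRUGPROT(sentence_splitter=SegtokSentenceSplitter())
#   print(corpus)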