Coverage for flair/flair/datasets/biomedical.py: 19%

2701 statements  

1import logging 

2import inspect 

3import flair 

4import ftfy 

5import json 

6import os 

7import shutil 

8import re 

9import sys 

10 

11from abc import ABC, abstractmethod 

12from collections import defaultdict, deque 

13from copy import copy 

14from lxml import etree 

15from lxml.etree import XMLSyntaxError 

16from typing import Union, Callable, Dict, List, Tuple, Iterable, Optional 

17from operator import attrgetter 

18from pathlib import Path 

19from warnings import warn 

20 

21from flair.data import MultiCorpus 

22from flair.data import Tokenizer 

23from flair.file_utils import cached_path, Tqdm, unpack_file 

24from flair.datasets import ColumnCorpus, ColumnDataset 

25from flair.tokenization import ( 

26 SentenceSplitter, 

27 SciSpacySentenceSplitter, 

28 NoSentenceSplitter, 

29 TagSentenceSplitter, 

30 SciSpacyTokenizer, 

31 NewlineSentenceSplitter, 

32 SpaceTokenizer, 

33) 

34 

35 

36DISEASE_TAG = "Disease" 

37CHEMICAL_TAG = "Chemical" 

38CELL_LINE_TAG = "CellLine" 

39GENE_TAG = "Gene" 

40SPECIES_TAG = "Species" 

41 

42SENTENCE_TAG = "[__SENT__]" 

43 

44logger = logging.getLogger("flair") 

45 

46 

47class Entity: 

48 """ 

49 Internal class to represent entities while converting biomedical NER corpora to a standardized format 

50 (only used for pre-processing purposes!). Each entity consists of the char span it addresses in 

51 the original text as well as the type of entity (e.g. Chemical, Gene, and so on). 

52 """ 

53 

54 def __init__(self, char_span: Tuple[int, int], entity_type: str): 

55 assert char_span[0] < char_span[1] 

56 self.char_span = range(*char_span) 

57 self.type = entity_type 

58 

59 def __str__(self): 

60 return ( 

61 self.type 

62 + "(" 

63 + str(self.char_span.start) 

64 + "," 

65 + str(self.char_span.stop) 

66 + ")" 

67 ) 

68 

69 def __repr__(self): 

70 return str(self) 

71 

72 def is_before(self, other_entity) -> bool: 

73 """ 

74 Checks whether this entity is located before the given one 

75 

76 :param other_entity: Entity to check 

77 """ 

78 return self.char_span.stop <= other_entity.char_span.start 

79 

80 def contains(self, other_entity) -> bool: 

81 """ 

82 Checks whether the given entity is fully contained in this entity 

83 

84 :param other_entity: Entity to check 

85 """ 

86 return ( 

87 other_entity.char_span.start >= self.char_span.start 

88 and other_entity.char_span.stop <= self.char_span.stop 

89 ) 

90 

91 def overlaps(self, other_entity) -> bool: 

92 """ 

93 Checks whether this and the given entity overlap 

94 

95 :param other_entity: Entity to check 

96 """ 

97 return ( 

98 self.char_span.start <= other_entity.char_span.start < self.char_span.stop 

99 ) or (self.char_span.start < other_entity.char_span.stop <= self.char_span.stop) 
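# Editor's note: a minimal usage sketch for Entity (not part of the original module);
# the character spans below are made up for illustration.
#
#   gene = Entity((0, 10), GENE_TAG)       # covers characters 0..9
#   sub = Entity((6, 10), GENE_TAG)
#   later = Entity((12, 20), DISEASE_TAG)
#
#   gene.contains(sub)      # True  - sub lies fully inside gene
#   gene.overlaps(sub)      # True
#   gene.is_before(later)   # True  - gene ends before later starts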

100 

101 

102class InternalBioNerDataset: 

103 """ 

104 Internal class to represent a corpus and its entities. 

105 """ 

106 

107 def __init__( 

108 self, documents: Dict[str, str], entities_per_document: Dict[str, List[Entity]] 

109 ): 

110 self.documents = documents 

111 self.entities_per_document = entities_per_document 

112 

113 

114def merge_datasets(data_sets: Iterable[InternalBioNerDataset]): 

115 all_documents = {} 

116 all_entities = {} 

117 

118 for ds in data_sets: 

119 all_documents.update(ds.documents) 

120 all_entities.update(ds.entities_per_document) 

121 

122 return InternalBioNerDataset( 

123 documents=all_documents, entities_per_document=all_entities 

124 ) 

125 

126 

127def filter_and_map_entities( 

128 dataset: InternalBioNerDataset, entity_type_to_canonical: Dict[str, str] 

129) -> InternalBioNerDataset: 

130 """ 

131 :param entity_type_to_canonical: Maps entity type in dataset to canonical type 

132 if entity type is not present in map it is discarded 

133 """ 

134 mapped_entities_per_document = {} 

135 for id, entities in dataset.entities_per_document.items(): 

136 new_entities = [] 

137 for entity in entities: 

138 if entity.type in entity_type_to_canonical: 

139 new_entity = copy(entity) 

140 new_entity.type = entity_type_to_canonical[entity.type] 

141 new_entities.append(new_entity) 

142 else: 

143 logger.debug(f"Skip entity type {entity.type}") 

144 pass 

145 mapped_entities_per_document[id] = new_entities 

146 

147 return InternalBioNerDataset( 

148 documents=dataset.documents, entities_per_document=mapped_entities_per_document 

149 ) 
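# Editor's note: a small illustrative sketch (not part of the original module).
# Entity types that are missing from the mapping are silently dropped:
#
#   dataset = InternalBioNerDataset(
#       documents={"d1": "BRCA1 is linked to breast cancer"},
#       entities_per_document={
#           "d1": [Entity((0, 5), "protein"), Entity((19, 32), "disease")]
#       },
#   )
#   gene_only = filter_and_map_entities(dataset, {"protein": GENE_TAG})
#   # gene_only.entities_per_document["d1"] -> [Gene(0,5)]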

150 

151 

152def filter_nested_entities(dataset: InternalBioNerDataset) -> None: 

153 num_entities_before = sum([len(x) for x in dataset.entities_per_document.values()]) 

154 

155 for document_id, entities in dataset.entities_per_document.items(): 

156 # Uses a dynamic programming approach to calculate the maximum independent set in the interval graph, 

157 # with the sum of all entity lengths as secondary key 

158 dp_array = [ 

159 (0, 0, 0, None) 

160 ] # position_end, number of entities, sum of all entity lengths, last entity 

161 for entity in sorted(entities, key=lambda x: x.char_span.stop): 

162 i = len(dp_array) - 1 

163 while dp_array[i][0] > entity.char_span.start: 

164 i -= 1 

165 if dp_array[i][1] + 1 > dp_array[-1][1] or ( 

166 dp_array[i][1] + 1 == dp_array[-1][1] 

167 and dp_array[i][2] + len(entity.char_span) > dp_array[-1][2] 

168 ): 

169 dp_array += [ 

170 ( 

171 entity.char_span.stop, 

172 dp_array[i][1] + 1, 

173 dp_array[i][2] + len(entity.char_span), 

174 entity, 

175 ) 

176 ] 

177 else: 

178 dp_array += [dp_array[-1]] 

179 

180 independent_set = [] 

181 p = dp_array[-1][0] 

182 for dp_entry in dp_array[::-1]: 

183 if dp_entry[3] is None: 

184 break 

185 if dp_entry[0] <= p: 

186 independent_set += [dp_entry[3]] 

187 p -= len(dp_entry[3].char_span) 

188 

189 dataset.entities_per_document[document_id] = independent_set 

190 

191 num_entities_after = sum([len(x) for x in dataset.entities_per_document.values()]) 

192 if num_entities_before != num_entities_after: 

193 removed = num_entities_before - num_entities_after 

194 warn( 

195 f"Corpus modified by filtering nested entities. Removed {removed} entities." 

196 ) 
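# Editor's note: illustrative sketch only (not part of the original module).
# When entities nest or overlap, only a maximal non-overlapping subset is kept;
# e.g. for the spans (0, 20) and (5, 10) in one document only one entity survives:
#
#   ds = InternalBioNerDataset(
#       documents={"d1": "x" * 20},
#       entities_per_document={"d1": [Entity((0, 20), GENE_TAG), Entity((5, 10), GENE_TAG)]},
#   )
#   filter_nested_entities(ds)   # modifies ds in place and emits a warning
#   # len(ds.entities_per_document["d1"]) == 1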

197 

198 

199def bioc_to_internal(bioc_file: Path): 

200 """ 

201 Helper function to parse corpora that are given in BIOC format. See 

202 

203 http://bioc.sourceforge.net/ 

204 

205 for details. 

206 """ 

207 tree = etree.parse(str(bioc_file)) 

208 texts_per_document = {} 

209 entities_per_document = {} 

210 documents = tree.xpath(".//document") 

211 

212 all_entities = 0 

213 non_matching = 0 

214 

215 for document in Tqdm.tqdm(documents, desc="Converting to internal"): 

216 document_id = document.xpath("./id")[0].text 

217 texts = [] 

218 entities = [] 

219 

220 for passage in document.xpath("passage"): 

221 passage_texts = passage.xpath("text/text()") 

222 if len(passage_texts) == 0: 

223 continue 

224 text = passage_texts[0] 

225 

226 passage_offset = int( 

227 passage.xpath("./offset/text()")[0] 

228 ) # from BioC annotation 

229 

230 # calculate offset without current text 

231 # because we stick all passages of a document together 

232 document_text = " ".join(texts) 

233 document_offset = len(document_text) 

234 

235 texts.append(text) 

236 document_text += " " + text 

237 

238 for annotation in passage.xpath(".//annotation"): 

239 

240 entity_types = [ 

241 i.text.replace(" ", "_") 

242 for i in annotation.xpath("./infon") 

243 if i.attrib["key"] in {"type", "class"} 

244 ] 

245 

246 start = ( 

247 int(annotation.xpath("./location")[0].get("offset")) 

248 - passage_offset 

249 ) 

250 # TODO: For split entities we also annotate everything in between, which might be a bad idea? 

251 final_length = int(annotation.xpath("./location")[-1].get("length")) 

252 final_offset = ( 

253 int(annotation.xpath("./location")[-1].get("offset")) 

254 - passage_offset 

255 ) 

256 if final_length <= 0: 

257 continue 

258 end = final_offset + final_length 

259 

260 start += document_offset 

261 end += document_offset 

262 

263 true_entity = annotation.xpath(".//text")[0].text 

264 annotated_entity = " ".join(texts)[start:end] 

265 

266 # Try to fix incorrect annotations 

267 if annotated_entity.lower() != true_entity.lower(): 

268 max_shift = min(3, len(true_entity)) 

269 for i in range(max_shift): 

270 index = annotated_entity.lower().find( 

271 true_entity[0 : max_shift - i].lower() 

272 ) 

273 if index != -1: 

274 start += index 

275 end += index 

276 break 

277 

278 annotated_entity = " ".join(texts)[start:end] 

279 if not annotated_entity.lower() == true_entity.lower(): 

280 non_matching += 1 

281 

282 all_entities += 1 

283 

284 for entity_type in entity_types: 

285 entities.append(Entity((start, end), entity_type)) 

286 

287 texts_per_document[document_id] = " ".join(texts) 

288 entities_per_document[document_id] = entities 

289 

290 # logger.debug( 

291 #     f"Found {non_matching} non-matching entities ({non_matching/all_entities:.2%}) in {bioc_file}" 

292 # ) 

293 

294 return InternalBioNerDataset( 

295 documents=texts_per_document, entities_per_document=entities_per_document 

296 ) 
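# Editor's note: hedged usage sketch (not part of the original module); the path is
# hypothetical. All passages of a document are concatenated with single spaces and
# the entity offsets are rebased onto that concatenated text.
#
#   dataset = bioc_to_internal(Path("/tmp/some_corpus/collection.bioc.xml"))
#   print(f"Parsed {len(dataset.documents)} documents")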

297 

298 

299def brat_to_internal(corpus_dir: Path, ann_file_suffixes=None) -> InternalBioNerDataset: 

300 """ 

301 Helper function to parse corpora that are annotated using BRAT. See 

302 

303 https://brat.nlplab.org/ 

304 

305 for details. 

306 

307 """ 

308 if ann_file_suffixes is None: 

309 ann_file_suffixes = [".ann"] 

310 

311 text_files = list(corpus_dir.glob("*.txt")) 

312 documents = {} 

313 entities_per_document = defaultdict(list) 

314 for text_file in text_files: 

315 document_text = open(str(text_file), encoding="utf8").read().strip() 

316 document_id = text_file.stem 

317 

318 for suffix in ann_file_suffixes: 

319 with open(str(text_file.with_suffix(suffix)), "r", encoding="utf8") as ann_file: 

320 for line in ann_file: 

321 fields = line.strip().split("\t") 

322 

323 # Ignore empty lines or relation annotations 

324 if not fields or len(fields) <= 2: 

325 continue 

326 

327 ent_type, char_start, char_end = fields[1].split() 

328 start = int(char_start) 

329 end = int(char_end) 

330 

331 # FIX annotation of whitespaces (necessary for PDR) 

332 while document_text[start:end].startswith(" "): 

333 start += 1 

334 

335 while document_text[start:end].endswith(" "): 

336 end -= 1 

337 

338 entities_per_document[document_id].append( 

339 Entity(char_span=(start, end), entity_type=ent_type,) 

340 ) 

341 

342 assert document_text[start:end].strip() == fields[2].strip() 

343 

344 documents[document_id] = document_text 

345 

346 return InternalBioNerDataset( 

347 documents=documents, entities_per_document=dict(entities_per_document) 

348 ) 
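# Editor's note: illustrative sketch of the expected brat standoff layout (not part
# of the original module). For every "<doc>.txt" there is an annotation file
# (default suffix ".ann") with lines such as
#
#   T1<TAB>Disease 10 18<TAB>myopathy
#
# i.e. an id, the entity type with start/end character offsets, and the covered text.
# Lines with fewer than three tab-separated fields (e.g. relation annotations) are skipped.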

349 

350 

351class CoNLLWriter: 

352 """ 

353 Class which writes corpora given as instances of :class:`InternalBioNerDataset` 

354 to CoNLL files. 

355 """ 

356 

357 def __init__( 

358 self, sentence_splitter: SentenceSplitter, 

359 ): 

360 """ 

361 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which 

362 segments the text into sentences and tokens 

363 """ 

364 self.sentence_splitter = sentence_splitter 

365 

366 def process_dataset( 

367 self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path 

368 ): 

369 self.write_to_conll(datasets["train"], out_dir / "train.conll") 

370 self.write_to_conll(datasets["dev"], out_dir / "dev.conll") 

371 self.write_to_conll(datasets["test"], out_dir / "test.conll") 

372 

373 def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): 

374 os.makedirs(str(output_file.parent), exist_ok=True) 

375 filter_nested_entities(dataset) 

376 

377 with output_file.open("w", encoding="utf8") as f: 

378 for document_id in Tqdm.tqdm( 

379 dataset.documents.keys(), 

380 total=len(dataset.documents), 

381 desc="Converting to CoNLL", 

382 ): 

383 document_text = ftfy.fix_text(dataset.documents[document_id]) 

384 document_text = re.sub( 

385 r"[\u2000-\u200B]", " ", document_text 

386 ) # replace unicode space characters! 

387 document_text = document_text.replace( 

388 "\xa0", " " 

389 ) # replace non-break space 

390 

391 entities = deque( 

392 sorted( 

393 dataset.entities_per_document[document_id], 

394 key=attrgetter("char_span.start", "char_span.stop"), 

395 ) 

396 ) 

397 current_entity = entities.popleft() if entities else None 

398 

399 sentences = self.sentence_splitter.split(document_text) 

400 

401 for sentence in sentences: 

402 in_entity = False 

403 sentence_had_tokens = False 

404 

405 for flair_token in sentence.tokens: 

406 token = flair_token.text.strip() 

407 offset = sentence.start_pos + flair_token.start_pos 

408 

409 if current_entity and offset >= current_entity.char_span.stop: 

410 in_entity = False 

411 

412 # One token may contain multiple entities -> dequeue all of them 

413 while ( 

414 current_entity 

415 and offset >= current_entity.char_span.stop 

416 ): 

417 current_entity = ( 

418 entities.popleft() if entities else None 

419 ) 

420 

421 if current_entity and offset in current_entity.char_span: 

422 if not in_entity: 

423 tag = "B-" + current_entity.type 

424 in_entity = True 

425 else: 

426 tag = "I-" + current_entity.type 

427 else: 

428 tag = "O" 

429 in_entity = False 

430 

431 whitespace_after = "+" if flair_token.whitespace_after else "-" 

432 if len(token) > 0: 

433 f.write(" ".join([token, tag, whitespace_after]) + "\n") 

434 sentence_had_tokens = True 

435 

436 if sentence_had_tokens: 

437 f.write("\n") 
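# Editor's note: minimal usage sketch (not part of the original module); the dataset
# variable below is hypothetical.
#
#   writer = CoNLLWriter(sentence_splitter=SciSpacySentenceSplitter())
#   writer.write_to_conll(my_dataset, Path("/tmp/my_corpus/train.conll"))
#
# Each output line holds "<token> <BIO-tag> <+/->", where the last column encodes
# whether the token is followed by whitespace; sentences are separated by blank lines.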

438 

439 

440class HunerDataset(ColumnCorpus, ABC): 

441 """ 

442 Base class for HUNER datasets. 

443 

444 Every subclass has to implement the following methods: 

445 - `to_internal`, which reads the complete data set (incl. train, dev, test) and returns the corpus 

446 as InternalBioNerDataset 

447 - `split_url`, which returns the base URL (i.e. without '.train', '.dev', '.test') of the HUNER split files 

448 

449 For further information see: 

450 - Weber et al.: 'HUNER: improving biomedical NER with pretraining' 

451 https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext 

452 - HUNER github repository: 

453 https://github.com/hu-ner/huner 

454 """ 

455 

456 @abstractmethod 

457 def to_internal(self, data_folder: Path) -> InternalBioNerDataset: 

458 raise NotImplementedError() 

459 

460 @staticmethod 

461 @abstractmethod 

462 def split_url() -> str: 

463 raise NotImplementedError() 

464 

465 def get_corpus_sentence_splitter(self) -> Optional[SentenceSplitter]: 

466 """ 

467 If the corpus has a pre-defined sentence splitting, this method returns the 

468 sentence splitter to be used to reconstruct the original splitting. 

469 Otherwise, None is returned. 

470 """ 

471 return None 

472 

473 def __init__( 

474 self, 

475 base_path: Union[str, Path] = None, 

476 in_memory: bool = True, 

477 sentence_splitter: SentenceSplitter = None, 

478 ): 

479 """ 

480 :param base_path: Path to the corpus on your machine 

481 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

482 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which 

483 segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

484 """ 

485 

486 if type(base_path) == str: 

487 base_path: Path = Path(base_path) 

488 

489 # column format 

490 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

491 

492 # this dataset name 

493 dataset_name = self.__class__.__name__.lower() 

494 

495 # default dataset folder is the cache root 

496 if not base_path: 

497 base_path = flair.cache_root / "datasets" 

498 data_folder = base_path / dataset_name 

499 

500 self.sentence_splitter = self.get_corpus_sentence_splitter() 

501 if not self.sentence_splitter: 

502 self.sentence_splitter = ( 

503 sentence_splitter if sentence_splitter else SciSpacySentenceSplitter() 

504 ) 

505 else: 

506 if sentence_splitter: 

507 warn( 

508 f"The corpus {self.__class__.__name__} has a pre-defined sentence splitting, " 

509 f"thus just the tokenizer of the given sentence splitter ist used" 

510 ) 

511 self.sentence_splitter.tokenizer = sentence_splitter.tokenizer 

512 

513 # Create tokenization-dependent CoNLL files. This is necessary to prevent 

514 # caching issues (e.g. loading the same corpus with different sentence splitters) 

515 train_file = data_folder / f"{self.sentence_splitter.name}_train.conll" 

516 dev_file = data_folder / f"{self.sentence_splitter.name}_dev.conll" 

517 test_file = data_folder / f"{self.sentence_splitter.name}_test.conll" 

518 

519 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

520 splits_dir = data_folder / "splits" 

521 os.makedirs(splits_dir, exist_ok=True) 

522 

523 writer = CoNLLWriter(sentence_splitter=self.sentence_splitter) 

524 internal_dataset = self.to_internal(data_folder) 

525 

526 train_data = self.get_subset(internal_dataset, "train", splits_dir) 

527 writer.write_to_conll(train_data, train_file) 

528 

529 dev_data = self.get_subset(internal_dataset, "dev", splits_dir) 

530 writer.write_to_conll(dev_data, dev_file) 

531 

532 test_data = self.get_subset(internal_dataset, "test", splits_dir) 

533 writer.write_to_conll(test_data, test_file) 

534 

535 super(HunerDataset, self).__init__( 

536 data_folder=data_folder, 

537 train_file=train_file.name, 

538 dev_file=dev_file.name, 

539 test_file=test_file.name, 

540 column_format=columns, 

541 tag_to_bioes="ner", 

542 in_memory=in_memory 

543 ) 

544 

545 def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): 

546 split_file = cached_path(f"{self.split_url()}.{split}", split_dir) 

547 

548 with split_file.open(encoding="utf8") as f: 

549 ids = [l.strip() for l in f if l.strip()] 

550 ids = sorted(id_ for id_ in ids if id_ in dataset.documents) 

551 

552 return InternalBioNerDataset( 

553 documents={k: dataset.documents[k] for k in ids}, 

554 entities_per_document={k: dataset.entities_per_document[k] for k in ids}, 

555 ) 
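# Editor's note: a hedged sketch of what a new HunerDataset subclass has to provide
# (not part of the original module); the class name and URL below are made up.
#
#   class HUNER_GENE_MY_CORPUS(HunerDataset):
#
#       @staticmethod
#       def split_url() -> str:
#           # base URL; ".train", ".dev" and ".test" are appended by get_subset()
#           return "https://example.org/splits/my_corpus"
#
#       def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
#           # download / parse the original corpus and map its entity types
#           corpus = ...  # parse into an InternalBioNerDataset
#           return filter_and_map_entities(corpus, {"protein": GENE_TAG})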

556 

557 

558class BIO_INFER(ColumnCorpus): 

559 """ 

560 Original BioInfer corpus 

561 

562 For further information see Pyysalo et al.: 

563 BioInfer: a corpus for information extraction in the biomedical domain 

564 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-50 

565 """ 

566 

567 def __init__( 

568 self, base_path: Union[str, Path] = None, in_memory: bool = True, 

569 ): 

570 """ 

571 :param base_path: Path to the corpus on your machine 

572 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

573 """ 

574 

575 if type(base_path) == str: 

576 base_path: Path = Path(base_path) 

577 

578 # column format 

579 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

580 

581 # this dataset name 

582 dataset_name = self.__class__.__name__.lower() 

583 

584 # default dataset folder is the cache root 

585 if not base_path: 

586 base_path = flair.cache_root / "datasets" 

587 data_folder = base_path / dataset_name 

588 

589 train_file = data_folder / "train.conll" 

590 

591 if not (train_file.exists()): 

592 corpus_folder = self.download_dataset(data_folder) 

593 corpus_data = self.parse_dataset(corpus_folder) 

594 

595 sentence_splitter = NoSentenceSplitter(tokenizer=SpaceTokenizer()) 

596 

597 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

598 conll_writer.write_to_conll(corpus_data, train_file) 

599 

600 super(BIO_INFER, self).__init__( 

601 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

602 ) 

603 

604 @classmethod 

605 def download_dataset(cls, data_dir: Path) -> Path: 

606 data_url = "http://mars.cs.utu.fi/BioInfer/files/BioInfer_corpus_1.1.1.zip" 

607 data_path = cached_path(data_url, data_dir) 

608 unpack_file(data_path, data_dir) 

609 

610 return data_dir / "BioInfer_corpus_1.1.1.xml" 

611 

612 @classmethod 

613 def parse_dataset(cls, original_file: Path): 

614 documents = {} 

615 entities_per_document = {} 

616 

617 tree = etree.parse(str(original_file)) 

618 sentence_elems = tree.xpath("//sentence") 

619 for sentence_id, sentence in enumerate(sentence_elems): 

620 sentence_id = str(sentence_id) 

621 token_id_to_span = {} 

622 sentence_text = "" 

623 entities_per_document[sentence_id] = [] 

624 

625 for token in sentence.xpath(".//token"): 

626 token_text = "".join(token.xpath(".//subtoken/@text")) 

627 token_id = ".".join(token.attrib["id"].split(".")[1:]) 

628 

629 if not sentence_text: 

630 token_id_to_span[token_id] = (0, len(token_text)) 

631 sentence_text = token_text 

632 else: 

633 token_id_to_span[token_id] = ( 

634 len(sentence_text) + 1, 

635 len(token_text) + len(sentence_text) + 1, 

636 ) 

637 sentence_text += " " + token_text 

638 documents[sentence_id] = sentence_text 

639 

640 entities = [ 

641 e for e in sentence.xpath(".//entity") if not e.attrib["type"].isupper() 

642 ] # all caps entity type apparently marks event trigger 

643 

644 for entity in entities: 

645 token_nums = [] 

646 entity_character_starts = [] 

647 entity_character_ends = [] 

648 

649 for subtoken in entity.xpath(".//nestedsubtoken"): 

650 token_id_parts = subtoken.attrib["id"].split(".") 

651 token_id = ".".join(token_id_parts[1:3]) 

652 

653 token_nums.append(int(token_id_parts[2])) 

654 entity_character_starts.append(token_id_to_span[token_id][0]) 

655 entity_character_ends.append(token_id_to_span[token_id][1]) 

656 

657 if token_nums and entity_character_starts and entity_character_ends: 

658 entity_tokens = list( 

659 zip(token_nums, entity_character_starts, entity_character_ends) 

660 ) 

661 

662 start_token = entity_tokens[0] 

663 last_entity_token = entity_tokens[0] 

664 for entity_token in entity_tokens[1:]: 

665 if not (entity_token[0] - 1) == last_entity_token[0]: 

666 entities_per_document[sentence_id].append( 

667 Entity( 

668 char_span=(start_token[1], last_entity_token[2]), 

669 entity_type=entity.attrib["type"], 

670 ) 

671 ) 

672 start_token = entity_token 

673 

674 last_entity_token = entity_token 

675 

676 if start_token: 

677 entities_per_document[sentence_id].append( 

678 Entity( 

679 char_span=(start_token[1], last_entity_token[2]), 

680 entity_type=entity.attrib["type"], 

681 ) 

682 ) 

683 

684 return InternalBioNerDataset( 

685 documents=documents, entities_per_document=entities_per_document 

686 ) 

687 

688 

689class HUNER_GENE_BIO_INFER(HunerDataset): 

690 """ 

691 HUNER version of the BioInfer corpus containing only gene/protein annotations 

692 """ 

693 def __init__(self, *args, **kwargs): 

694 super().__init__(*args, **kwargs) 

695 

696 @staticmethod 

697 def split_url() -> str: 

698 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bioinfer" 

699 

700 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

701 original_file = BIO_INFER.download_dataset(data_dir) 

702 corpus = BIO_INFER.parse_dataset(original_file) 

703 

704 entity_type_mapping = { 

705 "Individual_protein": GENE_TAG, 

706 "Gene/protein/RNA": GENE_TAG, 

707 "Gene": GENE_TAG, 

708 "DNA_family_or_group": GENE_TAG, 

709 } 

710 

711 return filter_and_map_entities(corpus, entity_type_mapping) 

712 

713 

714class JNLPBA(ColumnCorpus): 

715 """ 

716 Original corpus of the JNLPBA shared task. 

717 

718 For further information see Kim et al.: 

719 Introduction to the Bio-Entity Recognition Task at JNLPBA 

720 https://www.aclweb.org/anthology/W04-1213.pdf 

721 """ 

722 

723 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

724 """ 

725 :param base_path: Path to the corpus on your machine 

726 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

727 """ 

728 

729 if type(base_path) == str: 

730 base_path: Path = Path(base_path) 

731 

732 # column format 

733 columns = {0: "text", 1: "ner"} 

734 

735 # this dataset name 

736 dataset_name = self.__class__.__name__.lower() 

737 

738 # default dataset folder is the cache root 

739 if not base_path: 

740 base_path = flair.cache_root / "datasets" 

741 data_folder = base_path / dataset_name 

742 

743 train_file = data_folder / "train.conll" 

744 test_file = data_folder / "test.conll" 

745 

746 if not (train_file.exists() and test_file.exists()): 

747 download_dir = data_folder / "original" 

748 os.makedirs(download_dir, exist_ok=True) 

749 

750 train_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz" 

751 train_data_path = cached_path(train_data_url, download_dir) 

752 unpack_file(train_data_path, download_dir) 

753 

754 train_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz" 

755 train_data_path = cached_path(train_data_url, download_dir) 

756 unpack_file(train_data_path, download_dir) 

757 

758 train_file = download_dir / "Genia4ERtask2.iob2" 

759 shutil.copy(train_file, data_folder / "train.conll") 

760 

761 test_file = download_dir / "Genia4EReval2.iob2" 

762 shutil.copy(test_file, data_folder / "test.conll") 

763 

764 super(JNLPBA, self).__init__( 

765 data_folder, 

766 columns, 

767 tag_to_bioes="ner", 

768 in_memory=in_memory, 

769 comment_symbol="#", 

770 ) 

771 

772 

773class HunerJNLPBA(object): 

774 @classmethod 

775 def download_and_prepare_train( 

776 cls, data_folder: Path, sentence_tag: str 

777 ) -> InternalBioNerDataset: 

778 train_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz" 

779 train_data_path = cached_path(train_data_url, data_folder) 

780 unpack_file(train_data_path, data_folder) 

781 

782 train_input_file = data_folder / "Genia4ERtask2.iob2" 

783 return cls.read_file(train_input_file, sentence_tag) 

784 

785 @classmethod 

786 def download_and_prepare_test( 

787 cls, data_folder: Path, sentence_tag: str 

788 ) -> InternalBioNerDataset: 

789 test_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz" 

790 test_data_path = cached_path(test_data_url, data_folder) 

791 unpack_file(test_data_path, data_folder) 

792 

793 test_input_file = data_folder / "Genia4EReval2.iob2" 

794 return cls.read_file(test_input_file, sentence_tag) 

795 

796 @classmethod 

797 def read_file( 

798 cls, input_iob_file: Path, sentence_tag: str 

799 ) -> InternalBioNerDataset: 

800 documents = {} 

801 entities_per_document = defaultdict(list) 

802 

803 with open(str(input_iob_file), "r", encoding="utf8") as file_reader: 

804 document_id = None 

805 document_text = None 

806 

807 entities = [] 

808 entity_type = None 

809 entity_start = 0 

810 

811 for line in file_reader: 

812 line = line.strip() 

813 if line[:3] == "###": 

814 if not (document_id is None and document_text is None): 

815 documents[document_id] = document_text 

816 entities_per_document[document_id] = entities 

817 

818 document_id = line.split(":")[-1] 

819 document_text = None 

820 

821 entities = [] 

822 entity_type = None 

823 entity_start = 0 

824 

825 next(file_reader) 

826 continue 

827 

828 if line: 

829 parts = line.split() 

830 token = parts[0].strip() 

831 tag = parts[1].strip() 

832 

833 if tag.startswith("B-"): 

834 if entity_type is not None: 

835 entities.append( 

836 Entity((entity_start, len(document_text)), entity_type) 

837 ) 

838 

839 entity_start = len(document_text) + 1 if document_text else 0 

840 entity_type = tag[2:] 

841 

842 elif tag == "O" and entity_type is not None: 

843 entities.append( 

844 Entity((entity_start, len(document_text)), entity_type) 

845 ) 

846 entity_type = None 

847 

848 document_text = ( 

849 document_text + " " + token if document_text else token 

850 ) 

851 

852 else: 

853 document_text += sentence_tag 

854 

855 # Edge case: last token starts a new entity 

856 if entity_type is not None: 

857 entities.append( 

858 Entity((entity_start, len(document_text)), entity_type) 

859 ) 

860 

861 # Last document in file 

862 if not (document_id is None and document_text is None): 

863 documents[document_id] = document_text 

864 entities_per_document[document_id] = entities 

865 

866 return InternalBioNerDataset( 

867 documents=documents, entities_per_document=entities_per_document 

868 ) 
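# Editor's note: illustrative sketch of the IOB2 layout this parser expects (not part
# of the original module); the id and tokens are made up.
#
#   ###MEDLINE:97050443            <- document header; the id is taken after the ':'
#   (the line directly after the header is skipped by the parser)
#   IL-2<TAB>B-protein
#   gene<TAB>I-protein
#   expression<TAB>O
#   <blank line>                   <- sentence boundary, replaced by `sentence_tag`
#
# Tokens are joined with single spaces (sentence boundaries by `sentence_tag`), and
# entity offsets refer to this rebuilt document text.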

869 

870 

871class HUNER_GENE_JNLPBA(HunerDataset): 

872 """ 

873 HUNER version of the JNLPBA corpus containing gene annotations. 

874 """ 

875 

876 def __init__(self, *args, **kwargs): 

877 super().__init__(*args, **kwargs) 

878 

879 @staticmethod 

880 def split_url() -> str: 

881 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia" 

882 

883 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

884 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

885 

886 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

887 orig_folder = data_dir / "original" 

888 os.makedirs(str(orig_folder), exist_ok=True) 

889 

890 sentence_separator = " " 

891 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

892 sentence_separator = self.sentence_splitter.tag 

893 

894 train_data = HunerJNLPBA.download_and_prepare_train(orig_folder, sentence_separator) 

895 train_data = filter_and_map_entities(train_data, {"protein": GENE_TAG}) 

896 

897 test_data = HunerJNLPBA.download_and_prepare_test(orig_folder, sentence_separator) 

898 test_data = filter_and_map_entities(test_data, {"protein": GENE_TAG}) 

899 

900 return merge_datasets([train_data, test_data]) 

901 

902 

903class HUNER_CELL_LINE_JNLPBA(HunerDataset): 

904 """ 

905 HUNER version of the JNLPBA corpus containing cell line annotations. 

906 """ 

907 

908 def __init__(self, *args, **kwargs): 

909 super().__init__(*args, **kwargs) 

910 

911 @staticmethod 

912 def split_url() -> str: 

913 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia" 

914 

915 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

916 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

917 

918 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

919 download_folder = data_dir / "original" 

920 os.makedirs(str(download_folder), exist_ok=True) 

921 

922 sentence_separator = " " 

923 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

924 sentence_separator = self.sentence_splitter.tag 

925 

926 train_data = HunerJNLPBA.download_and_prepare_train(download_folder, sentence_separator) 

927 train_data = filter_and_map_entities(train_data, {"cell_line": CELL_LINE_TAG}) 

928 

929 test_data = HunerJNLPBA.download_and_prepare_test(download_folder, sentence_separator) 

930 test_data = filter_and_map_entities(test_data, {"cell_line": CELL_LINE_TAG}) 

931 

932 return merge_datasets([train_data, test_data]) 

933 

934 

935class CELL_FINDER(ColumnCorpus): 

936 """ 

937 Original CellFinder corpus containing cell line, species and gene annotations. 

938 

939 For further information see Neves et al.: 

940 Annotating and evaluating text for stem cell research 

941 https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf 

942 """ 

943 

944 def __init__( 

945 self, 

946 base_path: Union[str, Path] = None, 

947 in_memory: bool = True, 

948 sentence_splitter: SentenceSplitter = None, 

949 ): 

950 """ 

951 :param base_path: Path to the corpus on your machine 

952 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

953 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which segments 

954 the text into sentences and tokens. 

955 """ 

956 if type(base_path) == str: 

957 base_path: Path = Path(base_path) 

958 

959 # column format 

960 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

961 

962 # this dataset name 

963 dataset_name = self.__class__.__name__.lower() 

964 

965 if sentence_splitter is None: 

966 sentence_splitter = SciSpacySentenceSplitter() 

967 

968 # default dataset folder is the cache root 

969 if not base_path: 

970 base_path = flair.cache_root / "datasets" 

971 data_folder = base_path / dataset_name 

972 

973 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

974 if not (train_file.exists()): 

975 train_corpus = self.download_and_prepare(data_folder) 

976 

977 writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

978 writer.write_to_conll(train_corpus, train_file) 

979 

980 super(CELL_FINDER, self).__init__( 

981 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

982 ) 

983 

984 @classmethod 

985 def download_and_prepare(cls, data_folder: Path) -> InternalBioNerDataset: 

986 data_url = "https://www.informatik.hu-berlin.de/de/forschung/gebiete/wbi/resources/cellfinder/cellfinder1_brat.tar.gz" 

987 data_path = cached_path(data_url, data_folder) 

988 unpack_file(data_path, data_folder) 

989 

990 return cls.read_folder(data_folder) 

991 

992 @classmethod 

993 def read_folder(cls, data_folder: Path) -> InternalBioNerDataset: 

994 ann_files = list(data_folder.glob("*.ann")) 

995 documents = {} 

996 entities_per_document = defaultdict(list) 

997 for ann_file in ann_files: 

998 with ann_file.open(encoding="utf8") as f_ann, ann_file.with_suffix(".txt").open(encoding="utf8") as f_txt: 

999 document_text = f_txt.read().strip() 

1000 

1001 document_id = ann_file.stem 

1002 documents[document_id] = document_text 

1003 

1004 for line in f_ann: 

1005 fields = line.strip().split("\t") 

1006 if not fields: 

1007 continue 

1008 ent_type, char_start, char_end = fields[1].split() 

1009 entities_per_document[document_id].append( 

1010 Entity( 

1011 char_span=(int(char_start), int(char_end)), 

1012 entity_type=ent_type, 

1013 ) 

1014 ) 

1015 

1016 assert document_text[int(char_start) : int(char_end)] == fields[2] 

1017 

1018 return InternalBioNerDataset( 

1019 documents=documents, entities_per_document=dict(entities_per_document) 

1020 ) 

1021 

1022 

1023class HUNER_CELL_LINE_CELL_FINDER(HunerDataset): 

1024 """ 

1025 HUNER version of the CellFinder corpus containing only cell line annotations. 

1026 """ 

1027 def __init__(self, *args, **kwargs): 

1028 super().__init__(*args, **kwargs) 

1029 

1030 @staticmethod 

1031 def split_url() -> str: 

1032 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_cellline" 

1033 

1034 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1035 data = CELL_FINDER.download_and_prepare(data_dir) 

1036 data = filter_and_map_entities(data, {"CellLine": CELL_LINE_TAG}) 

1037 

1038 return data 

1039 

1040 

1041class HUNER_SPECIES_CELL_FINDER(HunerDataset): 

1042 """ 

1043 HUNER version of the CellFinder corpus containing only species annotations. 

1044 """ 

1045 def __init__(self, *args, **kwargs): 

1046 super().__init__(*args, **kwargs) 

1047 

1048 @staticmethod 

1049 def split_url() -> str: 

1050 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_species" 

1051 

1052 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1053 data = CELL_FINDER.download_and_prepare(data_dir) 

1054 data = filter_and_map_entities(data, {"Species": SPECIES_TAG}) 

1055 

1056 return data 

1057 

1058 

1059class HUNER_GENE_CELL_FINDER(HunerDataset): 

1060 """ 

1061 HUNER version of the CellFinder corpus containing only gene annotations. 

1062 """ 

1063 def __init__(self, *args, **kwargs): 

1064 super().__init__(*args, **kwargs) 

1065 

1066 @staticmethod 

1067 def split_url() -> str: 

1068 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_protein" 

1069 

1070 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1071 data = CELL_FINDER.download_and_prepare(data_dir) 

1072 data = filter_and_map_entities(data, {"GeneProtein": GENE_TAG}) 

1073 

1074 return data 

1075 

1076 

1077class MIRNA(ColumnCorpus): 

1078 """ 

1079 Original miRNA corpus. 

1080 

1081 For further information see Bagewadi et al.: 

1082 Detecting miRNA Mentions and Relations in Biomedical Literature 

1083 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/ 

1084 """ 

1085 

1086 def __init__( 

1087 self, 

1088 base_path: Union[str, Path] = None, 

1089 in_memory: bool = True, 

1090 sentence_splitter: SentenceSplitter = None, 

1091 ): 

1092 """ 

1093 :param base_path: Path to the corpus on your machine 

1094 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1095 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` 

1096 which segments documents into sentences and tokens 

1097 (defaults to a tag-based sentence splitter 

1098 using the SciSpacy tokenizer) 

1099 """ 

1100 if type(base_path) == str: 

1101 base_path: Path = Path(base_path) 

1102 

1103 # column format 

1104 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

1105 

1106 # this dataset name 

1107 dataset_name = self.__class__.__name__.lower() 

1108 

1109 # default dataset folder is the cache root 

1110 if not base_path: 

1111 base_path = flair.cache_root / "datasets" 

1112 data_folder = base_path / dataset_name 

1113 

1114 sentence_separator = " " 

1115 if sentence_splitter is None: 

1116 sentence_separator = SENTENCE_TAG 

1117 sentence_splitter = TagSentenceSplitter( 

1118 tag=sentence_separator, tokenizer=SciSpacyTokenizer() 

1119 ) 

1120 

1121 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

1122 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

1123 

1124 if not (train_file.exists() and test_file.exists()): 

1125 download_folder = data_folder / "original" 

1126 os.makedirs(str(download_folder), exist_ok=True) 

1127 

1128 writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

1129 

1130 train_corpus = self.download_and_prepare_train( 

1131 download_folder, sentence_separator 

1132 ) 

1133 writer.write_to_conll(train_corpus, train_file) 

1134 

1135 test_corpus = self.download_and_prepare_test( 

1136 download_folder, sentence_separator 

1137 ) 

1138 writer.write_to_conll(test_corpus, test_file) 

1139 

1140 super(MIRNA, self).__init__( 

1141 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1142 ) 

1143 

1144 @classmethod 

1145 def download_and_prepare_train(cls, data_folder: Path, sentence_separator: str): 

1146 data_url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-Train-Corpus.xml" 

1147 data_path = cached_path(data_url, data_folder) 

1148 

1149 return cls.parse_file(data_path, "train", sentence_separator) 

1150 

1151 @classmethod 

1152 def download_and_prepare_test(cls, data_folder: Path, sentence_separator): 

1153 data_url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-Test-Corpus.xml" 

1154 data_path = cached_path(data_url, data_folder) 

1155 

1156 return cls.parse_file(data_path, "test", sentence_separator) 

1157 

1158 @classmethod 

1159 def parse_file( 

1160 cls, input_file: Path, split: str, sentence_separator: str 

1161 ) -> InternalBioNerDataset: 

1162 tree = etree.parse(str(input_file)) 

1163 

1164 documents = {} 

1165 entities_per_document = {} 

1166 

1167 for document in tree.xpath(".//document"): 

1168 document_id = document.get("id") + "-" + split 

1169 entities = [] 

1170 

1171 document_text = "" 

1172 for sentence in document.xpath(".//sentence"): 

1173 if document_text: 

1174 document_text += sentence_separator 

1175 

1176 sentence_offset = len(document_text) 

1177 document_text += sentence.get("text") 

1180 

1181 for entity in sentence.xpath(".//entity"): 

1182 start, end = entity.get("charOffset").split("-") 

1183 entities.append( 

1184 Entity( 

1185 ( 

1186 sentence_offset + int(start), 

1187 sentence_offset + int(end) + 1, 

1188 ), 

1189 entity.get("type"), 

1190 ) 

1191 ) 

1192 

1193 documents[document_id] = document_text 

1194 entities_per_document[document_id] = entities 

1195 

1196 return InternalBioNerDataset( 

1197 documents=documents, entities_per_document=entities_per_document 

1198 ) 
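# Editor's note: descriptive sketch, not part of the original module. Each <entity>
# element carries a charOffset attribute of the form "start-end"; the "+ 1" above
# suggests the end index is inclusive, so it is converted to an exclusive span here.
# Sentence texts are concatenated with `sentence_separator` and entity offsets are
# rebased onto the concatenated document text. A hypothetical annotation:
#
#   <entity id="..." charOffset="12-16" type="Genes/Proteins" ... />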

1199 

1200 

1201class HunerMiRNAHelper(object): 

1202 @staticmethod 

1203 def get_mirna_subset( 

1204 dataset: InternalBioNerDataset, split_url: str, split_dir: Path 

1205 ): 

1206 split_file = cached_path(split_url, split_dir) 

1207 

1208 with split_file.open(encoding="utf8") as f: 

1209 ids = [l.strip() for l in f if l.strip()] 

1210 ids = [id + "-train" for id in ids] + [id + "-test" for id in ids] 

1211 ids = sorted(id_ for id_ in ids if id_ in dataset.documents) 

1212 

1213 return InternalBioNerDataset( 

1214 documents={k: dataset.documents[k] for k in ids}, 

1215 entities_per_document={k: dataset.entities_per_document[k] for k in ids}, 

1216 ) 

1217 

1218 

1219class HUNER_GENE_MIRNA(HunerDataset): 

1220 """ 

1221 HUNER version of the miRNA corpus containing protein / gene annotations. 

1222 """ 

1223 

1224 def __init__(self, *args, **kwargs): 

1225 super().__init__(*args, **kwargs) 

1226 

1227 @staticmethod 

1228 def split_url() -> str: 

1229 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" 

1230 

1231 def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): 

1232 # In the huner split files there is no information whether a given id originates 

1233 # from the train or test file of the original corpus - so we have to adapt corpus 

1234 # splitting here 

1235 return HunerMiRNAHelper.get_mirna_subset( 

1236 dataset, f"{self.split_url()}.{split}", split_dir 

1237 ) 

1238 

1239 def get_corpus_sentence_splitter(self): 

1240 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

1241 

1242 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1243 download_folder = data_dir / "original" 

1244 os.makedirs(str(download_folder), exist_ok=True) 

1245 

1246 sentence_separator = " " 

1247 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

1248 sentence_separator = self.sentence_splitter.tag 

1249 

1250 train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) 

1251 train_data = filter_and_map_entities(train_data, {"Genes/Proteins": GENE_TAG}) 

1252 

1253 test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) 

1254 test_data = filter_and_map_entities(test_data, {"Genes/Proteins": GENE_TAG}) 

1255 

1256 return merge_datasets([train_data, test_data]) 

1257 

1258 

1259class HUNER_SPECIES_MIRNA(HunerDataset): 

1260 """ 

1261 HUNER version of the miRNA corpus containing species annotations. 

1262 """ 

1263 

1264 def __init__(self, *args, **kwargs): 

1265 super().__init__(*args, **kwargs) 

1266 

1267 @staticmethod 

1268 def split_url() -> str: 

1269 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" 

1270 

1271 def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): 

1272 # In the huner split files there is no information whether a given id originates 

1273 # from the train or test file of the original corpus - so we have to adapt corpus 

1274 # splitting here 

1275 return HunerMiRNAHelper.get_mirna_subset( 

1276 dataset, f"{self.split_url()}.{split}", split_dir 

1277 ) 

1278 

1279 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

1280 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

1281 

1282 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1283 download_folder = data_dir / "original" 

1284 os.makedirs(str(download_folder), exist_ok=True) 

1285 

1286 sentence_separator = " " 

1287 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

1288 sentence_separator = self.sentence_splitter.tag 

1289 

1290 train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) 

1291 train_data = filter_and_map_entities(train_data, {"Species": SPECIES_TAG}) 

1292 

1293 test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) 

1294 test_data = filter_and_map_entities(test_data, {"Species": SPECIES_TAG}) 

1295 

1296 return merge_datasets([train_data, test_data]) 

1297 

1298 

1299class HUNER_DISEASE_MIRNA(HunerDataset): 

1300 """ 

1301 HUNER version of the miRNA corpus containing disease annotations. 

1302 """ 

1303 

1304 def __init__(self, *args, **kwargs): 

1305 super().__init__(*args, **kwargs) 

1306 

1307 @staticmethod 

1308 def split_url() -> str: 

1309 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA" 

1310 

1311 def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path): 

1312 # In the huner split files there is no information whether a given id originates 

1313 # from the train or test file of the original corpus - so we have to adapt corpus 

1314 # splitting here 

1315 return HunerMiRNAHelper.get_mirna_subset( 

1316 dataset, f"{self.split_url()}.{split}", split_dir 

1317 ) 

1318 

1319 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

1320 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

1321 

1322 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1323 download_folder = data_dir / "original" 

1324 os.makedirs(str(download_folder), exist_ok=True) 

1325 

1326 sentence_separator = " " 

1327 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

1328 sentence_separator = self.sentence_splitter.tag 

1329 

1330 train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator) 

1331 train_data = filter_and_map_entities(train_data, {"Diseases": DISEASE_TAG}) 

1332 

1333 test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator) 

1334 test_data = filter_and_map_entities(test_data, {"Diseases": DISEASE_TAG}) 

1335 

1336 return merge_datasets([train_data, test_data]) 

1337 

1338 

1339class KaewphanCorpusHelper: 

1340 """ Helper class for the corpora from Kaewphan et al., i.e. CLL and Gellus""" 

1341 

1342 @staticmethod 

1343 def download_cll_dataset(data_folder: Path): 

1344 data_url = "http://bionlp-www.utu.fi/cell-lines/CLL_corpus.tar.gz" 

1345 data_path = cached_path(data_url, data_folder) 

1346 unpack_file(data_path, data_folder) 

1347 

1348 @staticmethod 

1349 def prepare_and_save_dataset(nersuite_folder: Path, output_file: Path): 

1350 writer = open(str(output_file), "w", encoding="utf8") 

1351 out_newline = False 

1352 

1353 for file in os.listdir(str(nersuite_folder)): 

1354 if not file.endswith(".nersuite"): 

1355 continue 

1356 

1357 annotations = [] 

1358 with open(os.path.join(str(nersuite_folder), file), "r", encoding="utf8") as reader: 

1359 for line in reader.readlines(): 

1360 columns = line.split("\t") 

1361 annotations.append(columns[:4]) 

1362 

1363 num_annotations = len(annotations) 

1364 for i, annotation in enumerate(annotations): 

1365 if len(annotation) == 1: 

1366 assert annotation[0] == "\n" 

1367 if not out_newline: 

1368 writer.write("\n") 

1369 out_newline = True 

1370 continue 

1371 

1372 has_whitespace = "+" 

1373 

1374 next_annotation = ( 

1375 annotations[i + 1] 

1376 if (i + 1) < num_annotations and len(annotations[i + 1]) > 1 

1377 else None 

1378 ) 

1379 if next_annotation and next_annotation[1] == annotation[2]: 

1380 has_whitespace = "-" 

1381 

1382 writer.write( 

1383 " ".join([annotation[3], annotation[0], has_whitespace]) + "\n" 

1384 ) 

1385 out_newline = False 

1386 

1387 if not out_newline: 

1388 writer.write("\n") 

1389 out_newline = True 

1390 

1391 writer.close() 

1392 

1393 @staticmethod 

1394 def download_gellus_dataset(data_folder: Path): 

1395 data_url = "http://bionlp-www.utu.fi/cell-lines/Gellus_corpus.tar.gz" 

1396 data_path = cached_path(data_url, data_folder) 

1397 unpack_file(data_path, data_folder) 

1398 

1399 @staticmethod 

1400 def read_dataset( 

1401 nersuite_folder: Path, sentence_separator: str 

1402 ) -> InternalBioNerDataset: 

1403 documents = {} 

1404 entities_per_document = {} 

1405 for file in os.listdir(str(nersuite_folder)): 

1406 if not file.endswith(".nersuite"): 

1407 continue 

1408 

1409 document_id = file.replace(".nersuite", "") 

1410 

1411 with open(os.path.join(str(nersuite_folder), file), "r", encoding="utf8") as reader: 

1412 document_text = "" 

1413 entities = [] 

1414 

1415 entity_start = None 

1416 entity_type = None 

1417 

1418 for line in reader.readlines(): 

1419 line = line.strip() 

1420 if line: 

1421 columns = line.split("\t") 

1422 tag = columns[0] 

1423 token = columns[3] 

1424 if tag.startswith("B-"): 

1425 if entity_type is not None: 

1426 entities.append( 

1427 Entity( 

1428 (entity_start, len(document_text)), entity_type 

1429 ) 

1430 ) 

1431 

1432 entity_start = ( 

1433 len(document_text) + 1 if document_text else 0 

1434 ) 

1435 entity_type = tag[2:] 

1436 

1437 elif tag == "O" and entity_type is not None: 

1438 entities.append( 

1439 Entity((entity_start, len(document_text)), entity_type,) 

1440 ) 

1441 entity_type = None 

1442 

1443 document_text = ( 

1444 document_text + " " + token if document_text else token 

1445 ) 

1446 else: 

1447 # Edge case: last token starts a new entity 

1448 if entity_type is not None: 

1449 entities.append( 

1450 Entity((entity_start, len(document_text)), entity_type) 

1451 ) 

1452 document_text += sentence_separator 

1453 

1454 if document_text.endswith(sentence_separator): 

1455 document_text = document_text[: -len(sentence_separator)] 

1456 

1457 documents[document_id] = document_text 

1458 entities_per_document[document_id] = entities 

1459 

1460 return InternalBioNerDataset( 

1461 documents=documents, entities_per_document=entities_per_document 

1462 ) 
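# Editor's note: illustrative sketch of the NERsuite layout both helpers above expect
# (not part of the original module); offsets and tokens are made up. Each ".nersuite"
# file holds one token per line:
#
#   B-CL<TAB>10<TAB>14<TAB>HeLa
#   I-CL<TAB>15<TAB>20<TAB>cells
#   O<TAB>21<TAB>31<TAB>expressing
#   <blank line>                   <- sentence boundary
#
# i.e. IOB tag, start offset, end offset and the token text; adjacent tokens without
# an intervening space share the end/start offset.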

1463 

1464 

1465class CLL(ColumnCorpus): 

1466 """ 

1467 Original CLL corpus containing cell line annotations. 

1468 

1469 For further information, see Kaewphan et al.: 

1470 Cell line name recognition in support of the identification of synthetic lethality in cancer from text 

1471 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ 

1472 """ 

1473 

1474 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

1475 """ 

1476 :param base_path: Path to the corpus on your machine 

1477 :param in_memory: If True, keeps dataset in memory giving speedups in training 

1478 """ 

1479 if type(base_path) == str: 

1480 base_path: Path = Path(base_path) 

1481 

1482 # column format 

1483 columns = {0: "text", 1: "ner"} 

1484 

1485 # this dataset name 

1486 dataset_name = self.__class__.__name__.lower() 

1487 

1488 # default dataset folder is the cache root 

1489 if not base_path: 

1490 base_path = flair.cache_root / "datasets" 

1491 data_folder = base_path / dataset_name 

1492 

1493 train_file = data_folder / "train.conll" 

1494 

1495 if not (train_file.exists()): 

1496 KaewphanCorpusHelper.download_cll_dataset(data_folder) 

1497 

1498 nersuite_folder = data_folder / "CLL-1.0.2" / "nersuite" 

1499 KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_folder, train_file) 

1500 

1501 super(CLL, self).__init__( 

1502 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1503 ) 

1504 

1505 

1506class HUNER_CELL_LINE_CLL(HunerDataset): 

1507 """ 

1508 HUNER version of the CLL corpus containing cell line annotations. 

1509 """ 

1510 

1511 def __init__(self, *args, **kwargs): 

1512 super().__init__(*args, **kwargs) 

1513 

1514 @staticmethod 

1515 def split_url() -> str: 

1516 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cll" 

1517 

1518 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

1519 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

1520 

1521 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1522 KaewphanCorpusHelper.download_cll_dataset(data_dir) 

1523 

1524 sentence_separator = " " 

1525 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

1526 sentence_separator = self.sentence_splitter.tag 

1527 

1528 nersuite_folder = data_dir / "CLL-1.0.2" / "nersuite" 

1529 orig_dataset = KaewphanCorpusHelper.read_dataset(nersuite_folder, sentence_separator) 

1530 

1531 return filter_and_map_entities(orig_dataset, {"CL": CELL_LINE_TAG}) 

1532 

1533 

1534class GELLUS(ColumnCorpus): 

1535 """ 

1536 Original Gellus corpus containing cell line annotations. 

1537 

1538 For further information, see Kaewphan et al.: 

1539 Cell line name recognition in support of the identification of synthetic lethality in cancer from text 

1540 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ 

1541 """ 

1542 

1543 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

1544 """ 

1545 :param base_path: Path to the corpus on your machine 

1546 :param in_memory: If True, keeps dataset in memory giving speedups in training 

1547 """ 

1548 if type(base_path) == str: 

1549 base_path: Path = Path(base_path) 

1550 

1551 # column format 

1552 columns = {0: "text", 1: "ner"} 

1553 

1554 # this dataset name 

1555 dataset_name = self.__class__.__name__.lower() 

1556 

1557 # default dataset folder is the cache root 

1558 if not base_path: 

1559 base_path = flair.cache_root / "datasets" 

1560 data_folder = base_path / dataset_name 

1561 

1562 train_file = data_folder / "train.conll" 

1563 dev_file = data_folder / "dev.conll" 

1564 test_file = data_folder / "test.conll" 

1565 

1566 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

1567 KaewphanCorpusHelper.download_gellus_dataset(data_folder) 

1568 

1569 nersuite_train = data_folder / "GELLUS-1.0.3" / "nersuite" / "train" 

1570 KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_train, train_file) 

1571 

1572 nersuite_dev = data_folder / "GELLUS-1.0.3" / "nersuite" / "devel" 

1573 KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_dev, dev_file) 

1574 

1575 nersuite_test = data_folder / "GELLUS-1.0.3" / "nersuite" / "test" 

1576 KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_test, test_file) 

1577 

1578 super(GELLUS, self).__init__( 

1579 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1580 ) 

1581 

1582 

1583class HUNER_CELL_LINE_GELLUS(HunerDataset): 

1584 """ 

1585 HUNER version of the Gellus corpus containing cell line annotations. 

1586 """ 

1587 

1588 def __init__(self, *args, **kwargs): 

1589 super().__init__(*args, **kwargs) 

1590 

1591 @staticmethod 

1592 def split_url() -> str: 

1593 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/gellus" 

1594 

1595 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

1596 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

1597 

1598 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1599 KaewphanCorpusHelper.download_gellus_dataset(data_dir) 

1600 

1601 sentence_separator = " " 

1602 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

1603 sentence_separator = self.sentence_splitter.tag 

1604 

1605 splits = [] 

1606 for folder in ["train", "devel", "test"]: 

1607 nersuite_folder = data_dir / "GELLUS-1.0.3" / "nersuite" / folder 

1608 splits.append( 

1609 KaewphanCorpusHelper.read_dataset(nersuite_folder, sentence_separator) 

1610 ) 

1611 

1612 full_dataset = merge_datasets(splits) 

1613 return filter_and_map_entities(full_dataset, {"Cell-line-name": CELL_LINE_TAG}) 

1614 

1615 

1616class LOCTEXT(ColumnCorpus): 

1617 """ 

1618 Original LOCTEXT corpus containing species and protein annotations. 

1619 

1620 For further information see Cejuela et al.: 

1621 LocText: relation extraction of protein localizations to assist database curation 

1622 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2021-9 

1623 """ 

1624 

1625 def __init__( 

1626 self, 

1627 base_path: Union[str, Path] = None, 

1628 in_memory: bool = True, 

1629 sentence_splitter: SentenceSplitter = None, 

1630 ): 

1631 """ 

1632 :param base_path: Path to the corpus on your machine 

1633 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1634 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` 

1635 that segments a document into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

1636 """ 

1637 if type(base_path) == str: 

1638 base_path: Path = Path(base_path) 

1639 

1640 # column format 

1641 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

1642 

1643 # this dataset name 

1644 dataset_name = self.__class__.__name__.lower() 

1645 

1646 # default dataset folder is the cache root 

1647 if not base_path: 

1648 base_path = flair.cache_root / "datasets" 

1649 data_folder = base_path / dataset_name 

1650 

1651 if sentence_splitter is None: 

1652 sentence_splitter = SciSpacySentenceSplitter() 

1653 

1654 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

1655 

1656 if not (train_file.exists()): 

1657 self.download_dataset(data_folder) 

1658 full_dataset = self.parse_dataset(data_folder) 

1659 

1660 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

1661 conll_writer.write_to_conll(full_dataset, train_file) 

1662 

1663 super(LOCTEXT, self).__init__( 

1664 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1665 ) 

1666 

1667 @staticmethod 

1668 def download_dataset(data_dir: Path): 

1669 data_url = "http://pubannotation.org/downloads/LocText-annotations.tgz" 

1670 data_path = cached_path(data_url, data_dir) 

1671 unpack_file(data_path, data_dir) 

1672 

1673 @staticmethod 

1674 def parse_dataset(data_dir: Path) -> InternalBioNerDataset: 

1675 loctext_json_folder = data_dir / "LocText" 

1676 

1677 entity_type_mapping = { 

1678 "go": "protein", 

1679 "uniprot": "protein", 

1680 "taxonomy": "species", 

1681 } 

1682 

1683 documents = {} 

1684 entities_per_document = {} 

1685 

1686 for file in os.listdir(str(loctext_json_folder)): 

1687 document_id = file.strip(".json") 

1688 entities = [] 

1689 

1690 with open(os.path.join(str(loctext_json_folder), file), "r", encoding="utf8") as f_in: 

1691 data = json.load(f_in) 

1692 document_text = data["text"].strip() 

1693 document_text = document_text.replace("\n", " ") 

1694 

1695 if "denotations" in data.keys(): 

1696 for ann in data["denotations"]: 

1697 start = int(ann["span"]["begin"]) 

1698 end = int(ann["span"]["end"]) 

1699 

1700 original_entity_type = ann["obj"].split(":")[0] 

1701                        if original_entity_type not in entity_type_mapping:

1702 continue 

1703 

1704 entity_type = entity_type_mapping[original_entity_type] 

1705 entities.append(Entity((start, end), entity_type)) 

1706 

1707 documents[document_id] = document_text 

1708 entities_per_document[document_id] = entities 

1709 

1710 return InternalBioNerDataset( 

1711 documents=documents, entities_per_document=entities_per_document 

1712 ) 

1713 

1714 
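A usage sketch showing the sentence_splitter parameter; each splitter writes its own cache file (f"{splitter.name}_train.conll"), so switching splitters does not clobber earlier conversions (assumes scispacy and its models are installed):

from flair.datasets.biomedical import LOCTEXT
from flair.tokenization import SciSpacySentenceSplitter

corpus = LOCTEXT(sentence_splitter=SciSpacySentenceSplitter())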

1715class HUNER_SPECIES_LOCTEXT(HunerDataset): 

1716 """ 

1717 HUNER version of the Loctext corpus containing species annotations. 

1718 """ 

1719 

1720 def __init__(self, *args, **kwargs): 

1721 super().__init__(*args, **kwargs) 

1722 

1723 @staticmethod 

1724 def split_url() -> str: 

1725 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext" 

1726 

1727 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1728 LOCTEXT.download_dataset(data_dir) 

1729 dataset = LOCTEXT.parse_dataset(data_dir) 

1730 

1731 return filter_and_map_entities(dataset, {"species": SPECIES_TAG}) 

1732 

1733 

1734class HUNER_GENE_LOCTEXT(HunerDataset): 

1735 """ 

1736 HUNER version of the Loctext corpus containing protein annotations. 

1737 """ 

1738 

1739 def __init__(self, *args, **kwargs): 

1740 super().__init__(*args, **kwargs) 

1741 

1742 @staticmethod 

1743 def split_url() -> str: 

1744 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext" 

1745 

1746 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1747 LOCTEXT.download_dataset(data_dir) 

1748 dataset = LOCTEXT.parse_dataset(data_dir) 

1749 

1750 return filter_and_map_entities(dataset, {"protein": GENE_TAG}) 

1751 

1752 

1753class CHEMDNER(ColumnCorpus): 

1754 """ 

1755 Original corpus of the CHEMDNER shared task. 

1756 

1757 For further information see Krallinger et al.: 

1758 The CHEMDNER corpus of chemicals and drugs and its annotation principles 

1759 https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2 

1760 """ 

1761 

1762 default_dir = flair.cache_root / "datasets" / "CHEMDNER" 

1763 

1764 def __init__( 

1765 self, 

1766 base_path: Union[str, Path] = None, 

1767 in_memory: bool = True, 

1768 sentence_splitter: SentenceSplitter = None, 

1769 ): 

1770 """ 

1771 :param base_path: Path to the corpus on your machine 

1772 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1773 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which 

1774        segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)

1775 """ 

1776 

1777 if type(base_path) == str: 

1778 base_path: Path = Path(base_path) 

1779 

1780 # column format 

1781 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

1782 

1783 # this dataset name 

1784 dataset_name = self.__class__.__name__.lower() 

1785 

1786 # default dataset folder is the cache root 

1787 if not base_path: 

1788 # download file is huge => make default_dir visible so that derivative 

1789 # corpora can all use the same download file 

1790 data_folder = self.default_dir 

1791 else: 

1792 data_folder = base_path / dataset_name 

1793 

1794 if sentence_splitter is None: 

1795 sentence_splitter = SciSpacySentenceSplitter() 

1796 

1797 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

1798 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

1799 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

1800 

1801 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

1802 download_dir = data_folder / "original" 

1803 os.makedirs(download_dir, exist_ok=True) 

1804 self.download_dataset(download_dir) 

1805 

1806 train_data = bioc_to_internal( 

1807 download_dir / "chemdner_corpus" / "training.bioc.xml" 

1808 ) 

1809 dev_data = bioc_to_internal( 

1810 download_dir / "chemdner_corpus" / "development.bioc.xml" 

1811 ) 

1812 test_data = bioc_to_internal( 

1813 download_dir / "chemdner_corpus" / "evaluation.bioc.xml" 

1814 ) 

1815 

1816 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

1817 

1818 conll_writer.write_to_conll(train_data, train_file) 

1819 conll_writer.write_to_conll(dev_data, dev_file) 

1820 conll_writer.write_to_conll(test_data, test_file) 

1821 

1822 super(CHEMDNER, self).__init__( 

1823 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1824 ) 

1825 

1826 @staticmethod 

1827 def download_dataset(data_dir: Path): 

1828 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2014/chemdner_corpus.tar.gz" 

1829 data_path = cached_path(data_url, data_dir) 

1830 unpack_file(data_path, data_dir) 

1831 

1832 
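A usage sketch (illustrative, not part of the listing): the large original archive is cached under CHEMDNER.default_dir so the HUNER_*_CHEMDNER variants below can reuse the same download:

from flair.datasets.biomedical import CHEMDNER

corpus = CHEMDNER()
print(len(corpus.train), len(corpus.dev), len(corpus.test))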

1833class HUNER_CHEMICAL_CHEMDNER(HunerDataset): 

1834 """ 

1835 HUNER version of the CHEMDNER corpus containing chemical annotations. 

1836 """ 

1837 

1838 def __init__(self, *args, download_folder=None, **kwargs): 

1839 self.download_folder = download_folder or CHEMDNER.default_dir / "original" 

1840 super().__init__(*args, **kwargs) 

1841 

1842 @staticmethod 

1843 def split_url() -> str: 

1844 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner" 

1845 

1846 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1847 os.makedirs(str(self.download_folder), exist_ok=True) 

1848 CHEMDNER.download_dataset(self.download_folder) 

1849 train_data = bioc_to_internal( 

1850 self.download_folder / "chemdner_corpus" / "training.bioc.xml" 

1851 ) 

1852 dev_data = bioc_to_internal( 

1853 self.download_folder / "chemdner_corpus" / "development.bioc.xml" 

1854 ) 

1855 test_data = bioc_to_internal( 

1856 self.download_folder / "chemdner_corpus" / "evaluation.bioc.xml" 

1857 ) 

1858 all_data = merge_datasets([train_data, dev_data, test_data]) 

1859 all_data = filter_and_map_entities( 

1860 all_data, 

1861 { 

1862 "ABBREVIATION": CHEMICAL_TAG, 

1863 "FAMILY": CHEMICAL_TAG, 

1864 "FORMULA": CHEMICAL_TAG, 

1865 "IDENTIFIER": CHEMICAL_TAG, 

1866 "MULTIPLE": CHEMICAL_TAG, 

1867 "NO_CLASS": CHEMICAL_TAG, 

1868 "SYSTEMATIC": CHEMICAL_TAG, 

1869 "TRIVIAL": CHEMICAL_TAG, 

1870 }, 

1871 ) 

1872 

1873 return all_data 

1874 

1875 

1876class IEPA(ColumnCorpus): 

1877 """ 

1878 IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/ 

1879    (the original corpus download is no longer available)

1880 

1881 For further information see Ding, Berleant, Nettleton, Wurtele: 

1882 Mining MEDLINE: abstracts, sentences, or phrases? 

1883 https://www.ncbi.nlm.nih.gov/pubmed/11928487 

1884 """ 

1885 

1886 def __init__( 

1887 self, 

1888 base_path: Union[str, Path] = None, 

1889 in_memory: bool = True, 

1890 tokenizer: Tokenizer = None, 

1891 ): 

1892 """ 

1893 :param base_path: Path to the corpus on your machine 

1894 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1895 :param tokenizer: Custom implementation of :class:`Tokenizer` which 

1896 segments sentences into tokens (default :class:`SciSpacyTokenizer`) 

1897 """ 

1898 

1899 if type(base_path) == str: 

1900 base_path: Path = Path(base_path) 

1901 

1902 # column format 

1903 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

1904 

1905 # this dataset name 

1906 dataset_name = self.__class__.__name__.lower() 

1907 

1908 # default dataset folder is the cache root 

1909 if not base_path: 

1910 base_path = flair.cache_root / "datasets" 

1911 data_folder = base_path / dataset_name 

1912 

1913 if tokenizer is None: 

1914 tokenizer = SciSpacyTokenizer() 

1915 

1916 sentence_splitter = NewlineSentenceSplitter(tokenizer=tokenizer) 

1917 

1918 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

1919 

1920 if not (train_file.exists()): 

1921 download_dir = data_folder / "original" 

1922 os.makedirs(download_dir, exist_ok=True) 

1923 self.download_dataset(download_dir) 

1924 

1925 all_data = bioc_to_internal(download_dir / "iepa_bioc.xml") 

1926 

1927 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

1928 conll_writer.write_to_conll(all_data, train_file) 

1929 

1930 super(IEPA, self).__init__( 

1931 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

1932 ) 

1933 

1934 @staticmethod 

1935 def download_dataset(data_dir: Path): 

1936 data_url = ( 

1937 "http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/iepa_bioc.xml.zip" 

1938 ) 

1939 data_path = cached_path(data_url, data_dir) 

1940 unpack_file(data_path, data_dir) 

1941 

1942 
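IEPA is already sentence-segmented (one sentence per line in the BioC text), so only the tokenizer is configurable and a NewlineSentenceSplitter is built internally. A sketch:

from flair.datasets.biomedical import IEPA
from flair.tokenization import SciSpacyTokenizer

corpus = IEPA(tokenizer=SciSpacyTokenizer())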

1943class HUNER_GENE_IEPA(HunerDataset): 

1944 """ 

1945 HUNER version of the IEPA corpus containing gene annotations. 

1946 """ 

1947 

1948 def __init__(self, *args, **kwargs): 

1949 super().__init__(*args, **kwargs) 

1950 

1951 @staticmethod 

1952 def split_url() -> str: 

1953 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/iepa" 

1954 

1955 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

1956 return NewlineSentenceSplitter(tokenizer=SciSpacyTokenizer()) 

1957 

1958 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

1959 os.makedirs(str(data_dir), exist_ok=True) 

1960 IEPA.download_dataset(data_dir) 

1961 

1962 all_data = bioc_to_internal(data_dir / "iepa_bioc.xml") 

1963 all_data = filter_and_map_entities(all_data, {"Protein": GENE_TAG}) 

1964 

1965 return all_data 

1966 

1967 

1968class LINNEAUS(ColumnCorpus): 

1969 """ 

1970 Original LINNEAUS corpus containing species annotations. 

1971 

1972 For further information see Gerner et al.: 

1973 LINNAEUS: a species name identification system for biomedical literature 

1974 https://www.ncbi.nlm.nih.gov/pubmed/20149233 

1975 """ 

1976 

1977 def __init__( 

1978 self, 

1979 base_path: Union[str, Path] = None, 

1980 in_memory: bool = True, 

1981        tokenizer: Tokenizer = None,

1982 ): 

1983 """ 

1984 :param base_path: Path to the corpus on your machine 

1985 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

1986 :param tokenizer: Custom implementation of :class:`Tokenizer` which segments 

1987        sentences into tokens (default :class:`SciSpacyTokenizer`)

1988 """ 

1989 

1990 if type(base_path) == str: 

1991 base_path: Path = Path(base_path) 

1992 

1993 # column format 

1994 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

1995 

1996 # this dataset name 

1997 dataset_name = self.__class__.__name__.lower() 

1998 

1999 # default dataset folder is the cache root 

2000 if not base_path: 

2001 base_path = flair.cache_root / "datasets" 

2002 data_folder = base_path / dataset_name 

2003 

2004 if tokenizer is None: 

2005 tokenizer = SciSpacyTokenizer() 

2006 

2007 sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer) 

2008 

2009 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2010 

2011 if not (train_file.exists()): 

2012 dataset = self.download_and_parse_dataset(data_folder) 

2013 

2014 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2015 conll_writer.write_to_conll(dataset, train_file) 

2016 

2017 super(LINNEAUS, self).__init__( 

2018 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2019 ) 

2020 

2021 @staticmethod 

2022 def download_and_parse_dataset(data_dir: Path): 

2023 data_url = "https://iweb.dl.sourceforge.net/project/linnaeus/Corpora/manual-corpus-species-1.0.tar.gz" 

2024 data_path = cached_path(data_url, data_dir) 

2025 unpack_file(data_path, data_dir) 

2026 

2027 documents = {} 

2028 entities_per_document = defaultdict(list) 

2029 

2030 # Read texts 

2031 texts_directory = data_dir / "manual-corpus-species-1.0" / "txt" 

2032 for filename in os.listdir(str(texts_directory)): 

2033 document_id = filename.strip(".txt") 

2034 

2035 with open(os.path.join(str(texts_directory), filename), "r", encoding="utf8") as file: 

2036 documents[document_id] = file.read().strip() 

2037 

2038 # Read annotations 

2039 tag_file = data_dir / "manual-corpus-species-1.0" / "filtered_tags.tsv" 

2040 with open(str(tag_file), "r", encoding="utf8") as file: 

2041 next(file) # Ignore header row 

2042 

2043 for line in file: 

2044 if not line: 

2045 continue 

2046 

2047 document_id, start, end, text = line.strip().split("\t")[1:5] 

2048 start, end = int(start), int(end) 

2049 

2050 entities_per_document[document_id].append( 

2051 Entity((start, end), SPECIES_TAG) 

2052 ) 

2053 

2054 document_text = documents[document_id] 

2055 if document_text[start:end] != text: 

2056 raise AssertionError() 

2057 

2058 return InternalBioNerDataset( 

2059 documents=documents, entities_per_document=entities_per_document 

2060 ) 

2061 

2062 

2063class HUNER_SPECIES_LINNEAUS(HunerDataset): 

2064 """ 

2065 HUNER version of the LINNEAUS corpus containing species annotations. 

2066 """ 

2067 

2068 def __init__(self, *args, **kwargs): 

2069 super().__init__(*args, **kwargs) 

2070 

2071 @staticmethod 

2072 def split_url() -> str: 

2073 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/linneaus" 

2074 

2075 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2076 return LINNEAUS.download_and_parse_dataset(data_dir) 

2077 

2078 

2079class CDR(ColumnCorpus): 

2080 """ 

2081 CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR-Corpus 

2082 

2083 For further information see Li et al.: 

2084 BioCreative V CDR task corpus: a resource for chemical disease relation extraction 

2085 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/ 

2086 """ 

2087 

2088 def __init__( 

2089 self, 

2090 base_path: Union[str, Path] = None, 

2091 in_memory: bool = True, 

2092        sentence_splitter: SentenceSplitter = None,

2093 ): 

2094 """ 

2095 :param base_path: Path to the corpus on your machine 

2096 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2097 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

2098 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2099 """ 

2100 

2101 if type(base_path) == str: 

2102 base_path: Path = Path(base_path) 

2103 

2104 # column format 

2105 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2106 

2107 # this dataset name 

2108 dataset_name = self.__class__.__name__.lower() 

2109 

2110 # default dataset folder is the cache root 

2111 if not base_path: 

2112 base_path = flair.cache_root / "datasets" 

2113 data_folder = base_path / dataset_name 

2114 

2115 if sentence_splitter is None: 

2116 sentence_splitter = SciSpacySentenceSplitter() 

2117 

2118 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2119 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

2120 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

2121 

2122 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

2123 download_dir = data_folder / "original" 

2124 os.makedirs(download_dir, exist_ok=True) 

2125 self.download_dataset(download_dir) 

2126 

2127 train_data = bioc_to_internal( 

2128 download_dir 

2129 / "CDR_Data" 

2130 / "CDR.Corpus.v010516" 

2131 / "CDR_TrainingSet.BioC.xml" 

2132 ) 

2133 dev_data = bioc_to_internal( 

2134 download_dir 

2135 / "CDR_Data" 

2136 / "CDR.Corpus.v010516" 

2137 / "CDR_DevelopmentSet.BioC.xml" 

2138 ) 

2139 test_data = bioc_to_internal( 

2140 download_dir 

2141 / "CDR_Data" 

2142 / "CDR.Corpus.v010516" 

2143 / "CDR_TestSet.BioC.xml" 

2144 ) 

2145 

2146 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2147 conll_writer.write_to_conll(train_data, train_file) 

2148 conll_writer.write_to_conll(dev_data, dev_file) 

2149 conll_writer.write_to_conll(test_data, test_file) 

2150 

2151 super(CDR, self).__init__( 

2152 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2153 ) 

2154 

2155 @staticmethod 

2156 def download_dataset(data_dir: Path): 

2157 data_url = ( 

2158 "https://github.com/JHnlp/BioCreative-V-CDR-Corpus/raw/master/CDR_Data.zip" 

2159 ) 

2160 data_path = cached_path(data_url, data_dir) 

2161 unpack_file(data_path, data_dir) 

2162 

2163 
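A sketch of the two ways the CDR data is typically consumed: the full corpus keeps the official train/dev/test split with both entity types, while the HUNER variants defined next merge the splits and keep a single entity type:

from flair.datasets.biomedical import CDR, HUNER_DISEASE_CDR

full = CDR()                        # Chemical and Disease mentions, official splits
disease_only = HUNER_DISEASE_CDR()  # Disease mentions only, HUNER split lists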

2164class HUNER_DISEASE_CDR(HunerDataset): 

2165 """ 

2166    HUNER version of the CDR corpus containing disease annotations.

2167 """ 

2168 

2169 def __init__(self, *args, **kwargs): 

2170 super().__init__(*args, **kwargs) 

2171 

2172 @staticmethod 

2173 def split_url() -> str: 

2174 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRDisease" 

2175 

2176 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2177 os.makedirs(str(data_dir), exist_ok=True) 

2178 CDR.download_dataset(data_dir) 

2179 train_data = bioc_to_internal( 

2180 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml" 

2181 ) 

2182 dev_data = bioc_to_internal( 

2183 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml" 

2184 ) 

2185 test_data = bioc_to_internal( 

2186 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml" 

2187 ) 

2188 all_data = merge_datasets([train_data, dev_data, test_data]) 

2189 all_data = filter_and_map_entities(all_data, {"Disease": DISEASE_TAG}) 

2190 

2191 return all_data 

2192 

2193 

2194class HUNER_CHEMICAL_CDR(HunerDataset): 

2195 """ 

2196    HUNER version of the CDR corpus containing chemical annotations.

2197 """ 

2198 

2199 def __init__(self, *args, **kwargs): 

2200 super().__init__(*args, **kwargs) 

2201 

2202 @staticmethod 

2203 def split_url() -> str: 

2204 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRChem" 

2205 

2206 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2207 os.makedirs(str(data_dir), exist_ok=True) 

2208 CDR.download_dataset(data_dir) 

2209 train_data = bioc_to_internal( 

2210 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml" 

2211 ) 

2212 dev_data = bioc_to_internal( 

2213 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml" 

2214 ) 

2215 test_data = bioc_to_internal( 

2216 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml" 

2217 ) 

2218 all_data = merge_datasets([train_data, dev_data, test_data]) 

2219 all_data = filter_and_map_entities(all_data, {"Chemical": CHEMICAL_TAG}) 

2220 

2221 return all_data 

2222 

2223 

2224class VARIOME(ColumnCorpus): 

2225 """ 

2226 Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip 

2227 

2228 For further information see Verspoor et al.: 

2229 Annotating the biomedical literature for the human variome 

2230 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3676157/ 

2231 """ 

2232 

2233 def __init__( 

2234 self, 

2235 base_path: Union[str, Path] = None, 

2236 in_memory: bool = True, 

2237        sentence_splitter: SentenceSplitter = None,

2238 ): 

2239 """ 

2240 :param base_path: Path to the corpus on your machine 

2241 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2242 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

2243 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2244 """ 

2245 

2246 if type(base_path) == str: 

2247 base_path: Path = Path(base_path) 

2248 

2249 # column format 

2250 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2251 

2252 # this dataset name 

2253 dataset_name = self.__class__.__name__.lower() 

2254 

2255 # default dataset folder is the cache root 

2256 if not base_path: 

2257 base_path = flair.cache_root / "datasets" 

2258 data_folder = base_path / dataset_name 

2259 

2260 if sentence_splitter is None: 

2261 sentence_splitter = SciSpacySentenceSplitter() 

2262 

2263 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2264 

2265 if not (train_file.exists()): 

2266 download_dir = data_folder / "original" 

2267 os.makedirs(download_dir, exist_ok=True) 

2268 self.download_dataset(download_dir) 

2269 

2270 all_data = self.parse_corpus(download_dir / "hvp_bioc.xml") 

2271 

2272 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2273 conll_writer.write_to_conll(all_data, train_file) 

2274 

2275 super(VARIOME, self).__init__( 

2276 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2277 ) 

2278 

2279 @staticmethod 

2280 def download_dataset(data_dir: Path): 

2281 data_url = ( 

2282 "http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip" 

2283 ) 

2284 data_path = cached_path(data_url, data_dir) 

2285 unpack_file(data_path, data_dir) 

2286 

2287 @staticmethod 

2288 def parse_corpus(corpus_xml: Path) -> InternalBioNerDataset: 

2289 corpus = bioc_to_internal(corpus_xml) 

2290 

2291 cleaned_documents = {} 

2292 cleaned_entities_per_document = {} 

2293 

2294 for id, document_text in corpus.documents.items(): 

2295 entities = corpus.entities_per_document[id] 

2296 original_length = len(document_text) 

2297 

2298 text_cleaned = document_text.replace("** IGNORE LINE **\n", "") 

2299 offset = original_length - len(text_cleaned) 

2300 

2301 if offset != 0: 

2302 new_entities = [] 

2303 for entity in entities: 

2304 new_start = entity.char_span.start - offset 

2305 new_end = entity.char_span.stop - offset 

2306 

2307 new_entities.append(Entity((new_start, new_end), entity.type)) 

2308 

2309 orig_text = document_text[ 

2310 entity.char_span.start : entity.char_span.stop 

2311 ] 

2312 new_text = text_cleaned[new_start:new_end] 

2313 assert orig_text == new_text 

2314 

2315 entities = new_entities 

2316 document_text = text_cleaned 

2317 

2318 cleaned_documents[id] = document_text 

2319 cleaned_entities_per_document[id] = entities 

2320 

2321 return InternalBioNerDataset( 

2322 documents=cleaned_documents, 

2323 entities_per_document=cleaned_entities_per_document, 

2324 ) 

2325 

2326 
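A worked toy example (hypothetical strings) of the offset correction in parse_corpus above: removing the "** IGNORE LINE **" prefix shortens the text, every span is shifted left by the number of removed characters, and the assert guarantees the surface string is preserved:

document_text = "** IGNORE LINE **\nBRCA1 mutations were analysed."
text_cleaned = document_text.replace("** IGNORE LINE **\n", "")
offset = len(document_text) - len(text_cleaned)            # 18 removed characters
old_span = (18, 23)                                        # "BRCA1" in the raw text
new_span = (old_span[0] - offset, old_span[1] - offset)    # (0, 5) after cleaning
assert document_text[old_span[0]:old_span[1]] == text_cleaned[new_span[0]:new_span[1]] == "BRCA1"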

2327class HUNER_GENE_VARIOME(HunerDataset): 

2328 """ 

2329 HUNER version of the Variome corpus containing gene annotations. 

2330 """ 

2331 

2332 def __init__(self, *args, **kwargs): 

2333 super().__init__(*args, **kwargs) 

2334 

2335 @staticmethod 

2336 def split_url() -> str: 

2337 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_gene" 

2338 

2339 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2340 os.makedirs(str(data_dir), exist_ok=True) 

2341 VARIOME.download_dataset(data_dir) 

2342 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") 

2343 all_data = filter_and_map_entities(all_data, {"gene": GENE_TAG}) 

2344 

2345 return all_data 

2346 

2347 

2348class HUNER_DISEASE_VARIOME(HunerDataset): 

2349 """ 

2350 HUNER version of the Variome corpus containing disease annotations. 

2351 """ 

2352 

2353 def __init__(self, *args, **kwargs): 

2354 super().__init__(*args, **kwargs) 

2355 

2356 @staticmethod 

2357 def split_url() -> str: 

2358 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_disease" 

2359 

2360 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2361 os.makedirs(str(data_dir), exist_ok=True) 

2362 VARIOME.download_dataset(data_dir) 

2363 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") 

2364 all_data = filter_and_map_entities( 

2365 all_data, {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG} 

2366 ) 

2367 

2368 return all_data 

2369 

2370 

2371class HUNER_SPECIES_VARIOME(HunerDataset): 

2372 """ 

2373 HUNER version of the Variome corpus containing species annotations. 

2374 """ 

2375 

2376 def __init__(self, *args, **kwargs): 

2377 super().__init__(*args, **kwargs) 

2378 

2379 @staticmethod 

2380 def split_url() -> str: 

2381 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_species" 

2382 

2383 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2384 os.makedirs(str(data_dir), exist_ok=True) 

2385 VARIOME.download_dataset(data_dir) 

2386 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml") 

2387 all_data = filter_and_map_entities(all_data, {"Living_Beings": SPECIES_TAG}) 

2388 

2389 return all_data 

2390 

2391 

2392class NCBI_DISEASE(ColumnCorpus): 

2393 """ 

2394 Original NCBI disease corpus containing disease annotations. 

2395 

2396 For further information see Dogan et al.: 

2397 NCBI disease corpus: a resource for disease name recognition and concept normalization 

2398 https://www.ncbi.nlm.nih.gov/pubmed/24393765 

2399 """ 

2400 

2401 def __init__( 

2402 self, 

2403 base_path: Union[str, Path] = None, 

2404 in_memory: bool = True, 

2405 sentence_splitter: SentenceSplitter = None, 

2406 ): 

2407 """ 

2408 :param base_path: Path to the corpus on your machine 

2409 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2410 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

2411 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2412 """ 

2413 

2414 if type(base_path) == str: 

2415 base_path: Path = Path(base_path) 

2416 

2417 # column format 

2418 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2419 

2420 # this dataset name 

2421 dataset_name = self.__class__.__name__.lower() 

2422 

2423 # default dataset folder is the cache root 

2424 if not base_path: 

2425 base_path = flair.cache_root / "datasets" 

2426 data_folder = base_path / dataset_name 

2427 

2428 if sentence_splitter is None: 

2429 sentence_splitter = SciSpacySentenceSplitter() 

2430 

2431 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2432 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

2433 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

2434 

2435 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

2436 orig_folder = self.download_corpus(data_folder) 

2437 

2438 train_data = self.parse_input_file(orig_folder / "NCBItrainset_patched.txt") 

2439 dev_data = self.parse_input_file(orig_folder / "NCBIdevelopset_corpus.txt") 

2440 test_data = self.parse_input_file(orig_folder / "NCBItestset_corpus.txt") 

2441 

2442 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2443 conll_writer.write_to_conll(train_data, train_file) 

2444 conll_writer.write_to_conll(dev_data, dev_file) 

2445 conll_writer.write_to_conll(test_data, test_file) 

2446 

2447 super(NCBI_DISEASE, self).__init__( 

2448 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2449 ) 

2450 

2451 @classmethod 

2452 def download_corpus(cls, data_dir: Path) -> Path: 

2453 original_folder = data_dir / "original" 

2454 os.makedirs(str(original_folder), exist_ok=True) 

2455 

2456 data_urls = [ 

2457 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBItrainset_corpus.zip", 

2458 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBIdevelopset_corpus.zip", 

2459 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBItestset_corpus.zip", 

2460 ] 

2461 

2462 for url in data_urls: 

2463 data_path = cached_path(url, original_folder) 

2464 unpack_file(data_path, original_folder) 

2465 

2466 # We need to apply a patch to correct the original training file 

2467 orig_train_file = original_folder / "NCBItrainset_corpus.txt" 

2468 patched_train_file = original_folder / "NCBItrainset_patched.txt" 

2469 cls.patch_training_file(orig_train_file, patched_train_file) 

2470 

2471 return original_folder 

2472 

2473 @staticmethod 

2474 def patch_training_file(orig_train_file: Path, patched_file: Path): 

2475 patch_lines = { 

2476 3249: '10923035\t711\t761\tgeneralized epilepsy and febrile seizures " plus "\tSpecificDisease\tD004829+D003294\n' 

2477 } 

2478 with open(str(orig_train_file), "r", encoding="utf8") as input: 

2479 with open(str(patched_file), "w", encoding="utf8") as output: 

2480 line_no = 1 

2481 

2482 for line in input: 

2483 output.write( 

2484 patch_lines[line_no] if line_no in patch_lines else line 

2485 ) 

2486 line_no += 1 

2487 

2488 @staticmethod 

2489 def parse_input_file(input_file: Path): 

2490 documents = {} 

2491 entities_per_document = {} 

2492 

2493 with open(str(input_file), "r", encoding="utf8") as file: 

2494 document_id = None 

2495 document_text = None 

2496 entities = [] 

2497 

2498 c = 1 

2499 for line in file: 

2500 line = line.strip() 

2501 if not line: 

2502 if document_id and document_text: 

2503 documents[document_id] = document_text 

2504 entities_per_document[document_id] = entities 

2505 

2506 document_id, document_text, entities = None, None, [] 

2507 c = 1 

2508 continue 

2509 if c == 1: 

2510 # Articles title 

2511 document_text = line.split("|")[2] + " " 

2512 document_id = line.split("|")[0] 

2513 elif c == 2: 

2514 # Article abstract 

2515 document_text += line.split("|")[2] 

2516 else: 

2517 # Entity annotations 

2518 columns = line.split("\t") 

2519 start = int(columns[1]) 

2520 end = int(columns[2]) 

2521 entity_text = columns[3] 

2522 

2523 assert document_text[start:end] == entity_text 

2524 entities.append(Entity((start, end), DISEASE_TAG)) 

2525 c += 1 

2526 

2527 if c != 1 and document_id and document_text: 

2528 documents[document_id] = document_text 

2529 entities_per_document[document_id] = entities 

2530 

2531 return InternalBioNerDataset( 

2532 documents=documents, entities_per_document=entities_per_document 

2533 ) 

2534 

2535 
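A self-contained sketch of the PubTator-style layout that parse_input_file expects (PMID, offsets and MeSH id below are made up; offsets are document-level, counting title + single space + abstract):

import tempfile
from pathlib import Path
from flair.datasets.biomedical import NCBI_DISEASE

example = (
    "1234567|t|Mutations causing Duchenne muscular dystrophy\n"
    "1234567|a|We report a new screening protocol.\n"
    "1234567\t18\t45\tDuchenne muscular dystrophy\tSpecificDisease\tD020388\n"
    "\n"
)
with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "sample.txt"
    sample.write_text(example, encoding="utf8")
    dataset = NCBI_DISEASE.parse_input_file(sample)
    print(dataset.documents["1234567"])              # title + " " + abstract
    print(dataset.entities_per_document["1234567"])  # [Disease(18,45)]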

2536class HUNER_DISEASE_NCBI(HunerDataset): 

2537 """ 

2538 HUNER version of the NCBI corpus containing disease annotations. 

2539 """ 

2540 

2541 def __init__(self, *args, **kwargs): 

2542 super().__init__(*args, **kwargs) 

2543 

2544 @staticmethod 

2545 def split_url() -> str: 

2546 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/ncbi" 

2547 

2548 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2549 orig_folder = NCBI_DISEASE.download_corpus(data_dir) 

2550 

2551 train_data = NCBI_DISEASE.parse_input_file( 

2552 orig_folder / "NCBItrainset_patched.txt" 

2553 ) 

2554 dev_data = NCBI_DISEASE.parse_input_file( 

2555 orig_folder / "NCBIdevelopset_corpus.txt" 

2556 ) 

2557 test_data = NCBI_DISEASE.parse_input_file( 

2558 orig_folder / "NCBItestset_corpus.txt" 

2559 ) 

2560 

2561 return merge_datasets([train_data, dev_data, test_data]) 

2562 

2563 

2564class ScaiCorpus(ColumnCorpus): 

2565 """Base class to support the SCAI chemicals and disease corpora""" 

2566 

2567 def __init__( 

2568 self, 

2569 base_path: Union[str, Path] = None, 

2570 in_memory: bool = True, 

2571        sentence_splitter: SentenceSplitter = None,

2572 ): 

2573 """ 

2574 :param base_path: Path to the corpus on your machine 

2575 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2576 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

2577 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2578 """ 

2579 

2580 if type(base_path) == str: 

2581 base_path: Path = Path(base_path) 

2582 

2583 # column format 

2584 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2585 

2586 # this dataset name 

2587 dataset_name = self.__class__.__name__.lower() 

2588 

2589 # default dataset folder is the cache root 

2590 if not base_path: 

2591 base_path = flair.cache_root / "datasets" 

2592 data_folder = base_path / dataset_name 

2593 

2594 if sentence_splitter is None: 

2595 sentence_splitter = SciSpacySentenceSplitter() 

2596 

2597 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2598 

2599 if not (train_file.exists()): 

2600 dataset_file = self.download_corpus(data_folder) 

2601 train_data = self.parse_input_file(dataset_file) 

2602 

2603 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2604 conll_writer.write_to_conll(train_data, train_file) 

2605 

2606 super(ScaiCorpus, self).__init__( 

2607 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2608 ) 

2609 

2610 def download_corpus(self, data_folder: Path) -> Path: 

2611 raise NotImplementedError() 

2612 

2613 @staticmethod 

2614 def parse_input_file(input_file: Path): 

2615 documents = {} 

2616 entities_per_document = {} 

2617 

2618 with open(str(input_file), "r", encoding="iso-8859-1") as file: 

2619 document_id = None 

2620 document_text = None 

2621 entities = [] 

2622 entity_type = None 

2623 

2624 for line in file: 

2625 line = line.strip() 

2626 if not line: 

2627 continue 

2628 

2629 if line[:3] == "###": 

2630                    # Edge case: the previous document ended while an entity was still open

2631 if entity_type is not None: 

2632 entities.append( 

2633 Entity((entity_start, len(document_text)), entity_type) 

2634 ) 

2635 

2636 if not (document_id is None and document_text is None): 

2637 documents[document_id] = document_text 

2638 entities_per_document[document_id] = entities 

2639 

2640 document_id = line.strip("#").strip() 

2641 document_text = None 

2642 entities = [] 

2643 else: 

2644 columns = line.strip().split("\t") 

2645 token = columns[0].strip() 

2646 tag = columns[4].strip().split("|")[1] 

2647 

2648 if tag.startswith("B-"): 

2649 if entity_type is not None: 

2650 entities.append( 

2651 Entity((entity_start, len(document_text)), entity_type) 

2652 ) 

2653 

2654 entity_start = len(document_text) + 1 if document_text else 0 

2655 entity_type = tag[2:] 

2656 

2657 elif tag == "O" and entity_type is not None: 

2658 entities.append( 

2659 Entity((entity_start, len(document_text)), entity_type) 

2660 ) 

2661 entity_type = None 

2662 

2663 document_text = ( 

2664 document_text + " " + token if document_text else token 

2665 ) 

2666 

2667 return InternalBioNerDataset( 

2668 documents=documents, entities_per_document=entities_per_document 

2669 ) 

2670 

2671 
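The SCAI IOB files themselves are not reproduced here; inferred from the parser above, each document starts with a "###"-prefixed id line, token lines are tab-separated with the surface form in the first column and the IOB tag as the second "|"-separated field of the fifth column, and the document text is rebuilt by joining tokens with single spaces, roughly (hypothetical lines):

# ### 10508734
# Adrenaline<TAB>0<TAB>10<TAB>NN<TAB>x|B-TRIVIAL
# increases<TAB>11<TAB>20<TAB>VBZ<TAB>x|O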

2672class SCAI_CHEMICALS(ScaiCorpus): 

2673 """ 

2674 Original SCAI chemicals corpus containing chemical annotations. 

2675 

2676 For further information see Kolářik et al.: 

2677 Chemical Names: Terminological Resources and Corpora Annotation 

2678 https://pub.uni-bielefeld.de/record/2603498 

2679 """ 

2680 

2681 def __init__(self, *args, **kwargs): 

2682 super().__init__(*args, **kwargs) 

2683 

2684 def download_corpus(self, data_dir: Path) -> Path: 

2685 return self.perform_corpus_download(data_dir) 

2686 

2687 @staticmethod 

2688 def perform_corpus_download(data_dir: Path) -> Path: 

2689 original_directory = data_dir / "original" 

2690 os.makedirs(str(original_directory), exist_ok=True) 

2691 

2692 url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/Corpora-for-Chemical-Entity-Recognition/chemicals-test-corpus-27-04-2009-v3_iob.gz" 

2693 data_path = cached_path(url, original_directory) 

2694 corpus_file = original_directory / "chemicals-test-corpus-27-04-2009-v3.iob" 

2695 unpack_file(data_path, corpus_file) 

2696 

2697 return corpus_file 

2698 

2699 

2700class SCAI_DISEASE(ScaiCorpus): 

2701 """ 

2702 Original SCAI disease corpus containing disease annotations. 

2703 

2704 For further information see Gurulingappa et al.: 

2705 An Empirical Evaluation of Resources for the Identification of Diseases and Adverse Effects in Biomedical Literature 

2706 https://pub.uni-bielefeld.de/record/2603398 

2707 """ 

2708 

2709 def __init__(self, *args, **kwargs): 

2710 super().__init__(*args, **kwargs) 

2711 

2712 def download_corpus(self, data_dir: Path) -> Path: 

2713 return self.perform_corpus_download(data_dir) 

2714 

2715 @staticmethod 

2716 def perform_corpus_download(data_dir: Path) -> Path: 

2717 original_directory = data_dir / "original" 

2718 os.makedirs(str(original_directory), exist_ok=True) 

2719 

2720 url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/Disease-ae-corpus.iob" 

2721 data_path = cached_path(url, original_directory) 

2722 

2723 return data_path 

2724 

2725 

2726class HUNER_CHEMICAL_SCAI(HunerDataset): 

2727 """ 

2728 HUNER version of the SCAI chemicals corpus containing chemical annotations. 

2729 """ 

2730 

2731 def __init__(self, *args, **kwargs): 

2732 super().__init__(*args, **kwargs) 

2733 

2734 @staticmethod 

2735 def split_url() -> str: 

2736 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_chemicals" 

2737 

2738 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2739 original_file = SCAI_CHEMICALS.perform_corpus_download(data_dir) 

2740 corpus = ScaiCorpus.parse_input_file(original_file) 

2741 

2742 # Map all entities to chemicals 

2743 entity_mapping = { 

2744 "FAMILY": CHEMICAL_TAG, 

2745 "TRIVIALVAR": CHEMICAL_TAG, 

2746 "PARTIUPAC": CHEMICAL_TAG, 

2747 "TRIVIAL": CHEMICAL_TAG, 

2748 "ABBREVIATION": CHEMICAL_TAG, 

2749 "IUPAC": CHEMICAL_TAG, 

2750 "MODIFIER": CHEMICAL_TAG, 

2751 "SUM": CHEMICAL_TAG, 

2752 } 

2753 

2754 return filter_and_map_entities(corpus, entity_mapping) 

2755 

2756 

2757class HUNER_DISEASE_SCAI(HunerDataset): 

2758 """ 

2759    HUNER version of the SCAI disease corpus containing disease annotations.

2760 """ 

2761 

2762 def __init__(self, *args, **kwargs): 

2763 super().__init__(*args, **kwargs) 

2764 

2765 @staticmethod 

2766 def split_url() -> str: 

2767 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease" 

2768 

2769 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2770 original_file = SCAI_DISEASE.perform_corpus_download(data_dir) 

2771 corpus = ScaiCorpus.parse_input_file(original_file) 

2772 

2773 # Map all entities to disease 

2774 entity_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG} 

2775 

2776 return filter_and_map_entities(corpus, entity_mapping) 

2777 

2778 

2779class OSIRIS(ColumnCorpus): 

2780 """ 

2781 Original OSIRIS corpus containing variation and gene annotations. 

2782 

2783 For further information see Furlong et al.: 

2784 Osiris v1.2: a named entity recognition system for sequence variants of genes in biomedical literature 

2785 https://www.ncbi.nlm.nih.gov/pubmed/18251998 

2786 """ 

2787 

2788 def __init__( 

2789 self, 

2790 base_path: Union[str, Path] = None, 

2791 in_memory: bool = True, 

2792 sentence_splitter: SentenceSplitter = None, 

2793 load_original_unfixed_annotation=False, 

2794 ): 

2795 """ 

2796 :param base_path: Path to the corpus on your machine 

2797 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2798 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which 

2799 segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2800 :param load_original_unfixed_annotation: The original annotation of Osiris 

2801 erroneously annotates two sentences as a protein. Set to True if you don't 

2802 want the fixed version. 

2803 """ 

2804 

2805 if type(base_path) == str: 

2806 base_path: Path = Path(base_path) 

2807 

2808 # column format 

2809 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2810 

2811 # this dataset name 

2812 dataset_name = self.__class__.__name__.lower() 

2813 

2814 # default dataset folder is the cache root 

2815 if not base_path: 

2816 base_path = flair.cache_root / "datasets" 

2817 data_folder = base_path / dataset_name 

2818 

2819 if sentence_splitter is None: 

2820 sentence_splitter = SciSpacySentenceSplitter() 

2821 

2822 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2823 

2824 if not (train_file.exists()): 

2825 corpus_folder = self.download_dataset(data_folder) 

2826 corpus_data = self.parse_dataset( 

2827 corpus_folder, fix_annotation=not load_original_unfixed_annotation 

2828 ) 

2829 

2830 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2831 conll_writer.write_to_conll(corpus_data, train_file) 

2832 

2833 super(OSIRIS, self).__init__( 

2834 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2835 ) 

2836 

2837 @classmethod 

2838 def download_dataset(cls, data_dir: Path) -> Path: 

2839 url = "http://ibi.imim.es/OSIRIScorpusv02.tar" 

2840 data_path = cached_path(url, data_dir) 

2841 unpack_file(data_path, data_dir) 

2842 

2843 return data_dir / "OSIRIScorpusv02" 

2844 

2845 @classmethod 

2846 def parse_dataset(cls, corpus_folder: Path, fix_annotation=True): 

2847 documents = {} 

2848 entities_per_document = {} 

2849 

2850 input_files = [ 

2851 file 

2852 for file in os.listdir(str(corpus_folder)) 

2853 if file.endswith(".txt") and not file.startswith("README") 

2854 ] 

2855 for text_file in input_files: 

2856 

2857 with open(os.path.join(str(corpus_folder), text_file), encoding="utf8") as text_reader: 

2858 document_text = text_reader.read() 

2859 if not document_text: 

2860 continue 

2861 

2862 article_parts = document_text.split("\n\n") 

2863 document_id = article_parts[0] 

2864 text_offset = document_text.find(article_parts[1]) 

2865 document_text = (article_parts[1] + " " + article_parts[2]).strip() 

2866 

2867 with open(os.path.join(str(corpus_folder), text_file + ".ann"), encoding="utf8") as ann_file: 

2868 entities = [] 

2869 

2870 tree = etree.parse(ann_file) 

2871 for annotation in tree.xpath(".//Annotation"): 

2872 entity_type = annotation.get("type") 

2873 if entity_type == "file": 

2874 continue 

2875 

2876 start, end = annotation.get("span").split("..") 

2877 start, end = int(start), int(end) 

2878 

2879 if ( 

2880 fix_annotation 

2881 and text_file == "article46.txt" 

2882 and start == 289 

2883 and end == 644 

2884 ): 

2885 end = 295 

2886 

2887 entities.append( 

2888 Entity((start - text_offset, end - text_offset), entity_type) 

2889 ) 

2890 

2891 documents[document_id] = document_text 

2892 entities_per_document[document_id] = entities 

2893 

2894 return InternalBioNerDataset( 

2895 documents=documents, entities_per_document=entities_per_document 

2896 ) 

2897 

2898 
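A usage sketch showing the annotation-fix switch documented above:

from flair.datasets.biomedical import OSIRIS

corpus = OSIRIS()                                    # with the corrected article46 span
raw = OSIRIS(load_original_unfixed_annotation=True)  # keep the original, erroneous span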

2899class HUNER_GENE_OSIRIS(HunerDataset): 

2900 """ 

2901 HUNER version of the OSIRIS corpus containing (only) gene annotations. 

2902 

2903 """ 

2904 

2905 def __init__(self, *args, **kwargs): 

2906 super().__init__(*args, **kwargs) 

2907 

2908 @staticmethod 

2909 def split_url() -> str: 

2910 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/osiris" 

2911 

2912 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

2913 original_file = OSIRIS.download_dataset(data_dir) 

2914 corpus = OSIRIS.parse_dataset(original_file) 

2915 

2916 entity_type_mapping = {"ge": GENE_TAG} 

2917 return filter_and_map_entities(corpus, entity_type_mapping) 

2918 

2919 

2920class S800(ColumnCorpus): 

2921 """ 

2922    S800 corpus containing species annotations.

2923 For further information see Pafilis et al.: 

2924 The SPECIES and ORGANISMS Resources for Fast and Accurate Identification of Taxonomic Names in Text 

2925 http://www.plosone.org/article/info:doi%2F10.1371%2Fjournal.pone.0065390 

2926 """ 

2927 

2928 def __init__( 

2929 self, 

2930 base_path: Union[str, Path] = None, 

2931 in_memory: bool = True, 

2932 sentence_splitter: SentenceSplitter = None, 

2933 ): 

2934 """ 

2935 :param base_path: Path to the corpus on your machine 

2936 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

2937 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

2938 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

2939 """ 

2940 

2941 if type(base_path) == str: 

2942 base_path: Path = Path(base_path) 

2943 

2944 # column format 

2945 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

2946 

2947 # this dataset name 

2948 dataset_name = self.__class__.__name__.lower() 

2949 

2950 # default dataset folder is the cache root 

2951 if not base_path: 

2952 base_path = flair.cache_root / "datasets" 

2953 data_folder = base_path / dataset_name 

2954 

2955 if sentence_splitter is None: 

2956 sentence_splitter = SciSpacySentenceSplitter() 

2957 

2958 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

2959 

2960 if not (train_file.exists()): 

2961 download_dir = data_folder / "original" 

2962 os.makedirs(download_dir, exist_ok=True) 

2963 self.download_dataset(download_dir) 

2964 

2965 all_data = self.parse_dataset(download_dir) 

2966 

2967 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

2968 conll_writer.write_to_conll(all_data, train_file) 

2969 

2970 super(S800, self).__init__( 

2971 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

2972 ) 

2973 

2974 @staticmethod 

2975 def download_dataset(data_dir: Path): 

2976 data_url = "https://species.jensenlab.org/files/S800-1.0.tar.gz" 

2977 data_path = cached_path(data_url, data_dir) 

2978 unpack_file(data_path, data_dir) 

2979 

2980 @staticmethod 

2981 def parse_dataset(data_dir: Path) -> InternalBioNerDataset: 

2982 entities_per_document = defaultdict(list) 

2983 texts_per_document = {} 

2984 with (data_dir / "S800.tsv").open(encoding="utf8") as f: 

2985 for line in f: 

2986 fields = line.strip().split("\t") 

2987 if not fields: 

2988 continue 

2989 fname, pmid = fields[1].split(":") 

2990 start, end = int(fields[2]), int(fields[3]) 

2991 

2992 if start == end: 

2993 continue 

2994 

2995 entities_per_document[fname].append(Entity((start, end), "Species")) 

2996 

2997 for fname in entities_per_document: 

2998 with (data_dir / "abstracts" / fname).with_suffix(".txt").open(encoding="utf8") as f: 

2999 texts_per_document[fname] = f.read() 

3000 

3001 return InternalBioNerDataset( 

3002 documents=texts_per_document, entities_per_document=entities_per_document 

3003 ) 

3004 

3005 
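The original S800.tsv is not shown here; inferred from the parser above, every line ties one species mention to an abstract file via a "<fname>:<pmid>" key and gives character offsets into abstracts/<fname>.txt, with zero-length spans skipped, roughly:

# <taxon id><TAB><fname>:<pmid><TAB><start><TAB><end><TAB><mention text>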

3006class HUNER_SPECIES_S800(HunerDataset): 

3007 """ 

3008 HUNER version of the S800 corpus containing species annotations. 

3009 """ 

3010 def __init__(self, *args, **kwargs): 

3011 super().__init__(*args, **kwargs) 

3012 

3013 @staticmethod 

3014 def split_url() -> str: 

3015 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/s800" 

3016 

3017 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

3018 S800.download_dataset(data_dir) 

3019 data = S800.parse_dataset(data_dir) 

3020 data = filter_and_map_entities(data, {"Species": SPECIES_TAG}) 

3021 

3022 return data 

3023 

3024 

3025class GPRO(ColumnCorpus): 

3026 """ 

3027 Original GPRO corpus containing gene annotations. 

3028 

3029 For further information see: 

3030 https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/ 

3031 """ 

3032 

3033 def __init__( 

3034 self, 

3035 base_path: Union[str, Path] = None, 

3036 in_memory: bool = True, 

3037 sentence_splitter: SentenceSplitter = None, 

3038 ): 

3039 """ 

3040 :param base_path: Path to the corpus on your machine 

3041 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3042 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

3043 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

3044 """ 

3045 

3046 if type(base_path) == str: 

3047 base_path: Path = Path(base_path) 

3048 

3049 # column format 

3050 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3051 

3052 # this dataset name 

3053 dataset_name = self.__class__.__name__.lower() 

3054 

3055 # default dataset folder is the cache root 

3056 if not base_path: 

3057 base_path = flair.cache_root / "datasets" 

3058 data_folder = base_path / dataset_name 

3059 

3060 if sentence_splitter is None: 

3061 sentence_splitter = SciSpacySentenceSplitter() 

3062 

3063 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3064 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

3065 

3066 if not (train_file.exists() and dev_file.exists()): 

3067 train_folder = self.download_train_corpus(data_folder) 

3068 train_text_file = train_folder / "chemdner_patents_train_text.txt" 

3069 train_ann_file = train_folder / "chemdner_gpro_gold_standard_train_v02.tsv" 

3070 train_data = self.parse_input_file(train_text_file, train_ann_file) 

3071 

3072 dev_folder = self.download_dev_corpus(data_folder) 

3073 dev_text_file = dev_folder / "chemdner_patents_development_text.txt" 

3074 dev_ann_file = dev_folder / "chemdner_gpro_gold_standard_development.tsv" 

3075 dev_data = self.parse_input_file(dev_text_file, dev_ann_file) 

3076 

3077 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3078 conll_writer.write_to_conll(train_data, train_file) 

3079 conll_writer.write_to_conll(dev_data, dev_file) 

3080 

3081 super(GPRO, self).__init__( 

3082 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3083 ) 

3084 

3085 @classmethod 

3086 def download_train_corpus(cls, data_dir: Path) -> Path: 

3087 corpus_dir = data_dir / "original" 

3088 os.makedirs(str(corpus_dir), exist_ok=True) 

3089 

3090 train_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/gpro_training_set_v02.tar.gz" 

3091 data_path = cached_path(train_url, corpus_dir) 

3092 unpack_file(data_path, corpus_dir) 

3093 

3094 return corpus_dir / "gpro_training_set_v02" 

3095 

3096 @classmethod 

3097 def download_dev_corpus(cls, data_dir) -> Path: 

3098 corpus_dir = data_dir / "original" 

3099 os.makedirs(str(corpus_dir), exist_ok=True) 

3100 

3101 dev_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/gpro_development_set.tar.gz" 

3102 data_path = cached_path(dev_url, corpus_dir) 

3103 unpack_file(data_path, corpus_dir) 

3104 

3105 return corpus_dir / "gpro_development_set" 

3106 

3107 @staticmethod 

3108 def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: 

3109 documents = {} 

3110 entities_per_document = {} 

3111 

3112 document_title_length = {} 

3113 

3114 with open(str(text_file), "r", encoding="utf8") as text_reader: 

3115 for line in text_reader: 

3116 if not line: 

3117 continue 

3118 

3119 document_id, title, abstract = line.split("\t") 

3120 documents[document_id] = title + " " + abstract 

3121 document_title_length[document_id] = len(title) + 1 

3122 

3123 entities_per_document[document_id] = [] 

3124 

3125 with open(str(ann_file), "r", encoding="utf8") as ann_reader: 

3126 for line in ann_reader: 

3127 if not line: 

3128 continue 

3129 

3130 columns = line.split("\t") 

3131 document_id = columns[0] 

3132 start, end = int(columns[2]), int(columns[3]) 

3133 

3134 if columns[1] == "A": 

3135 start = start + document_title_length[document_id] 

3136 end = end + document_title_length[document_id] 

3137 

3138 entities_per_document[document_id].append( 

3139 Entity((start, end), GENE_TAG) 

3140 ) 

3141 

3142 document_text = documents[document_id] 

3143 assert columns[4] == document_text[start:end] 

3144 

3145 return InternalBioNerDataset( 

3146 documents=documents, entities_per_document=entities_per_document 

3147 ) 

3148 

3149 
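A worked toy example (hypothetical title and abstract) of the offset shift in parse_input_file above: abstract annotations (type "A") are given relative to the abstract, while documents are stored as title + " " + abstract, so their spans are moved right by len(title) + 1:

title = "Antibodies against EGFR"
abstract = "EGFR signalling was blocked."
document = title + " " + abstract
abstract_span = (0, 4)                     # "EGFR" in the abstract
shift = len(title) + 1                     # 24
document_span = (abstract_span[0] + shift, abstract_span[1] + shift)
assert document[document_span[0]:document_span[1]] == "EGFR"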

3150class HUNER_GENE_GPRO(HunerDataset): 

3151 """ 

3152 HUNER version of the GPRO corpus containing gene annotations. 

3153 """ 

3154 

3155 def __init__(self, *args, **kwargs): 

3156 super().__init__(*args, **kwargs) 

3157 

3158 @staticmethod 

3159 def split_url() -> str: 

3160 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/gpro" 

3161 

3162 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

3163 train_folder = GPRO.download_train_corpus(data_dir) 

3164 train_text_file = train_folder / "chemdner_patents_train_text.txt" 

3165 train_ann_file = train_folder / "chemdner_gpro_gold_standard_train_v02.tsv" 

3166 train_data = GPRO.parse_input_file(train_text_file, train_ann_file) 

3167 

3168 dev_folder = GPRO.download_dev_corpus(data_dir) 

3169 dev_text_file = dev_folder / "chemdner_patents_development_text.txt" 

3170 dev_ann_file = dev_folder / "chemdner_gpro_gold_standard_development.tsv" 

3171 dev_data = GPRO.parse_input_file(dev_text_file, dev_ann_file) 

3172 

3173 return merge_datasets([train_data, dev_data]) 

3174 

3175 

3176class DECA(ColumnCorpus): 

3177 """ 

3178 Original DECA corpus containing gene annotations. 

3179 

3180 For further information see Wang et al.: 

3181 Disambiguating the species of biomedical named entities using natural language parsers 

3182 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/ 

3183 """ 

3184 

3185 def __init__( 

3186 self, 

3187 base_path: Union[str, Path] = None, 

3188 in_memory: bool = True, 

3189 sentence_splitter: SentenceSplitter = None, 

3190 ): 

3191 """ 

3192 :param base_path: Path to the corpus on your machine 

3193 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3194 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

3195        documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)

3196 """ 

3197 

3198 if type(base_path) == str: 

3199 base_path: Path = Path(base_path) 

3200 

3201 # column format 

3202 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3203 

3204 # this dataset name 

3205 dataset_name = self.__class__.__name__.lower() 

3206 

3207 # default dataset folder is the cache root 

3208 if not base_path: 

3209 base_path = flair.cache_root / "datasets" 

3210 data_folder = base_path / dataset_name 

3211 

3212 if sentence_splitter is None: 

3213 sentence_splitter = SciSpacySentenceSplitter() 

3214 

3215 train_file = data_folder / "train.conll" 

3216 

3217 if not train_file.exists(): 

3218 corpus_dir = self.download_corpus(data_folder) 

3219 text_dir = corpus_dir / "text" 

3220 gold_file = corpus_dir / "gold.txt" 

3221 

3222 corpus_data = self.parse_corpus(text_dir, gold_file) 

3223 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3224 conll_writer.write_to_conll(corpus_data, train_file) 

3225 

3226 super(DECA, self).__init__( 

3227 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3228 ) 

3229 

3230 @classmethod 

3231 def download_corpus(cls, data_dir: Path) -> Path: 

3232 url = "http://www.nactem.ac.uk/deca/species_corpus_0.2.tar.gz" 

3233 data_path = cached_path(url, data_dir) 

3234 unpack_file(data_path, data_dir) 

3235 

3236 return data_dir / "species_corpus_0.2" 

3237 

3238 @staticmethod 

3239 def parse_corpus(text_dir: Path, gold_file: Path) -> InternalBioNerDataset: 

3240 documents = {} 

3241 entities_per_document = {} 

3242 

3243 text_files = [ 

3244 file for file in os.listdir(str(text_dir)) if not file.startswith(".") 

3245 ] 

3246 

3247 for file in text_files: 

3248 document_id = file.strip(".txt") 

3249 with open(os.path.join(str(text_dir), file), "r", encoding="utf8") as text_file: 

3250 documents[document_id] = text_file.read().strip() 

3251 entities_per_document[document_id] = [] 

3252 

3253 with open(str(gold_file), "r", encoding="utf8") as gold_reader: 

3254 for line in gold_reader: 

3255 if not line: 

3256 continue 

3257 columns = line.strip().split("\t") 

3258 

3259 document_id = columns[0].strip(".txt") 

3260 start, end = int(columns[1]), int(columns[2]) 

3261 

3262 entities_per_document[document_id].append( 

3263 Entity((start, end), GENE_TAG) 

3264 ) 

3265 

3266 document_text = documents[document_id] 

3267 assert document_text[start:end] == columns[3] 

3268 

3269 return InternalBioNerDataset( 

3270 documents=documents, entities_per_document=entities_per_document 

3271 ) 

3272 

3273 
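# Hedged usage sketch (added for illustration, not part of the original corpus code):
# loading DECA with an explicit sentence splitter. Assumes network access for the
# first download and an installed SciSpaCy model; the function name is illustrative.
def _example_load_deca():  # pragma: no cover - illustration only
    # The default splitter is SciSpacySentenceSplitter; any other SentenceSplitter
    # implementation changes how the documents are segmented before CoNLL export.
    corpus = DECA(sentence_splitter=SciSpacySentenceSplitter())
    print(corpus)           # summary of the corpus splits
    print(corpus.train[0])  # first sentence with its gene annotations
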

3274class HUNER_GENE_DECA(HunerDataset): 

3275 """ 

3276 HUNER version of the DECA corpus containing gene annotations. 

3277 """ 

3278 

3279 def __init__(self, *args, **kwargs): 

3280 super().__init__(*args, **kwargs) 

3281 

3282 @staticmethod 

3283 def split_url() -> str: 

3284 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/deca" 

3285 

3286 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

3287 corpus_dir = DECA.download_corpus(data_dir) 

3288 text_dir = corpus_dir / "text" 

3289 gold_file = corpus_dir / "gold.txt" 

3290 

3291 return DECA.parse_corpus(text_dir, gold_file) 

3292 

3293 

3294class FSU(ColumnCorpus): 

3295 """ 

3296 Original FSU corpus containing protein and derived annotations. 

3297 

3298 For further information see Hahn et al.: 

3299 A proposal for a configurable silver standard 

3300 https://www.aclweb.org/anthology/W10-1838/ 

3301 """ 

3302 

3303 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

3304 """ 

3305 :param base_path: Path to the corpus on your machine 

3306 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3307 """ 

3308 

3309 if type(base_path) == str: 

3310 base_path: Path = Path(base_path) 

3311 

3312 # column format 

3313 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3314 

3315 # this dataset name 

3316 dataset_name = self.__class__.__name__.lower() 

3317 

3318 # default dataset folder is the cache root 

3319 if not base_path: 

3320 base_path = flair.cache_root / "datasets" 

3321 data_folder = base_path / dataset_name 

3322 

3323 sentence_splitter = TagSentenceSplitter( 

3324 tag=SENTENCE_TAG, tokenizer=SpaceTokenizer() 

3325 ) 

3326 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3327 

3328 if not train_file.exists(): 

3329 corpus_dir = self.download_corpus(data_folder) 

3330 corpus_data = self.parse_corpus(corpus_dir, SENTENCE_TAG) 

3331 

3332 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3333 conll_writer.write_to_conll(corpus_data, train_file) 

3334 

3335 super(FSU, self).__init__( 

3336 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3337 ) 

3338 

3339 @classmethod 

3340 def download_corpus(cls, data_dir: Path) -> Path: 

3341 url = "https://julielab.de/downloads/resources/fsu_prge_release_v1_0.tgz" 

3342 data_path = cached_path(url, data_dir) 

3343 unpack_file(data_path, data_dir, mode="targz") 

3344 

3345 return data_dir / "fsu-prge-release-v1.0" 

3346 

3347 @staticmethod 

3348 def parse_corpus( 

3349 corpus_dir: Path, sentence_separator: str 

3350 ) -> InternalBioNerDataset: 

3351 documents = {} 

3352 entities_per_document = {} 

3353 

3354 for subcorpus in corpus_dir.iterdir(): 

3355 if not subcorpus.is_dir(): 

3356 continue 

3357 for doc in (subcorpus / "mmax").iterdir(): 

3358 if not doc.is_dir(): 

3359 continue 

3360 try: 

3361 with open(doc / "Basedata" / "Basedata.xml", "r", encoding="utf8") as word_f: 

3362 word_tree = etree.parse(word_f) 

3363 with open(doc / "Markables" / "sentence.xml", "r", encoding="utf8") as sentence_f: 

3364 sentence_tree = etree.parse(sentence_f).getroot() 

3365 with open(doc / "Markables" / "proteins.xml", "r", encoding="utf8") as protein_f: 

3366 protein_tree = etree.parse(protein_f).getroot() 

3367 with open(doc / "Basedata.uri", "r", encoding="utf8") as id_f: 

3368 document_id = id_f.read().strip() 

3369 except FileNotFoundError: 

3370 # Incomplete article 

3371 continue 

3372 except XMLSyntaxError: 

3373 # Invalid XML syntax 

3374 continue 

3375 

3376 word_to_id = {} 

3377 words = [] 

3378 for i, token in enumerate(word_tree.xpath(".//word")): 

3379 words += [token.text] 

3380 word_to_id[token.get("id")] = i 

3381 word_pos = [(0, 0) for _ in words] 

3382 

3383 sentences_id_span = sorted( 

3384 [ 

3385 (int(sentence.get("id").split("_")[-1]), sentence.get("span")) 

3386 for sentence in sentence_tree 

3387 ] 

3388 ) 

3389 

3390 sentences = [] 

3391 for j, sentence in enumerate(sentences_id_span): 

3392 tmp_sentence = [] 

3393 akt_pos = 0 

3394 start = word_to_id[sentence[1].split("..")[0]] 

3395 end = word_to_id[sentence[1].split("..")[1]] 

3396 for i in range(start, end + 1): 

3397 tmp_sentence += [words[i]] 

3398 word_pos[i] = (j, akt_pos) 

3399 akt_pos += len(words[i]) + 1 

3400 sentences += [tmp_sentence] 

3401 

3402 pre_entities = [[] for _ in sentences] 

3403 for protein in protein_tree: 

3404 for span in protein.get("span").split(","): 

3405 start = word_to_id[span.split("..")[0]] 

3406 end = word_to_id[span.split("..")[-1]] 

3407 pre_entities[word_pos[start][0]] += [ 

3408 ( 

3409 word_pos[start][1], 

3410 word_pos[end][1] + len(words[end]), 

3411 protein.get("proteins"), 

3412 ) 

3413 ] 

3414 

3415 sentences = [" ".join(sentence) for sentence in sentences] 

3416 document = sentence_separator.join(sentences) 

3417 

3418 entities = [] 

3419 sent_offset = 0 

3420 for sentence, sent_entities in zip(sentences, pre_entities): 

3421 entities += [ 

3422 Entity( 

3423 (entity[0] + sent_offset, entity[1] + sent_offset), 

3424 entity[2], 

3425 ) 

3426 for entity in sent_entities 

3427 ] 

3428 sent_offset += len(sentence) + len(sentence_separator) 

3429 

3430 documents[document_id] = document 

3431 entities_per_document[document_id] = entities 

3432 

3433 return InternalBioNerDataset( 

3434 documents=documents, entities_per_document=entities_per_document 

3435 ) 

3436 

3437 
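# Hedged worked example (added for illustration, not part of the original corpus code):
# how FSU.parse_corpus above turns sentence-relative entity offsets into document-level
# offsets when the sentences are re-joined with a separator (the SENTENCE_TAG marker
# that TagSentenceSplitter later splits on again). All values below are made up.
def _fsu_offset_example():  # pragma: no cover - illustration only
    sentences = ["p53 binds DNA", "It regulates MDM2"]
    document = SENTENCE_TAG.join(sentences)
    # span (13, 17) of "MDM2" relative to the second sentence
    sent_offset = len(sentences[0]) + len(SENTENCE_TAG)
    start, end = 13 + sent_offset, 17 + sent_offset
    assert document[start:end] == "MDM2"
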

3438class HUNER_GENE_FSU(HunerDataset): 

3439 """ 

3440 HUNER version of the FSU corpus containing (only) gene annotations. 

3441 """ 

3442 

3443 def __init__(self, *args, **kwargs): 

3444 super().__init__(*args, **kwargs) 

3445 

3446 @staticmethod 

3447 def split_url() -> str: 

3448 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/fsu" 

3449 

3450 def get_corpus_sentence_splitter(self) -> SentenceSplitter: 

3451 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer()) 

3452 

3453 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

3454 corpus_dir = FSU.download_corpus(data_dir) 

3455 

3456 sentence_separator = " " 

3457 if isinstance(self.sentence_splitter, TagSentenceSplitter): 

3458 sentence_separator = self.sentence_splitter.tag 

3459 

3460 corpus = FSU.parse_corpus(corpus_dir, sentence_separator) 

3461 

3462 entity_type_mapping = { 

3463 "protein": GENE_TAG, 

3464 "protein_familiy_or_group": GENE_TAG,  # sic: spelling follows the corpus' own annotation labels 

3465 "protein_complex": GENE_TAG, 

3466 "protein_variant": GENE_TAG, 

3467 "protein_enum": GENE_TAG, 

3468 } 

3469 return filter_and_map_entities(corpus, entity_type_mapping) 

3470 

3471 

3472class CRAFT(ColumnCorpus): 

3473 """ 

3474 Original CRAFT corpus (version 2.0) containing all but the coreference and sections/typography annotations. 

3475 

3476 For further information see Bada et al.: 

3477 Concept annotation in the craft corpus 

3478 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161 

3479 """ 

3480 

3481 def __init__( 

3482 self, 

3483 base_path: Union[str, Path] = None, 

3484 in_memory: bool = True, 

3485 sentence_splitter: SentenceSplitter = None, 

3486 ): 

3487 """ 

3488 :param base_path: Path to the corpus on your machine 

3489 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3490 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

3491 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

3492 """ 

3493 

3494 if type(base_path) == str: 

3495 base_path: Path = Path(base_path) 

3496 

3497 # column format 

3498 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3499 

3500 # this dataset name 

3501 dataset_name = self.__class__.__name__.lower() 

3502 

3503 # default dataset folder is the cache root 

3504 if not base_path: 

3505 base_path = flair.cache_root / "datasets" 

3506 data_folder = base_path / dataset_name 

3507 

3508 if sentence_splitter is None: 

3509 sentence_splitter = SciSpacySentenceSplitter() 

3510 

3511 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3512 

3513 if not train_file.exists(): 

3514 corpus_dir = self.download_corpus(data_folder) 

3515 corpus_data = self.parse_corpus(corpus_dir) 

3516 

3517 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3518 conll_writer.write_to_conll(corpus_data, train_file) 

3519 

3520 super(CRAFT, self).__init__( 

3521 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3522 ) 

3523 

3524 @classmethod 

3525 def download_corpus(cls, data_dir: Path) -> Path: 

3526 url = "http://sourceforge.net/projects/bionlp-corpora/files/CRAFT/v2.0/craft-2.0.tar.gz/download" 

3527 data_path = cached_path(url, data_dir) 

3528 unpack_file(data_path, data_dir, mode="targz") 

3529 

3530 return data_dir / "craft-2.0" 

3531 

3532 @staticmethod 

3533 def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: 

3534 documents = {} 

3535 entities_per_document = {} 

3536 

3537 text_dir = corpus_dir / "articles" / "txt" 

3538 document_texts = [doc for doc in text_dir.iterdir() if doc.name[-4:] == ".txt"] 

3539 annotation_dirs = [ 

3540 path 

3541 for path in (corpus_dir / "xml").iterdir() 

3542 if path.name not in ["sections-and-typography", "coreference"] 

3543 ] 

3544 

3545 for doc in Tqdm.tqdm(document_texts, desc="Converting to internal"): 

3546 document_id = doc.name.split(".")[0] 

3547 

3548 with open(doc, "r", encoding="utf8") as f_txt: 

3549 documents[document_id] = f_txt.read() 

3550 

3551 entities = [] 

3552 

3553 for annotation_dir in annotation_dirs: 

3554 with open( 

3555 annotation_dir / (doc.name + ".annotations.xml"), "r", encoding="utf8" 

3556 ) as f_ann: 

3557 ann_tree = etree.parse(f_ann) 

3558 for annotation in ann_tree.xpath("//annotation"): 

3559 for span in annotation.xpath("span"): 

3560 start = int(span.get("start")) 

3561 end = int(span.get("end")) 

3562 entities += [Entity((start, end), annotation_dir.name)] 

3563 

3564 entities_per_document[document_id] = entities 

3565 

3566 return InternalBioNerDataset( 

3567 documents=documents, entities_per_document=entities_per_document 

3568 ) 

3569 

3570 
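# Hedged worked example (added for illustration, not part of the original corpus code):
# the span extraction used in CRAFT.parse_corpus above, applied to a tiny, made-up
# Knowtator-style annotation snippet. In the real corpus the entity type is taken
# from the name of the annotation sub-directory (e.g. "CHEBI" or "PR").
def _craft_annotation_example():  # pragma: no cover - illustration only
    xml = b'<annotations><annotation><span start="3" end="9"/></annotation></annotations>'
    tree = etree.fromstring(xml)
    spans = [
        (int(span.get("start")), int(span.get("end")))
        for span in tree.xpath("//annotation/span")
    ]
    assert spans == [(3, 9)]
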

3571class BIOSEMANTICS(ColumnCorpus): 

3572 """ 

3573 Original Biosemantics corpus. 

3574 

3575 For further information see Akhondi et al.: 

3576 Annotated chemical patent corpus: a gold standard for text mining 

3577 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4182036/ 

3578 """ 

3579 

3580 def __init__( 

3581 self, 

3582 base_path: Union[str, Path] = None, 

3583 in_memory: bool = True, 

3584 sentence_splitter: SentenceSplitter = None, 

3585 ): 

3586 """ 

3587 :param base_path: Path to the corpus on your machine 

3588 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3589 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

3590 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

3591 """ 

3592 if type(base_path) == str: 

3593 base_path: Path = Path(base_path) 

3594 

3595 # column format 

3596 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3597 

3598 # this dataset name 

3599 dataset_name = self.__class__.__name__.lower() 

3600 

3601 # default dataset folder is the cache root 

3602 if not base_path: 

3603 base_path = flair.cache_root / "datasets" 

3604 data_folder = base_path / dataset_name 

3605 

3606 if sentence_splitter is None: 

3607 sentence_splitter = SciSpacySentenceSplitter() 

3608 

3609 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3610 

3611 if not (train_file.exists()): 

3612 corpus_dir = self.download_dataset(data_folder) 

3613 full_dataset = self.parse_dataset(corpus_dir) 

3614 

3615 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3616 conll_writer.write_to_conll(full_dataset, train_file) 

3617 

3618 super(BIOSEMANTICS, self).__init__( 

3619 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3620 ) 

3621 

3622 @staticmethod 

3623 def download_dataset(data_dir: Path) -> Path: 

3624 data_url = "http://biosemantics.erasmusmc.nl/PatentCorpus/Patent_Corpus.rar" 

3625 data_path = cached_path(data_url, data_dir) 

3626 unpack_file(data_path, data_dir) 

3627 

3628 return data_dir / "Patent_Corpus" 

3629 

3630 @staticmethod 

3631 def parse_dataset(data_dir: Path) -> InternalBioNerDataset: 

3632 base_folder = data_dir / "Full_set" 

3633 

3634 dirs = [ 

3635 file 

3636 for file in os.listdir(str(base_folder)) 

3637 if os.path.isdir(os.path.join(str(base_folder), file)) 

3638 ] 

3639 

3640 text_files = [] 

3641 for directory in dirs: 

3642 text_files += [ 

3643 os.path.join(str(base_folder), directory, file) 

3644 for file in os.listdir(os.path.join(str(base_folder), directory)) 

3645 if file[-4:] == ".txt" 

3646 ] 

3647 text_files = sorted(text_files) 

3648 

3649 documents = {} 

3650 entities_per_document = {} 

3651 

3652 for text_file in sorted(text_files): 

3653 document_id = os.path.basename(text_file).split("_")[0] 

3654 with open(text_file, "r", encoding="utf8") as file_reader: 

3655 file_text = file_reader.read().replace("\n", " ") 

3656 

3657 offset = 0 

3658 document_text = "" 

3659 if document_id in documents: 

3660 document_text = documents[document_id] + " " 

3661 offset = len(document_text) 

3662 

3663 tmp_document_text = document_text + file_text 

3664 

3665 entities = [] 

3666 dirty_file = False 

3667 with open(text_file[:-4] + ".ann", encoding="utf8") as file_reader: 

3668 for line in file_reader: 

3669 if line[-1] == "\n": 

3670 line = line[:-1] 

3671 if not line: 

3672 continue 

3673 

3674 columns = line.split("\t") 

3675 mid = columns[1].split() 

3676 # if len(mid) != 3: 

3677 # continue 

3678 

3679 entity_type, start, end = mid[0], mid[1], mid[2] 

3680 start, end = int(start.split(";")[0]), int(end.split(";")[0]) 

3681 

3682 if start == end: 

3683 continue 

3684 

3685 # Try to fix entity offsets 

3686 if tmp_document_text[offset + start : offset + end] != columns[2]: 

3687 alt_text = tmp_document_text[ 

3688 offset + start : offset + start + len(columns[2]) 

3689 ] 

3690 if alt_text == columns[2]: 

3691 end = start + len(columns[2]) 

3692 

3693 if file_text[start:end] != columns[2]: 

3694 dirty_file = True 

3695 continue 

3696 

3697 if tmp_document_text[offset + start : offset + end] != columns[2]: 

3698 dirty_file = True 

3699 continue 

3700 

3701 entities.append(Entity((offset + start, offset + end), entity_type)) 

3702 

3703 if not dirty_file: 

3704 documents[document_id] = tmp_document_text 

3705 if document_id in entities_per_document: 

3706 entities_per_document[document_id] += entities 

3707 else: 

3708 entities_per_document[document_id] = entities 

3709 

3710 return InternalBioNerDataset( 

3711 documents=documents, entities_per_document=entities_per_document 

3712 ) 

3713 

3714 

3715class BC2GM(ColumnCorpus): 

3716 """ 

3717 Original BioCreative-II-GM corpus containing gene annotations. 

3718 

3719 For further information see Smith et al.: 

3720 Overview of BioCreative II gene mention recognition 

3721 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/ 

3722 """ 

3723 

3724 def __init__( 

3725 self, 

3726 base_path: Union[str, Path] = None, 

3727 in_memory: bool = True, 

3728 sentence_splitter: SentenceSplitter = None, 

3729 ): 

3730 """ 

3731 :param base_path: Path to the corpus on your machine 

3732 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3733 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

3734 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

3735 """ 

3736 if type(base_path) == str: 

3737 base_path: Path = Path(base_path) 

3738 

3739 # column format 

3740 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3741 

3742 # this dataset name 

3743 dataset_name = self.__class__.__name__.lower() 

3744 

3745 # default dataset folder is the cache root 

3746 if not base_path: 

3747 base_path = flair.cache_root / "datasets" 

3748 data_folder = base_path / dataset_name 

3749 

3750 if sentence_splitter is None: 

3751 sentence_splitter = SciSpacySentenceSplitter() 

3752 

3753 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3754 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

3755 

3756 if not (train_file.exists() and test_file.exists()): 

3757 data_folder = self.download_dataset(data_folder) 

3758 train_data = self.parse_train_dataset(data_folder) 

3759 test_data = self.parse_test_dataset(data_folder) 

3760 

3761 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3762 conll_writer.write_to_conll(train_data, train_file) 

3763 conll_writer.write_to_conll(test_data, test_file) 

3764 

3765 super(BC2GM, self).__init__( 

3766 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3767 ) 

3768 

3769 @staticmethod 

3770 def download_dataset(data_dir: Path) -> Path: 

3771 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2011/bc2GMtrain_1.1.tar.gz" 

3772 data_path = cached_path(data_url, data_dir) 

3773 unpack_file(data_path, data_dir) 

3774 

3775 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2011/bc2GMtest_1.0.tar.gz" 

3776 data_path = cached_path(data_url, data_dir) 

3777 unpack_file(data_path, data_dir) 

3778 

3779 return data_dir 

3780 

3781 @classmethod 

3782 def parse_train_dataset(cls, data_folder: Path) -> InternalBioNerDataset: 

3783 train_text_file = data_folder / "bc2geneMention" / "train" / "train.in" 

3784 train_ann_file = data_folder / "bc2geneMention" / "train" / "GENE.eval" 

3785 

3786 return cls.parse_dataset(train_text_file, train_ann_file) 

3787 

3788 @classmethod 

3789 def parse_test_dataset(cls, data_folder: Path) -> InternalBioNerDataset: 

3790 test_text_file = data_folder / "BC2GM" / "test" / "test.in" 

3791 test_ann_file = data_folder / "BC2GM" / "test" / "GENE.eval" 

3792 

3793 return cls.parse_dataset(test_text_file, test_ann_file) 

3794 

3795 @staticmethod 

3796 def parse_dataset(text_file: Path, ann_file: Path) -> InternalBioNerDataset: 

3797 documents = {} 

3798 entities_per_document = {} 

3799 

3800 with open(str(text_file), "r", encoding="utf8") as text_file_reader: 

3801 for line in text_file_reader: 

3802 line = line.strip() 

3803 offset = line.find(" ") 

3804 document_id = line[:offset] 

3805 document_text = line[offset + 1 :] 

3806 documents[document_id] = document_text 

3807 entities_per_document[document_id] = [] 

3808 

3809 with open(str(ann_file), "r", encoding="utf8") as ann_file_reader: 

3810 for line in ann_file_reader: 

3811 columns = line.strip().split("|") 

3812 document_id = columns[0] 

3813 document_text = documents[document_id] 

3814 

3815 start_idx, end_idx = [int(i) for i in columns[1].split()] 

3816 

3817 non_whitespaces_chars = 0 

3818 new_start_idx = None 

3819 new_end_idx = None 

3820 for i, char in enumerate(document_text): 

3821 if char != " ": 

3822 non_whitespaces_chars += 1 

3823 if new_start_idx is None and non_whitespaces_chars == start_idx + 1: 

3824 new_start_idx = i 

3825 if non_whitespaces_chars == end_idx + 1: 

3826 new_end_idx = i + 1 

3827 break 

3828 

3829 mention_text = document_text[new_start_idx:new_end_idx] 

3830 if mention_text != columns[2] and mention_text.startswith("/"): 

3831 # There is still one illegal annotation in the file .. 

3832 new_start_idx += 1 

3833 

3834 entities_per_document[document_id].append( 

3835 Entity((new_start_idx, new_end_idx), GENE_TAG) 

3836 ) 

3837 

3838 assert document_text[new_start_idx:new_end_idx] == columns[2] 

3839 

3840 return InternalBioNerDataset( 

3841 documents=documents, entities_per_document=entities_per_document 

3842 ) 

3843 

3844 
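# Hedged worked example (added for illustration, not part of the original corpus code):
# BC2GM's GENE.eval offsets count only non-whitespace characters, so parse_dataset
# above remaps them onto ordinary string indices. The sentence and offsets below
# are made up.
def _bc2gm_offset_example():  # pragma: no cover - illustration only
    text = "the BRCA1 gene"
    start_idx, end_idx = 3, 7  # "BRCA1" spans the 4th to 8th non-space character
    non_ws, new_start, new_end = 0, None, None
    for i, char in enumerate(text):
        if char != " ":
            non_ws += 1
            if new_start is None and non_ws == start_idx + 1:
                new_start = i
            if non_ws == end_idx + 1:
                new_end = i + 1
                break
    assert text[new_start:new_end] == "BRCA1"
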

3845class HUNER_GENE_BC2GM(HunerDataset): 

3846 """ 

3847 HUNER version of the BioCreative-II-GM corpus containing gene annotations. 

3848 """ 

3849 

3850 def __init__(self, *args, **kwargs): 

3851 super().__init__( 

3852 *args, **kwargs, 

3853 ) 

3854 

3855 @staticmethod 

3856 def split_url() -> str: 

3857 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bc2gm" 

3858 

3859 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

3860 data_dir = BC2GM.download_dataset(data_dir) 

3861 train_data = BC2GM.parse_train_dataset(data_dir) 

3862 test_data = BC2GM.parse_test_dataset(data_dir) 

3863 

3864 return merge_datasets([train_data, test_data]) 

3865 

3866 

3867class CEMP(ColumnCorpus): 

3868 """ 

3869 Original CEMP corpus containing chemical annotations. 

3870 

3871 For further information see: 

3872 https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/cemp-detailed-task-description/ 

3873 """ 

3874 

3875 def __init__( 

3876 self, 

3877 base_path: Union[str, Path] = None, 

3878 in_memory: bool = True, 

3879 sentence_splitter: SentenceSplitter = None, 

3880 ): 

3881 """ 

3882 :param base_path: Path to the corpus on your machine 

3883 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

3884 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

3885 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

3886 """ 

3887 

3888 if type(base_path) == str: 

3889 base_path: Path = Path(base_path) 

3890 

3891 # column format 

3892 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

3893 

3894 # this dataset name 

3895 dataset_name = self.__class__.__name__.lower() 

3896 

3897 # default dataset folder is the cache root 

3898 if not base_path: 

3899 base_path = flair.cache_root / "datasets" 

3900 data_folder = base_path / dataset_name 

3901 

3902 if sentence_splitter is None: 

3903 sentence_splitter = SciSpacySentenceSplitter() 

3904 

3905 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

3906 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

3907 

3908 if not (train_file.exists() and dev_file.exists()): 

3909 train_folder = self.download_train_corpus(data_folder) 

3910 train_text_file = train_folder / "chemdner_patents_train_text.txt" 

3911 train_ann_file = train_folder / "chemdner_cemp_gold_standard_train.tsv" 

3912 train_data = self.parse_input_file(train_text_file, train_ann_file) 

3913 

3914 dev_folder = self.download_dev_corpus(data_folder) 

3915 dev_text_file = dev_folder / "chemdner_patents_development_text.txt" 

3916 dev_ann_file = ( 

3917 dev_folder / "chemdner_cemp_gold_standard_development_v03.tsv" 

3918 ) 

3919 dev_data = self.parse_input_file(dev_text_file, dev_ann_file) 

3920 

3921 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

3922 conll_writer.write_to_conll(train_data, train_file) 

3923 conll_writer.write_to_conll(dev_data, dev_file) 

3924 

3925 super(CEMP, self).__init__( 

3926 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

3927 ) 

3928 

3929 @classmethod 

3930 def download_train_corpus(cls, data_dir: Path) -> Path: 

3931 corpus_dir = data_dir / "original" 

3932 os.makedirs(str(corpus_dir), exist_ok=True) 

3933 

3934 train_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/cemp_training_set.tar.gz" 

3935 data_path = cached_path(train_url, corpus_dir) 

3936 unpack_file(data_path, corpus_dir) 

3937 

3938 return corpus_dir / "cemp_training_set" 

3939 

3940 @classmethod 

3941 def download_dev_corpus(cls, data_dir: Path) -> Path: 

3942 corpus_dir = data_dir / "original" 

3943 os.makedirs(str(corpus_dir), exist_ok=True) 

3944 

3945 dev_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/cemp_development_set_v03.tar.gz" 

3946 data_path = cached_path(dev_url, corpus_dir) 

3947 unpack_file(data_path, corpus_dir) 

3948 

3949 return corpus_dir / "cemp_development_set_v03" 

3950 

3951 @staticmethod 

3952 def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: 

3953 documents = {} 

3954 entities_per_document = {} 

3955 document_abstract_length = {} 

3956 

3957 with open(str(text_file), "r", encoding="utf8") as text_reader: 

3958 for line in text_reader: 

3959 if not line: 

3960 continue 

3961 

3962 document_id, title, abstract = line.split("\t") 

3963 

3964 # Abstract first, title second to prevent issues with sentence splitting 

3965 documents[document_id] = abstract + " " + title 

3966 document_abstract_length[document_id] = len(abstract) + 1 

3967 

3968 entities_per_document[document_id] = [] 

3969 

3970 with open(str(ann_file), "r", encoding="utf8") as ann_reader: 

3971 for line in ann_reader: 

3972 if not line: 

3973 continue 

3974 

3975 columns = line.split("\t") 

3976 document_id = columns[0] 

3977 start, end = int(columns[2]), int(columns[3]) 

3978 

3979 if columns[1] == "T": 

3980 start = start + document_abstract_length[document_id] 

3981 end = end + document_abstract_length[document_id] 

3982 

3983 entities_per_document[document_id].append( 

3984 Entity((start, end), columns[5].strip()) 

3985 ) 

3986 

3987 document_text = documents[document_id] 

3988 assert columns[4] == document_text[start:end] 

3989 

3990 return InternalBioNerDataset( 

3991 documents=documents, entities_per_document=entities_per_document 

3992 ) 

3993 

3994 

3995class HUNER_CHEMICAL_CEMP(HunerDataset): 

3996 """ 

3997 HUNER version of the CEMP corpus containing chemical annotations. 

3998 """ 

3999 

4000 def __init__(self, *args, **kwargs): 

4001 super().__init__(*args, **kwargs) 

4002 

4003 @staticmethod 

4004 def split_url() -> str: 

4005 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cemp" 

4006 

4007 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

4008 train_folder = CEMP.download_train_corpus(data_dir) 

4009 train_text_file = train_folder / "chemdner_patents_train_text.txt" 

4010 train_ann_file = train_folder / "chemdner_cemp_gold_standard_train.tsv" 

4011 train_data = CEMP.parse_input_file(train_text_file, train_ann_file) 

4012 

4013 dev_folder = CEMP.download_dev_corpus(data_dir) 

4014 dev_text_file = dev_folder / "chemdner_patents_development_text.txt" 

4015 dev_ann_file = dev_folder / "chemdner_cemp_gold_standard_development_v03.tsv" 

4016 dev_data = CEMP.parse_input_file(dev_text_file, dev_ann_file) 

4017 

4018 dataset = merge_datasets([train_data, dev_data]) 

4019 entity_type_mapping = { 

4020 x: CHEMICAL_TAG 

4021 for x in [ 

4022 "ABBREVIATION", 

4023 "FAMILY", 

4024 "FORMULA", 

4025 "IDENTIFIERS", 

4026 "MULTIPLE", 

4027 "SYSTEMATIC", 

4028 "TRIVIAL", 

4029 ] 

4030 } 

4031 return filter_and_map_entities(dataset, entity_type_mapping) 

4032 

4033 
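# Hedged sketch (added for illustration, not part of the original corpus code): the
# mapping above collapses all listed CEMP chemical subtypes onto the single
# CHEMICAL_TAG label before filter_and_map_entities builds the HUNER view of the
# corpus. A minimal stand-alone illustration of that mapping idea, independent of
# the real helper, with made-up spans:
def _cemp_mapping_example():  # pragma: no cover - illustration only
    mapping = {"TRIVIAL": CHEMICAL_TAG, "FORMULA": CHEMICAL_TAG}
    raw = [Entity((0, 7), "TRIVIAL"), Entity((10, 15), "UNLISTED_TYPE")]
    mapped = [
        Entity((e.char_span.start, e.char_span.stop), mapping[e.type])
        for e in raw
        if e.type in mapping
    ]
    assert [e.type for e in mapped] == [CHEMICAL_TAG]
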

4034class CHEBI(ColumnCorpus): 

4035 """ 

4036 Original CHEBI corpus containing all annotations. 

4037 

4038 For further information see Shardlow et al.: 

4039 A New Corpus to Support Text Mining for the Curation of Metabolites in the ChEBI Database 

4040 http://www.lrec-conf.org/proceedings/lrec2018/pdf/229.pdf 

4041 """ 

4042 

4043 def __init__( 

4044 self, 

4045 base_path: Union[str, Path] = None, 

4046 in_memory: bool = True, 

4047 sentence_splitter: SentenceSplitter = None, 

4048 annotator: int = 0, 

4049 ): 

4050 """ 

4051 :param base_path: Path to the corpus on your machine 

4052 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

4053 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

4054 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

4055 :param annotator: The abstracts have been annotated by two annotators, which can be 

4056 selected by choosing annotator 1 or 2. If annotator is 0, the union of both annotations is used. 

4057 """ 

4058 if type(base_path) == str: 

4059 base_path: Path = Path(base_path) 

4060 

4061 # column format 

4062 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

4063 

4064 # this dataset name 

4065 dataset_name = self.__class__.__name__.lower() 

4066 

4067 # default dataset folder is the cache root 

4068 if not base_path: 

4069 base_path = flair.cache_root / "datasets" 

4070 data_folder = base_path / dataset_name 

4071 

4072 if sentence_splitter is None: 

4073 sentence_splitter = SciSpacySentenceSplitter() 

4074 

4075 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

4076 

4077 if not (train_file.exists()): 

4078 corpus_dir = self.download_dataset(data_folder) 

4079 full_dataset = self.parse_dataset(corpus_dir, annotator=annotator) 

4080 

4081 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

4082 conll_writer.write_to_conll(full_dataset, train_file) 

4083 

4084 super(CHEBI, self).__init__( 

4085 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4086 ) 

4087 

4088 @staticmethod 

4089 def download_dataset(data_dir: Path) -> Path: 

4090 data_url = "http://www.nactem.ac.uk/chebi/ChEBI.zip" 

4091 data_path = cached_path(data_url, data_dir) 

4092 unpack_file(data_path, data_dir) 

4093 

4094 return data_dir / "ChEBI" 

4095 

4096 @staticmethod 

4097 def parse_dataset(data_dir: Path, annotator: int) -> InternalBioNerDataset: 

4098 abstract_folder = data_dir / "abstracts" 

4099 fulltext_folder = data_dir / "fullpapers" 

4100 

4101 if annotator == 0: 

4102 annotation_dirs = ["Annotator1", "Annotator2"] 

4103 elif annotator <= 2: 

4104 annotation_dirs = [f"Annotator{annotator}"] 

4105 else: 

4106 raise ValueError("Invalid value for annotator") 

4107 

4108 documents = {} 

4109 entities_per_document = {} 

4110 

4111 abstract_ids = [ 

4112 x.name[:-4] 

4113 for x in (abstract_folder / annotation_dirs[0]).iterdir() 

4114 if x.name[-4:] == ".txt" 

4115 ] 

4116 fulltext_ids = [ 

4117 x.name[:-4] for x in fulltext_folder.iterdir() if x.name[-4:] == ".txt" 

4118 ] 

4119 

4120 for abstract_id in abstract_ids: 

4121 abstract_id_output = abstract_id + "_A" 

4122 with open( 

4123 abstract_folder / annotation_dirs[0] / f"{abstract_id}.txt", "r", encoding="utf8" 

4124 ) as f: 

4125 documents[abstract_id_output] = f.read() 

4126 

4127 for annotation_dir in annotation_dirs: 

4128 with open( 

4129 abstract_folder / annotation_dir / f"{abstract_id}.ann", "r", encoding="utf8" 

4130 ) as f: 

4131 entities = CHEBI.get_entities(f) 

4132 entities_per_document.setdefault(abstract_id_output, []).extend(entities)  # accumulate so annotator == 0 yields the union 

4133 

4134 for fulltext_id in fulltext_ids: 

4135 fulltext_id_output = fulltext_id + "_F" 

4136 with open(fulltext_folder / f"{fulltext_id}.txt", "r", encoding="utf8") as f: 

4137 documents[fulltext_id_output] = f.read() 

4138 

4139 with open(fulltext_folder / f"{fulltext_id}.ann", "r", encoding="utf8") as f: 

4140 entities = CHEBI.get_entities(f) 

4141 entities_per_document[fulltext_id_output] = entities 

4142 

4143 return InternalBioNerDataset( 

4144 documents=documents, entities_per_document=entities_per_document 

4145 ) 

4146 

4147 @staticmethod 

4148 def get_entities(f): 

4149 entities = [] 

4150 for line in f: 

4151 if not line.strip() or line[0] != "T": 

4152 continue 

4153 parts = line.split("\t")[1].split() 

4154 entity_type = parts[0] 

4155 char_offsets = " ".join(parts[1:]) 

4156 for start_end in char_offsets.split(";"): 

4157 start, end = start_end.split(" ") 

4158 entities += [Entity((int(start), int(end)), entity_type)] 

4159 

4160 return entities 

4161 

4162 
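# Hedged worked example (added for illustration, not part of the original corpus code):
# CHEBI.get_entities above reads brat-style "T" lines whose character offsets may
# contain several ";"-separated fragments; each fragment becomes its own Entity.
# The annotation line below is made up.
def _chebi_ann_example():  # pragma: no cover - illustration only
    import io

    ann = io.StringIO("T1\tChemical 0 7;12 19\tglucose phosphate\n")
    entities = CHEBI.get_entities(ann)
    assert [(e.char_span.start, e.char_span.stop) for e in entities] == [(0, 7), (12, 19)]
    assert all(e.type == "Chemical" for e in entities)
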

4163class HUNER_CHEMICAL_CHEBI(HunerDataset): 

4164 """ 

4165 HUNER version of the CHEBI corpus containing chemical annotations. 

4166 """ 

4167 

4168 def __init__(self, *args, **kwargs): 

4169 super().__init__(*args, **kwargs) 

4170 

4171 @staticmethod 

4172 def split_url() -> str: 

4173 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" 

4174 

4175 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: 

4176 corpus_dir = CHEBI.download_dataset(data_dir) 

4177 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) 

4178 entity_type_mapping = {"Chemical": CHEMICAL_TAG} 

4179 return filter_and_map_entities(dataset, entity_type_mapping) 

4180 

4181 

4182class HUNER_GENE_CHEBI(HunerDataset): 

4183 """ 

4184 HUNER version of the CHEBI corpus containing gene annotations. 

4185 """ 

4186 

4187 def __init__(self, *args, **kwargs): 

4188 super().__init__(*args, **kwargs) 

4189 

4190 @staticmethod 

4191 def split_url() -> str: 

4192 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" 

4193 

4194 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: 

4195 corpus_dir = CHEBI.download_dataset(data_dir) 

4196 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) 

4197 entity_type_mapping = {"Protein": GENE_TAG} 

4198 return filter_and_map_entities(dataset, entity_type_mapping) 

4199 

4200 

4201class HUNER_SPECIES_CHEBI(HunerDataset): 

4202 """ 

4203 HUNER version of the CHEBI corpus containing species annotations. 

4204 """ 

4205 

4206 def __init__(self, *args, **kwargs): 

4207 super().__init__(*args, **kwargs) 

4208 

4209 @staticmethod 

4210 def split_url() -> str: 

4211 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new" 

4212 

4213 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset: 

4214 corpus_dir = CHEBI.download_dataset(data_dir) 

4215 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator) 

4216 entity_type_mapping = {"Species": SPECIES_TAG} 

4217 return filter_and_map_entities(dataset, entity_type_mapping) 

4218 

4219 

4220class BioNLPCorpus(ColumnCorpus): 

4221 """ 

4222 Base class for corpora from BioNLP event extraction shared tasks 

4223 

4224 For further information see: 

4225 http://2013.bionlp-st.org/Intro 

4226 """ 

4227 

4228 def __init__( 

4229 self, 

4230 base_path: Union[str, Path] = None, 

4231 in_memory: bool = True, 

4232 sentence_splitter: SentenceSplitter = None, 

4233 ): 

4234 """ 

4235 :param base_path: Path to the corpus on your machine 

4236 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

4237 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents 

4238 into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

4239 """ 

4240 

4241 if type(base_path) == str: 

4242 base_path: Path = Path(base_path) 

4243 

4244 # column format 

4245 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

4246 

4247 # this dataset name 

4248 dataset_name = self.__class__.__name__.lower() 

4249 

4250 # default dataset folder is the cache root 

4251 if not base_path: 

4252 base_path = flair.cache_root / "datasets" 

4253 data_folder = base_path / dataset_name 

4254 

4255 if sentence_splitter is None: 

4256 sentence_splitter = SciSpacySentenceSplitter() 

4257 

4258 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

4259 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

4260 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

4261 

4262 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4263 train_folder, dev_folder, test_folder = self.download_corpus( 

4264 data_folder / "original" 

4265 ) 

4266 

4267 train_data = self.parse_input_files(train_folder) 

4268 dev_data = self.parse_input_files(dev_folder) 

4269 test_data = self.parse_input_files(test_folder) 

4270 

4271 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

4272 conll_writer.write_to_conll(train_data, train_file) 

4273 conll_writer.write_to_conll(dev_data, dev_file) 

4274 conll_writer.write_to_conll(test_data, test_file) 

4275 

4276 super(BioNLPCorpus, self).__init__( 

4277 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4278 ) 

4279 

4280 @staticmethod 

4281 @abstractmethod 

4282 def download_corpus(data_folder: Path) -> Tuple[Path, Path, Path]: 

4283 pass 

4284 

4285 @staticmethod 

4286 def parse_input_files(input_folder: Path) -> InternalBioNerDataset: 

4287 documents = {} 

4288 entities_per_document = {} 

4289 

4290 for txt_file in input_folder.glob("*.txt"): 

4291 name = txt_file.with_suffix("").name 

4292 a1_file = txt_file.with_suffix(".a1") 

4293 

4294 with txt_file.open(encoding="utf8") as f: 

4295 documents[name] = f.read() 

4296 

4297 with a1_file.open(encoding="utf8") as ann_reader: 

4298 entities = [] 

4299 

4300 for line in ann_reader: 

4301 fields = line.strip().split("\t") 

4302 if fields[0].startswith("T"): 

4303 ann_type, start, end = fields[1].split() 

4304 entities.append( 

4305 Entity( 

4306 char_span=(int(start), int(end)), entity_type=ann_type 

4307 ) 

4308 ) 

4309 entities_per_document[name] = entities 

4310 

4311 return InternalBioNerDataset( 

4312 documents=documents, entities_per_document=entities_per_document 

4313 ) 

4314 

4315 
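# Hedged sketch (added for illustration, not part of the original corpus code): the
# subclassing contract of BioNLPCorpus. A concrete shared-task corpus only implements
# download_corpus and returns the train, dev and test folders holding the .txt/.a1
# files; parsing and CoNLL conversion are inherited. The URL and folder names below
# are hypothetical.
class _ExampleBioNLPTask(BioNLPCorpus):  # pragma: no cover - illustration only
    @staticmethod
    def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
        url = "http://example.org/BioNLP-ST_2013_XX_data.tar.gz"  # hypothetical
        unpack_file(cached_path(url, download_folder), download_folder, keep=False)
        base = download_folder / "BioNLP-ST_2013_XX_data"
        return base / "train", base / "devel", base / "test"
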

4316class BIONLP2013_PC(BioNLPCorpus): 

4317 """ 

4318 Corpus of the BioNLP'2013 Pathway Curation shared task 

4319 

4320 For further information see Ohta et al. 

4321 Overview of the pathway curation (PC) task of bioNLP shared task 2013. 

4322 https://www.aclweb.org/anthology/W13-2009/ 

4323 """ 

4324 

4325 @staticmethod 

4326 def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]: 

4327 train_url = ( 

4328 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_training_data.tar.gz" 

4329 ) 

4330 dev_url = ( 

4331 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_development_data.tar.gz" 

4332 ) 

4333 test_url = "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_test_data.tar.gz" 

4334 

4335 cached_path(train_url, download_folder) 

4336 cached_path(dev_url, download_folder) 

4337 cached_path(test_url, download_folder) 

4338 

4339 unpack_file( 

4340 download_folder / "BioNLP-ST_2013_PC_training_data.tar.gz", 

4341 download_folder, 

4342 keep=False, 

4343 ) 

4344 unpack_file( 

4345 download_folder / "BioNLP-ST_2013_PC_development_data.tar.gz", 

4346 download_folder, 

4347 keep=False, 

4348 ) 

4349 unpack_file( 

4350 download_folder / "BioNLP-ST_2013_PC_test_data.tar.gz", 

4351 download_folder, 

4352 keep=False, 

4353 ) 

4354 

4355 train_folder = download_folder / "BioNLP-ST_2013_PC_training_data" 

4356 dev_folder = download_folder / "BioNLP-ST_2013_PC_development_data" 

4357 test_folder = download_folder / "BioNLP-ST_2013_PC_test_data" 

4358 

4359 return train_folder, dev_folder, test_folder 

4360 

4361 

4362class BIONLP2013_CG(BioNLPCorpus): 

4363 """ 

4364 Corpus of the BioNLP'2013 Cancer Genetics shared task 

4365 

4366 For further information see Pyysalo, Ohta & Ananiadou 2013 

4367 Overview of the Cancer Genetics (CG) task of BioNLP Shared Task 2013 

4368 https://www.aclweb.org/anthology/W13-2008/ 

4369 """ 

4370 

4371 @staticmethod 

4372 def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]: 

4373 train_url = ( 

4374 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_training_data.tar.gz" 

4375 ) 

4376 dev_url = ( 

4377 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_development_data.tar.gz" 

4378 ) 

4379 test_url = "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_test_data.tar.gz" 

4380 

4381 download_folder = download_folder / "original" 

4382 

4383 cached_path(train_url, download_folder) 

4384 cached_path(dev_url, download_folder) 

4385 cached_path(test_url, download_folder) 

4386 

4387 unpack_file( 

4388 download_folder / "BioNLP-ST_2013_CG_training_data.tar.gz", 

4389 download_folder, 

4390 keep=False, 

4391 ) 

4392 unpack_file( 

4393 download_folder / "BioNLP-ST_2013_CG_development_data.tar.gz", 

4394 download_folder, 

4395 keep=False, 

4396 ) 

4397 unpack_file( 

4398 download_folder / "BioNLP-ST_2013_CG_test_data.tar.gz", 

4399 download_folder, 

4400 keep=False, 

4401 ) 

4402 

4403 train_folder = download_folder / "BioNLP-ST_2013_CG_training_data" 

4404 dev_folder = download_folder / "BioNLP-ST_2013_CG_development_data" 

4405 test_folder = download_folder / "BioNLP-ST_2013_CG_test_data" 

4406 

4407 return train_folder, dev_folder, test_folder 

4408 

4409 

4410class ANAT_EM(ColumnCorpus): 

4411 """ 

4412 Corpus for anatomical named entity mention recognition. 

4413 

4414 For further information see Pyysalo and Ananiadou: 

4415 Anatomical entity mention recognition at literature scale 

4416 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957068/ 

4417 http://nactem.ac.uk/anatomytagger/#AnatEM 

4418 """ 

4419 

4420 def __init__( 

4421 self, 

4422 base_path: Union[str, Path] = None, 

4423 in_memory: bool = True, 

4424 tokenizer: Tokenizer = None, 

4425 ): 

4426 """ 

4427 :param base_path: Path to the corpus on your machine 

4428 :param in_memory: If True, keeps dataset in memory giving speedups in training. 

4429 :param tokenizer: Implementation of :class:`Tokenizer` which segments 

4430 sentences into tokens (default :class:`SciSpacyTokenizer`) 

4431 """ 

4432 if type(base_path) == str: 

4433 base_path: Path = Path(base_path) 

4434 

4435 # column format 

4436 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

4437 

4438 # this dataset name 

4439 dataset_name = self.__class__.__name__.lower() 

4440 

4441 # default dataset folder is the cache root 

4442 if not base_path: 

4443 base_path = flair.cache_root / "datasets" 

4444 data_folder = base_path / dataset_name 

4445 

4446 if tokenizer is None: 

4447 tokenizer = SciSpacyTokenizer() 

4448 

4449 sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer) 

4450 

4451 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

4452 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

4453 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

4454 

4455 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4456 corpus_folder = self.download_corpus(data_folder) 

4457 

4458 train_data = self.parse_input_files( 

4459 corpus_folder / "nersuite" / "train", SENTENCE_TAG 

4460 ) 

4461 dev_data = self.parse_input_files( 

4462 corpus_folder / "nersuite" / "devel", SENTENCE_TAG 

4463 ) 

4464 test_data = self.parse_input_files( 

4465 corpus_folder / "nersuite" / "test", SENTENCE_TAG 

4466 ) 

4467 

4468 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

4469 conll_writer.write_to_conll(train_data, train_file) 

4470 conll_writer.write_to_conll(dev_data, dev_file) 

4471 conll_writer.write_to_conll(test_data, test_file) 

4472 

4473 super(ANAT_EM, self).__init__( 

4474 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4475 ) 

4476 

4477 @staticmethod 


4479 def download_corpus(data_folder: Path): 

4480 corpus_url = "http://nactem.ac.uk/anatomytagger/AnatEM-1.0.2.tar.gz" 

4481 corpus_archive = cached_path(corpus_url, data_folder) 

4482 

4483 unpack_file( 

4484 corpus_archive, data_folder, keep=True, mode="targz", 

4485 ) 

4486 

4487 return data_folder / "AnatEM-1.0.2" 

4488 

4489 @staticmethod 

4490 def parse_input_files( 

4491 input_dir: Path, sentence_separator: str 

4492 ) -> InternalBioNerDataset: 

4493 documents = {} 

4494 entities_per_document = {} 

4495 

4496 input_files = [ 

4497 file 

4498 for file in os.listdir(str(input_dir)) 

4499 if file.endswith(".nersuite") and not file.startswith("._") 

4500 ] 

4501 

4502 for input_file in input_files: 

4503 document_id = input_file.replace(".nersuite", "") 

4504 document_text = "" 

4505 

4506 entities = [] 

4507 entity_type = None 

4508 entity_start = None 

4509 

4510 sent_offset = 0 

4511 last_offset = 0 

4512 

4513 input_file = open(str(input_dir / input_file), "r", encoding="utf8") 

4514 for line in input_file.readlines(): 

4515 line = line.strip() 

4516 if line: 

4517 tag, start, end, word, _, _, _ = line.split("\t") 

4518 

4519 start = int(start) + sent_offset 

4520 end = int(end) + sent_offset 

4521 

4522 document_text += " " * (start - last_offset) 

4523 document_text += word 

4524 

4525 if tag.startswith("B-"): 

4526 if entity_type is not None: 

4527 entities.append( 

4528 Entity((entity_start, last_offset), entity_type) 

4529 ) 

4530 

4531 entity_start = start 

4532 entity_type = tag[2:] 

4533 

4534 elif tag == "O" and entity_type is not None: 

4535 entities.append( 

4536 Entity((entity_start, last_offset), entity_type) 

4537 ) 

4538 entity_type = None 

4539 

4540 last_offset = end 

4541 

4542 assert word == document_text[start:end] 

4543 

4544 else: 

4545 document_text += sentence_separator 

4546 sent_offset += len(sentence_separator) 

4547 last_offset += len(sentence_separator) 

4548 

4549 documents[document_id] = document_text 

4550 entities_per_document[document_id] = entities 

4551 

4552 return InternalBioNerDataset( 

4553 documents=documents, entities_per_document=entities_per_document 

4554 ) 

4555 

4556 
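# Hedged worked example (added for illustration, not part of the original corpus code):
# the core of ANAT_EM.parse_input_files above: BIO tags from the .nersuite files are
# turned into character spans, closing an open entity when an "O" tag follows it.
# (The real parser also handles back-to-back "B-" tags and sentence separators.)
# Token offsets below are made up.
def _anatem_bio_example():  # pragma: no cover - illustration only
    tokens = [("B-Anatomy", 0, 4, "left"), ("I-Anatomy", 5, 11, "kidney"), ("O", 12, 18, "tissue")]
    text = "left kidney tissue"
    entities, entity_start, entity_type, last_offset = [], None, None, 0
    for tag, start, end, word in tokens:
        if tag.startswith("B-"):
            entity_start, entity_type = start, tag[2:]
        elif tag == "O" and entity_type is not None:
            entities.append(Entity((entity_start, last_offset), entity_type))
            entity_type = None
        last_offset = end
    assert text[entities[0].char_span.start : entities[0].char_span.stop] == "left kidney"
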

4557class BioBertHelper(ColumnCorpus): 

4558 """ 

4559 Helper class to convert corpora and the respective train, dev and test split 

4560 used by BioBERT. 

4561 

4562 For further details see Lee et al.: 

4563 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4564 https://github.com/dmis-lab/biobert 

4565 """ 

4566 

4567 @staticmethod 

4568 def download_corpora(download_dir: Path): 

4569 from google_drive_downloader import GoogleDriveDownloader as gdd 

4570 

4571 gdd.download_file_from_google_drive( 

4572 file_id="1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh", 

4573 dest_path=str(download_dir / "NERdata.zip"), 

4574 unzip=True, 

4575 ) 

4576 

4577 @staticmethod 

4578 def convert_and_write(download_folder, data_folder, tag_type): 

4579 data_folder.mkdir(parents=True, exist_ok=True) 

4580 with (download_folder / "train.tsv").open(encoding="utf8") as f_in, ( 

4581 data_folder / "train.conll" 

4582 ).open("w", encoding="utf8") as f_out: 

4583 for line in f_in: 

4584 if not line.strip(): 

4585 f_out.write("\n") 

4586 continue 

4587 

4588 token, tag = line.strip().split("\t") 

4589 if tag != "O": 

4590 tag = tag + "-" + tag_type 

4591 f_out.write(f"{token} {tag}\n") 

4592 

4593 with (download_folder / "devel.tsv").open(encoding="utf8") as f_in, ( 

4594 data_folder / "dev.conll" 

4595 ).open("w", encoding="utf8") as f_out: 

4596 for line in f_in: 

4597 if not line.strip(): 

4598 f_out.write("\n") 

4599 continue 

4600 token, tag = line.strip().split("\t") 

4601 if tag != "O": 

4602 tag = tag + "-" + tag_type 

4603 f_out.write(f"{token} {tag}\n") 

4604 

4605 with (download_folder / "test.tsv").open(encoding="utf8") as f_in, ( 

4606 data_folder / "test.conll" 

4607 ).open("w", encoding="utf8") as f_out: 

4608 for line in f_in: 

4609 if not line.strip(): 

4610 f_out.write("\n") 

4611 continue 

4612 token, tag = line.strip().split("\t") 

4613 if tag != "O": 

4614 tag = tag + "-" + tag_type 

4615 f_out.write(f"{token} {tag}\n") 

4616 

4617 
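# Hedged worked example (added for illustration, not part of the original corpus code):
# BioBertHelper.convert_and_write above turns BioBERT's two-column TSV
# ("token<TAB>B/I/O") into flair's CoNLL format by suffixing every non-"O" tag with
# the entity type. The tokens below are made up.
def _biobert_conversion_example():  # pragma: no cover - illustration only
    rows = [("Selegiline", "B"), ("-", "I"), ("induced", "O")]
    converted = []
    for token, tag in rows:
        if tag != "O":
            tag = tag + "-" + CHEMICAL_TAG
        converted.append(f"{token} {tag}")
    assert converted == ["Selegiline B-Chemical", "- I-Chemical", "induced O"]
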

4618class BIOBERT_CHEMICAL_BC4CHEMD(ColumnCorpus): 

4619 """ 

4620 BC4CHEMD corpus with chemical annotations as used in the evaluation 

4621 of BioBERT. 

4622 

4623 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4624 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4625 https://github.com/dmis-lab/biobert 

4626 """ 

4627 

4628 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4629 columns = {0: "text", 1: "ner"} 

4630 # this dataset name 

4631 dataset_name = self.__class__.__name__.lower() 

4632 

4633 # default dataset folder is the cache root 

4634 if not base_path: 

4635 base_path = flair.cache_root / "datasets" 

4636 

4637 data_folder = base_path / dataset_name 

4638 

4639 train_file = data_folder / "train.conll" 

4640 dev_file = data_folder / "dev.conll" 

4641 test_file = data_folder / "test.conll" 

4642 

4643 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4644 common_path = base_path / "biobert_common" 

4645 if not (common_path / "BC4CHEMD").exists(): 

4646 BioBertHelper.download_corpora(common_path) 

4647 

4648 BioBertHelper.convert_and_write( 

4649 common_path / "BC4CHEMD", data_folder, tag_type=CHEMICAL_TAG 

4650 ) 

4651 super(BIOBERT_CHEMICAL_BC4CHEMD, self).__init__( 

4652 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4653 ) 

4654 

4655 
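# Hedged usage sketch (added for illustration, not part of the original corpus code):
# the BIOBERT_* corpora share the same pre-tokenized CoNLL layout, so they can be
# loaded directly or combined into a MultiCorpus. Assumes network access for the
# first download of the shared BioBERT NER data; the function name is illustrative.
def _example_load_biobert_corpora():  # pragma: no cover - illustration only
    corpus = BIOBERT_CHEMICAL_BC4CHEMD()
    print(corpus)  # train/dev/test sizes

    multi = MultiCorpus([BIOBERT_CHEMICAL_BC4CHEMD(), BIOBERT_CHEMICAL_BC5CDR()])
    print(multi)
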

4656class BIOBERT_GENE_BC2GM(ColumnCorpus): 

4657 """ 

4658 BC2GM corpus with gene annotations as used in the evaluation 

4659 of BioBERT. 

4660 

4661 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4662 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4663 https://github.com/dmis-lab/biobert 

4664 """ 

4665 

4666 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4667 columns = {0: "text", 1: "ner"} 

4668 # this dataset name 

4669 dataset_name = self.__class__.__name__.lower() 

4670 

4671 # default dataset folder is the cache root 

4672 if not base_path: 

4673 base_path = flair.cache_root / "datasets" 

4674 

4675 data_folder = base_path / dataset_name 

4676 

4677 train_file = data_folder / "train.conll" 

4678 dev_file = data_folder / "dev.conll" 

4679 test_file = data_folder / "test.conll" 

4680 

4681 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4682 common_path = base_path / "biobert_common" 

4683 if not (common_path / "BC2GM").exists(): 

4684 BioBertHelper.download_corpora(common_path) 

4685 BioBertHelper.convert_and_write( 

4686 common_path / "BC2GM", data_folder, tag_type=GENE_TAG 

4687 ) 

4688 super(BIOBERT_GENE_BC2GM, self).__init__( 

4689 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4690 ) 

4691 

4692 

4693class BIOBERT_GENE_JNLPBA(ColumnCorpus): 

4694 """ 

4695 JNLPBA corpus with gene annotations as used in the evaluation 

4696 of BioBERT. 

4697 

4698 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4699 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4700 https://github.com/dmis-lab/biobert 

4701 """ 

4702 

4703 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4704 columns = {0: "text", 1: "ner"} 

4705 # this dataset name 

4706 dataset_name = self.__class__.__name__.lower() 

4707 

4708 # default dataset folder is the cache root 

4709 if not base_path: 

4710 base_path = flair.cache_root / "datasets" 

4711 

4712 data_folder = base_path / dataset_name 

4713 

4714 train_file = data_folder / "train.conll" 

4715 dev_file = data_folder / "dev.conll" 

4716 test_file = data_folder / "test.conll" 

4717 

4718 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4719 common_path = base_path / "biobert_common" 

4720 if not (common_path / "JNLPBA").exists(): 

4721 BioBertHelper.download_corpora(common_path) 

4722 BioBertHelper.convert_and_write( 

4723 common_path / "JNLPBA", data_folder, tag_type=GENE_TAG 

4724 ) 

4725 super(BIOBERT_GENE_JNLPBA, self).__init__( 

4726 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4727 ) 

4728 

4729 

4730class BIOBERT_CHEMICAL_BC5CDR(ColumnCorpus): 

4731 """ 

4732 BC5CDR corpus with chemical annotations as used in the evaluation 

4733 of BioBERT. 

4734 

4735 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4736 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4737 https://github.com/dmis-lab/biobert 

4738 """ 

4739 

4740 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4741 columns = {0: "text", 1: "ner"} 

4742 # this dataset name 

4743 dataset_name = self.__class__.__name__.lower() 

4744 

4745 # default dataset folder is the cache root 

4746 if not base_path: 

4747 base_path = flair.cache_root / "datasets" 

4748 

4749 data_folder = base_path / dataset_name 

4750 

4751 train_file = data_folder / "train.conll" 

4752 dev_file = data_folder / "dev.conll" 

4753 test_file = data_folder / "test.conll" 

4754 

4755 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4756 common_path = base_path / "biobert_common" 

4757 if not (common_path / "BC5CDR-chem").exists(): 

4758 BioBertHelper.download_corpora(common_path) 

4759 BioBertHelper.convert_and_write( 

4760 common_path / "BC5CDR-chem", data_folder, tag_type=CHEMICAL_TAG 

4761 ) 

4762 super(BIOBERT_CHEMICAL_BC5CDR, self).__init__( 

4763 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4764 ) 

4765 

4766 

4767class BIOBERT_DISEASE_BC5CDR(ColumnCorpus): 

4768 """ 

4769 BC5CDR corpus with disease annotations as used in the evaluation 

4770 of BioBERT. 

4771 

4772 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4773 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4774 https://github.com/dmis-lab/biobert 

4775 """ 

4776 

4777 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4778 columns = {0: "text", 1: "ner"} 

4779 # this dataset name 

4780 dataset_name = self.__class__.__name__.lower() 

4781 

4782 # default dataset folder is the cache root 

4783 if not base_path: 

4784 base_path = flair.cache_root / "datasets" 

4785 

4786 data_folder = base_path / dataset_name 

4787 

4788 train_file = data_folder / "train.conll" 

4789 dev_file = data_folder / "dev.conll" 

4790 test_file = data_folder / "test.conll" 

4791 

4792 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4793 common_path = base_path / "biobert_common" 

4794 if not (common_path / "BC5CDR-disease").exists(): 

4795 BioBertHelper.download_corpora(common_path) 

4796 BioBertHelper.convert_and_write( 

4797 common_path / "BC5CDR-disease", data_folder, tag_type=DISEASE_TAG 

4798 ) 

4799 super(BIOBERT_DISEASE_BC5CDR, self).__init__( 

4800 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4801 ) 

4802 

4803 

4804class BIOBERT_DISEASE_NCBI(ColumnCorpus): 

4805 """ 

4806 NCBI disease corpus as used in the evaluation of BioBERT. 

4807 

4808 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4809 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4810 https://github.com/dmis-lab/biobert 

4811 """ 

4812 

4813 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4814 columns = {0: "text", 1: "ner"} 

4815 # this dataset name 

4816 dataset_name = self.__class__.__name__.lower() 

4817 

4818 # default dataset folder is the cache root 

4819 if not base_path: 

4820 base_path = flair.cache_root / "datasets" 

4821 

4822 data_folder = base_path / dataset_name 

4823 

4824 train_file = data_folder / "train.conll" 

4825 dev_file = data_folder / "dev.conll" 

4826 test_file = data_folder / "test.conll" 

4827 

4828 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4829 common_path = base_path / "biobert_common" 

4830 if not (common_path / "NCBI-disease").exists(): 

4831 BioBertHelper.download_corpora(common_path) 

4832 BioBertHelper.convert_and_write( 

4833 common_path / "NCBI-disease", data_folder, tag_type=DISEASE_TAG 

4834 ) 

4835 super(BIOBERT_DISEASE_NCBI, self).__init__( 

4836 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4837 ) 

4838 

4839 

4840class BIOBERT_SPECIES_LINNAEUS(ColumnCorpus): 

4841 """ 

4842 Linnaeus corpus with species annotations as used in the evaluation 

4843 of BioBERT. 

4844 

4845 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4846 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4847 https://github.com/dmis-lab/biobert 

4848 """ 

4849 

4850 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4851 columns = {0: "text", 1: "ner"} 

4852 # this dataset name 

4853 dataset_name = self.__class__.__name__.lower() 

4854 

4855 # default dataset folder is the cache root 

4856 if not base_path: 

4857 base_path = flair.cache_root / "datasets" 

4858 

4859 data_folder = base_path / dataset_name 

4860 

4861 train_file = data_folder / "train.conll" 

4862 dev_file = data_folder / "dev.conll" 

4863 test_file = data_folder / "test.conll" 

4864 

4865 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4866 common_path = base_path / "biobert_common" 

4867 if not (common_path / "linnaeus").exists(): 

4868 BioBertHelper.download_corpora(common_path) 

4869 BioBertHelper.convert_and_write( 

4870 common_path / "linnaeus", data_folder, tag_type=SPECIES_TAG 

4871 ) 

4872 super(BIOBERT_SPECIES_LINNAEUS, self).__init__( 

4873 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4874 ) 

4875 

4876 

4877class BIOBERT_SPECIES_S800(ColumnCorpus): 

4878 """ 

4879 S800 corpus with species annotations as used in the evaluation 

4880 of BioBERT. 

4881 

4882 For further details regarding BioBERT and its evaluation, see Lee et al.: 

4883 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 

4884 https://github.com/dmis-lab/biobert 

4885 """ 

4886 

4887 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): 

4888 columns = {0: "text", 1: "ner"} 

4889 # this dataset name 

4890 dataset_name = self.__class__.__name__.lower() 

4891 

4892 # default dataset folder is the cache root 

4893 if not base_path: 

4894 base_path = flair.cache_root / "datasets" 

4895 

4896 data_folder = base_path / dataset_name 

4897 

4898 train_file = data_folder / "train.conll" 

4899 dev_file = data_folder / "dev.conll" 

4900 test_file = data_folder / "test.conll" 

4901 

4902 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4903 common_path = base_path / "biobert_common" 

4904 if not (common_path / "s800").exists(): 

4905 BioBertHelper.download_corpora(common_path) 

4906 BioBertHelper.convert_and_write( 

4907 common_path / "s800", data_folder, tag_type=SPECIES_TAG 

4908 ) 

4909 super(BIOBERT_SPECIES_S800, self).__init__( 

4910 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4911 ) 

4912 

4913 

4914class CRAFT_V4(ColumnCorpus): 

4915 """ 

4916 Version 4.0.1 of the CRAFT corpus containing all but the co-reference and structural annotations. 

4917 

4918 For further information see: 

4919 https://github.com/UCDenver-ccp/CRAFT 

4920 """ 

4921 

4922 def __init__( 

4923 self, 

4924 base_path: Union[str, Path] = None, 

4925 in_memory: bool = True, 

4926 sentence_splitter: SentenceSplitter = None, 

4927 ): 

4928 """ 

4929 :param base_path: Path to the corpus on your machine 

4930 :param in_memory: If True, keeps the dataset in memory, giving speedups in training. 

4931 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments 

4932 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

4933 """ 

4934 

4935 if isinstance(base_path, str): 

4936 base_path: Path = Path(base_path) 

4937 

4938 # column format 

4939 columns = {0: "text", 1: "ner"} 

4940 

4941 # this dataset name 

4942 dataset_name = self.__class__.__name__.lower() 

4943 

4944 # default dataset folder is the cache root 

4945 if not base_path: 

4946 base_path = flair.cache_root / "datasets" 

4947 data_folder = base_path / dataset_name 

4948 

4949 if sentence_splitter is None: 

4950 sentence_splitter = SciSpacySentenceSplitter() 

4951 

4952 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

4953 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll" 

4954 test_file = data_folder / f"{sentence_splitter.name}_test.conll" 

4955 

4956 if not (train_file.exists() and dev_file.exists() and test_file.exists()): 

4957 corpus_dir = self.download_corpus(data_folder) 

4958 corpus_data = self.parse_corpus(corpus_dir) 

4959 

4960 # Filter for specific entity types; by default, no entities are filtered 

4961 corpus_data = self.filter_entities(corpus_data) 

4962 

4963 train_data, dev_data, test_data = self.prepare_splits( 

4964 data_folder, corpus_data 

4965 ) 

4966 

4967 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

4968 conll_writer.write_to_conll(train_data, train_file) 

4969 conll_writer.write_to_conll(dev_data, dev_file) 

4970 conll_writer.write_to_conll(test_data, test_file) 

4971 

4972 super(CRAFT_V4, self).__init__( 

4973 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

4974 ) 

4975 

4976 def filter_entities(self, corpus: InternalBioNerDataset) -> InternalBioNerDataset: 

4977 return corpus 

4978 

4979 @classmethod 

4980 def download_corpus(cls, data_dir: Path) -> Path: 

4981 url = "https://github.com/UCDenver-ccp/CRAFT/archive/v4.0.1.tar.gz" 

4982 data_path = cached_path(url, data_dir) 

4983 unpack_file(data_path, data_dir, mode="targz") 

4984 

4985 return data_dir / "CRAFT-4.0.1" 

4986 

4987 @staticmethod 

4988 def prepare_splits( 

4989 data_dir: Path, corpus: InternalBioNerDataset 

4990 ) -> Tuple[InternalBioNerDataset, InternalBioNerDataset, InternalBioNerDataset]: 

4991 splits_dir = data_dir / "splits" 

4992 os.makedirs(str(splits_dir), exist_ok=True) 

4993 

4994 # Get original HUNER splits to retrieve a list of all document ids contained in V2 

4995 split_urls = [ 

4996 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.train", 

4997 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.dev", 

4998 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.test", 

4999 ] 

5000 

5001 splits = {} 

5002 for url in split_urls: 

5003 split_file = cached_path(url, splits_dir) 

5004 with open(str(split_file), "r", encoding="utf8") as split_reader: 

5005 splits[url.split(".")[-1]] = [ 

5006 line.strip() for line in split_reader if line.strip() 

5007 ] 

5008 

5009 train_documents, train_entities = {}, {} 

5010 dev_documents, dev_entities = {}, {} 

5011 test_documents, test_entities = {}, {} 

5012 

5013 for document_id, document_text in corpus.documents.items(): 

5014 if document_id in splits["train"] or document_id in splits["dev"]: 

5015 # the train and dev splits of V2 become the train split in V4 

5016 train_documents[document_id] = document_text 

5017 train_entities[document_id] = corpus.entities_per_document[document_id] 

5018 elif document_id in splits["test"]: 

5019 # the test split of V2 becomes the dev split in V4 

5020 dev_documents[document_id] = document_text 

5021 dev_entities[document_id] = corpus.entities_per_document[document_id] 

5022 else: 

5023 # documents new in V4 become the test split 

5024 test_documents[document_id] = document_text 

5025 test_entities[document_id] = corpus.entities_per_document[document_id] 

5026 

5027 train_corpus = InternalBioNerDataset( 

5028 documents=train_documents, entities_per_document=train_entities 

5029 ) 

5030 dev_corpus = InternalBioNerDataset( 

5031 documents=dev_documents, entities_per_document=dev_entities 

5032 ) 

5033 test_corpus = InternalBioNerDataset( 

5034 documents=test_documents, entities_per_document=test_entities 

5035 ) 

5036 

5037 return train_corpus, dev_corpus, test_corpus 

5038 

5039 @staticmethod 

5040 def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: 

5041 documents = {} 

5042 entities_per_document = {} 

5043 

5044 text_dir = corpus_dir / "articles" / "txt" 

5045 document_texts = [doc for doc in text_dir.iterdir() if doc.name[-4:] == ".txt"] 

5046 annotation_dirs = [ 

5047 path 

5048 for path in (corpus_dir / "concept-annotation").iterdir() 

5049 if path.name not in ["sections-and-typography", "coreference"] 

5050 and path.is_dir() 

5051 ] 

5052 

5053 for doc in Tqdm.tqdm(document_texts, desc="Converting to internal"): 

5054 document_id = doc.name.split(".")[0] 

5055 

5056 with open(doc, "r", encoding="utf8") as f_txt: 

5057 documents[document_id] = f_txt.read() 

5058 

5059 entities = [] 

5060 

5061 for annotation_dir in annotation_dirs: 

5062 with open( 

5063 annotation_dir 

5064 / annotation_dir.parts[-1] 

5065 / "knowtator" 

5066 / (doc.name + ".knowtator.xml"), 

5067 "r", 

5068 encoding="utf8" 

5069 ) as f_ann: 

5070 ann_tree = etree.parse(f_ann) 

5071 for annotation in ann_tree.xpath("//annotation"): 

5072 for span in annotation.xpath("span"): 

5073 start = int(span.get("start")) 

5074 end = int(span.get("end")) 

5075 entities += [Entity((start, end), annotation_dir.name.lower())] 

5076 

5077 entities_per_document[document_id] = entities 

5078 

5079 return InternalBioNerDataset( 

5080 documents=documents, entities_per_document=entities_per_document 

5081 ) 

5082 

5083 
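# Illustrative sketch (not part of the covered module): the knowtator XML files shipped with
# CRAFT contain <annotation> elements whose <span> children carry character offsets into the
# article text. The toy XML below is a simplified stand-in for such a file and shows how the
# XPath logic in CRAFT_V4.parse_corpus maps each span to an Entity typed after the annotation
# directory (here "chebi").
def _example_parse_knowtator_spans() -> List[Entity]:
    from lxml import etree

    toy_xml = (
        "<annotations>"
        '<annotation><span start="12" end="19"/></annotation>'
        '<annotation><span start="42" end="50"/></annotation>'
        "</annotations>"
    )
    root = etree.fromstring(toy_xml)
    return [
        Entity((int(span.get("start")), int(span.get("end"))), "chebi")
        for annotation in root.xpath("//annotation")
        for span in annotation.xpath("span")
    ]  # -> [chebi(12,19), chebi(42,50)]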

5084class HUNER_CHEMICAL_CRAFT_V4(HunerDataset): 

5085 """ 

5086 HUNER version of the CRAFT corpus containing (only) chemical annotations. 

5087 """ 

5088 

5089 def __init__(self, *args, **kwargs): 

5090 super().__init__( 

5091 *args, **kwargs, 

5092 ) 

5093 

5094 @staticmethod 

5095 def split_url() -> str: 

5096 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" 

5097 

5098 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5099 corpus_dir = CRAFT_V4.download_corpus(data_dir) 

5100 corpus = CRAFT_V4.parse_corpus(corpus_dir) 

5101 

5102 entity_type_mapping = {"chebi": CHEMICAL_TAG} 

5103 return filter_and_map_entities(corpus, entity_type_mapping) 

5104 

5105 

5106class HUNER_GENE_CRAFT_V4(HunerDataset): 

5107 """ 

5108 HUNER version of the CRAFT corpus containing (only) gene annotations. 

5109 """ 

5110 

5111 def __init__(self, *args, **kwargs): 

5112 super().__init__( 

5113 *args, **kwargs, 

5114 ) 

5115 

5116 @staticmethod 

5117 def split_url() -> str: 

5118 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" 

5119 

5120 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5121 corpus_dir = CRAFT_V4.download_corpus(data_dir) 

5122 corpus = CRAFT_V4.parse_corpus(corpus_dir) 

5123 

5124 entity_type_mapping = {"pr": GENE_TAG} 

5125 return filter_and_map_entities(corpus, entity_type_mapping) 

5126 

5127 

5128class HUNER_SPECIES_CRAFT_V4(HunerDataset): 

5129 """ 

5130 HUNER version of the CRAFT corpus containing (only) species annotations. 

5131 """ 

5132 

5133 def __init__(self, *args, **kwargs): 

5134 super().__init__( 

5135 *args, **kwargs, 

5136 ) 

5137 

5138 @staticmethod 

5139 def split_url() -> str: 

5140 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4" 

5141 

5142 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5143 corpus_dir = CRAFT_V4.download_corpus(data_dir) 

5144 corpus = CRAFT_V4.parse_corpus(corpus_dir) 

5145 

5146 entity_type_mapping = {"ncbitaxon": SPECIES_TAG} 

5147 return filter_and_map_entities(corpus, entity_type_mapping) 

5148 

5149 
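# Illustrative sketch (not part of the covered module): the three HUNER_*_CRAFT_V4 classes
# above all pass the full CRAFT corpus through filter_and_map_entities with a one-entry
# mapping. The real helper is defined earlier in this module; the hypothetical
# re-implementation below only sketches the presumed behaviour -- drop entities whose type
# is not a key of the mapping and relabel the remaining ones with the HUNER tag.
def _example_filter_and_map(
    dataset: InternalBioNerDataset, type_mapping: Dict[str, str]
) -> InternalBioNerDataset:
    mapped_entities = {
        document_id: [
            Entity((entity.char_span.start, entity.char_span.stop), type_mapping[entity.type])
            for entity in entities
            if entity.type in type_mapping
        ]
        for document_id, entities in dataset.entities_per_document.items()
    }
    return InternalBioNerDataset(
        documents=dataset.documents, entities_per_document=mapped_entities
    )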

5150class HUNER_CHEMICAL_BIONLP2013_CG(HunerDataset): 

5151 def __init__(self, *args, **kwargs): 

5152 super().__init__( 

5153 *args, **kwargs, 

5154 ) 

5155 

5156 @staticmethod 

5157 def split_url() -> str: 

5158 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" 

5159 

5160 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5161 train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) 

5162 train_corpus = BioNLPCorpus.parse_input_files(train_dir) 

5163 dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) 

5164 test_corpus = BioNLPCorpus.parse_input_files(test_dir) 

5165 corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) 

5166 

5167 entity_type_mapping = {"Simple_chemical": CHEMICAL_TAG} 

5168 return filter_and_map_entities(corpus, entity_type_mapping) 

5169 

5170 

5171class HUNER_DISEASE_BIONLP2013_CG(HunerDataset): 

5172 def __init__(self, *args, **kwargs): 

5173 super().__init__( 

5174 *args, **kwargs, 

5175 ) 

5176 

5177 @staticmethod 

5178 def split_url() -> str: 

5179 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" 

5180 

5181 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5182 train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) 

5183 train_corpus = BioNLPCorpus.parse_input_files(train_dir) 

5184 dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) 

5185 test_corpus = BioNLPCorpus.parse_input_files(test_dir) 

5186 corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) 

5187 

5188 entity_type_mapping = {"Cancer": DISEASE_TAG} 

5189 return filter_and_map_entities(corpus, entity_type_mapping) 

5190 

5191 

5192class HUNER_GENE_BIONLP2013_CG(HunerDataset): 

5193 def __init__(self, *args, **kwargs): 

5194 super().__init__( 

5195 *args, **kwargs, 

5196 ) 

5197 

5198 @staticmethod 

5199 def split_url() -> str: 

5200 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" 

5201 

5202 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5203 train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) 

5204 train_corpus = BioNLPCorpus.parse_input_files(train_dir) 

5205 dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) 

5206 test_corpus = BioNLPCorpus.parse_input_files(test_dir) 

5207 corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) 

5208 

5209 entity_type_mapping = {"Gene_or_gene_product": GENE_TAG} 

5210 return filter_and_map_entities(corpus, entity_type_mapping) 

5211 

5212 

5213class HUNER_SPECIES_BIONLP2013_CG(HunerDataset): 

5214 def __init__(self, *args, **kwargs): 

5215 super().__init__( 

5216 *args, **kwargs, 

5217 ) 

5218 

5219 @staticmethod 

5220 def split_url() -> str: 

5221 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg" 

5222 

5223 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5224 train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir) 

5225 train_corpus = BioNLPCorpus.parse_input_files(train_dir) 

5226 dev_corpus = BioNLPCorpus.parse_input_files(dev_dir) 

5227 test_corpus = BioNLPCorpus.parse_input_files(test_dir) 

5228 corpus = merge_datasets([train_corpus, dev_corpus, test_corpus]) 

5229 

5230 entity_type_mapping = {"Organism": SPECIES_TAG} 

5231 return filter_and_map_entities(corpus, entity_type_mapping) 

5232 

5233 
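# Illustrative sketch (not part of the covered module): the HUNER_*_BIONLP2013_CG classes
# above parse the official train, dev and test directories separately and then merge them
# into a single InternalBioNerDataset with merge_datasets (the HUNER split files define the
# final splits afterwards). A toy example of that merge step on two hand-built datasets:
def _example_merge_toy_datasets() -> InternalBioNerDataset:
    first = InternalBioNerDataset(
        documents={"doc-1": "BRCA1 is a gene."},
        entities_per_document={"doc-1": [Entity((0, 5), GENE_TAG)]},
    )
    second = InternalBioNerDataset(
        documents={"doc-2": "Aspirin is a chemical."},
        entities_per_document={"doc-2": [Entity((0, 7), CHEMICAL_TAG)]},
    )
    # the merged dataset contains both documents and their entity lists
    return merge_datasets([first, second])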

5234class AZDZ(ColumnCorpus): 

5235 """ 

5236 Arizona Disease Corpus from the Biomedical Informatics Lab at Arizona State University. 

5237 

5238 For further information see: 

5239 http://diego.asu.edu/index.php 

5240 """ 

5241 

5242 def __init__( 

5243 self, 

5244 base_path: Union[str, Path] = None, 

5245 in_memory: bool = True, 

5246 tokenizer: Tokenizer = None, 

5247 ): 

5248 """ 

5249 :param base_path: Path to the corpus on your machine 

5250 :param in_memory: If True, keeps the dataset in memory, giving speedups in training. 

5251 :param tokenizer: Implementation of :class:`Tokenizer` which segments sentences 

5252 into tokens (default :class:`SciSpacyTokenizer`) 

5253 """ 

5254 

5255 if isinstance(base_path, str): 

5256 base_path: Path = Path(base_path) 

5257 

5258 # column format 

5259 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

5260 

5261 # this dataset name 

5262 dataset_name = self.__class__.__name__.lower() 

5263 

5264 # default dataset folder is the cache root 

5265 if not base_path: 

5266 base_path = flair.cache_root / "datasets" 

5267 data_folder = base_path / dataset_name 

5268 

5269 if tokenizer is None: 

5270 tokenizer = SciSpacyTokenizer() 

5271 sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer) 

5272 

5273 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

5274 

5275 if not train_file.exists(): 

5276 corpus_file = self.download_corpus(data_folder) 

5277 corpus_data = self.parse_corpus(corpus_file) 

5278 

5279 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

5280 conll_writer.write_to_conll(corpus_data, train_file) 

5281 

5282 super(AZDZ, self).__init__( 

5283 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

5284 ) 

5285 

5286 @classmethod 

5287 def download_corpus(cls, data_dir: Path) -> Path: 

5288 url = "http://diego.asu.edu/downloads/AZDC_6-26-2009.txt" 

5289 data_path = cached_path(url, data_dir) 

5290 

5291 return data_path 

5292 

5293 @staticmethod 

5294 def parse_corpus(input_file: Path) -> InternalBioNerDataset: 

5295 documents = {} 

5296 entities_per_document = {} 

5297 

5298 with open(str(input_file), "r", encoding="iso-8859-1") as azdz_reader: 

5299 prev_document_id = None 

5300 prev_sentence_id = None 

5301 

5302 document_text = None 

5303 entities = [] 

5304 offset = None 

5305 

5306 for line in azdz_reader: 

5307 line = line.strip() 

5308 if not line or line.startswith("Doc Id"): 

5309 continue 

5310 

5311 columns = line.split("\t") 

5312 

5313 document_id = columns[1] # PMID 

5314 sentence_id = document_id + "_" + columns[2] # PMID + sentence no 

5315 

5316 if document_id != prev_document_id and document_text: 

5317 documents[document_id] = document_text 

5318 entities_per_document[document_id] = entities 

5319 

5320 document_text = None 

5321 entities = [] 

5322 offset = None 

5323 

5324 if sentence_id != prev_sentence_id: 

5325 offset = offset + len(SENTENCE_TAG) if offset else 0 

5326 document_text = ( 

5327 document_text + SENTENCE_TAG + columns[3].strip() 

5328 if document_text 

5329 else columns[3] 

5330 ) 

5331 

5332 try: 

5333 start = offset + int(columns[4]) - 1 

5334 end = offset + int(columns[5]) 

5335 except (ValueError, IndexError):  # skip rows with missing or malformed offsets 

5336 continue 

5337 

5338 if end == 0: 

5339 continue 

5340 

5341 entities.append(Entity((start, end), DISEASE_TAG)) 

5342 

5343 return InternalBioNerDataset( 

5344 documents=documents, entities_per_document=entities_per_document 

5345 ) 

5346 

5347 
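# Illustrative sketch (not part of the covered module): AZDZ stitches the per-sentence rows
# of the TSV back into one document string, joined by SENTENCE_TAG, and later relies on
# TagSentenceSplitter to cut each document at exactly those markers again. A minimal
# demonstration of that splitter, using flair's SpaceTokenizer for simplicity:
def _example_tag_sentence_splitting() -> List[str]:
    from flair.tokenization import SpaceTokenizer, TagSentenceSplitter

    splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SpaceTokenizer())
    document = "Fever was reported ." + SENTENCE_TAG + "No rash was observed ."
    sentences = splitter.split(document)
    return [sentence.to_plain_string() for sentence in sentences]
    # expected: ["Fever was reported .", "No rash was observed ."]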

5348class PDR(ColumnCorpus): 

5349 """ 

5350 Corpus of plant-disease relations from Kim et al., consisting of named entity annotations 

5351 for plants and diseases. 

5352 

5353 For further information see Kim et al.: 

5354 A corpus of plant-disease relations in the biomedical domain 

5355 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582 

5356 http://gcancer.org/pdr/ 

5357 """ 

5358 

5359 def __init__( 

5360 self, 

5361 base_path: Union[str, Path] = None, 

5362 in_memory: bool = True, 

5363 sentence_splitter: SentenceSplitter = None, 

5364 ): 

5365 """ 

5366 :param base_path: Path to the corpus on your machine 

5367 :param in_memory: If True, keeps the dataset in memory, giving speedups in training. 

5368 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which 

5369 segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) 

5370 """ 

5371 

5372 if isinstance(base_path, str): 

5373 base_path: Path = Path(base_path) 

5374 

5375 # column format 

5376 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY} 

5377 

5378 # this dataset name 

5379 dataset_name = self.__class__.__name__.lower() 

5380 

5381 # default dataset folder is the cache root 

5382 if not base_path: 

5383 base_path = flair.cache_root / "datasets" 

5384 data_folder = base_path / dataset_name 

5385 

5386 if sentence_splitter is None: 

5387 sentence_splitter = SciSpacySentenceSplitter() 

5388 

5389 train_file = data_folder / f"{sentence_splitter.name}_train.conll" 

5390 

5391 if not train_file.exists(): 

5392 corpus_folder = self.download_corpus(data_folder) 

5393 corpus_data = brat_to_internal( 

5394 corpus_folder, ann_file_suffixes=[".ann", ".ann2"] 

5395 ) 

5396 

5397 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) 

5398 conll_writer.write_to_conll(corpus_data, train_file) 

5399 

5400 super(PDR, self).__init__( 

5401 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory 

5402 ) 

5403 

5404 @classmethod 

5405 def download_corpus(cls, data_dir: Path) -> Path: 

5406 url = "http://gcancer.org/pdr/Plant-Disease_Corpus.tar.gz" 

5407 data_path = cached_path(url, data_dir) 

5408 unpack_file(data_path, data_dir) 

5409 

5410 return data_dir / "Plant-Disease_Corpus" 

5411 

5412 
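# Illustrative sketch (not part of the covered module): the PDR download ships brat standoff
# annotation files (*.ann and *.ann2) next to the article texts. A typical entity line looks
# like "T1<TAB>Disease 1866 1881<TAB>powdery mildews"; brat_to_internal (defined earlier in
# this module) turns such lines into Entity objects. The toy parser below handles only the
# simple case of a single continuous span and uses made-up offsets for illustration.
def _example_parse_brat_entity_line(
    line: str = "T1\tDisease 1866 1881\tpowdery mildews",
) -> Entity:
    _identifier, type_and_span, _surface_text = line.split("\t")
    entity_type, start, end = type_and_span.split(" ")
    return Entity((int(start), int(end)), entity_type)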

5413class HUNER_DISEASE_PDR(HunerDataset): 

5414 """ 

5415 HUNER version of the PDR corpus containing (only) disease annotations. 

5416 """ 

5417 

5418 def __init__(self, *args, **kwargs): 

5419 super().__init__(*args, **kwargs) 

5420 

5421 @staticmethod 

5422 def split_url() -> str: 

5423 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/pdr" 

5424 

5425 def to_internal(self, data_dir: Path) -> InternalBioNerDataset: 

5426 corpus_folder = PDR.download_corpus(data_dir) 

5427 corpus_data = brat_to_internal( 

5428 corpus_folder, ann_file_suffixes=[".ann", ".ann2"] 

5429 ) 

5430 corpus_data = filter_and_map_entities(corpus_data, {"Disease": DISEASE_TAG}) 

5431 

5432 return corpus_data 

5433 

5434 

5435class HunerMultiCorpus(MultiCorpus): 

5436 """ 

5437 Base class to build the union of all HUNER data sets for a particular entity type. 

5438 """ 

5439 

5440 def __init__(self, entity_type: str, sentence_splitter: SentenceSplitter = None): 

5441 self.entity_type = entity_type 

5442 

5443 def entity_type_predicate(member): 

5444 return f"HUNER_{entity_type}_" in str(member) and inspect.isclass(member) 

5445 

5446 self.huner_corpora_classes = inspect.getmembers(sys.modules[__name__], predicate=entity_type_predicate) 

5447 self.huner_corpora = [] 

5448 for name, constructor_func in self.huner_corpora_classes: 

5449 try: 

5450 if not sentence_splitter: 

5451 corpus = constructor_func() 

5452 else: 

5453 corpus = constructor_func(sentence_splitter=sentence_splitter) 

5454 

5455 self.huner_corpora.append(corpus) 

5456 except Exception: 

5457 print(f"Can't download and prepare corpus {name}:\n{sys.exc_info()[1]}\n\n") 

5458 

5459 super(HunerMultiCorpus, self).__init__( 

5460 corpora=self.huner_corpora, name=f"HUNER-{entity_type}" 

5461 ) 

5462 

5463 
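# Illustrative sketch (not part of the covered module): HunerMultiCorpus collects its member
# corpora purely by class name, using inspect.getmembers over this module with a predicate
# that matches "HUNER_<TYPE>_" in the class name. The helper below lists the class names that
# would be picked up for a given entity type without instantiating (and thus downloading)
# any of them.
def _example_list_huner_classes(entity_type: str = "CHEMICAL") -> List[str]:
    import inspect
    import sys

    def predicate(member) -> bool:
        return inspect.isclass(member) and f"HUNER_{entity_type}_" in str(member)

    return [name for name, _cls in inspect.getmembers(sys.modules[__name__], predicate)]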

5464class HUNER_CELL_LINE(HunerMultiCorpus): 

5465 """ 

5466 Union of all HUNER cell line data sets. 

5467 """ 

5468 

5469 def __init__(self, sentence_splitter: SentenceSplitter = None): 

5470 super(HUNER_CELL_LINE, self).__init__( 

5471 entity_type="CELL_LINE", 

5472 sentence_splitter=sentence_splitter 

5473 ) 

5474 

5475 

5476class HUNER_CHEMICAL(HunerMultiCorpus): 

5477 """ 

5478 Union of all HUNER chemical data sets. 

5479 """ 

5480 

5481 def __init__(self, sentence_splitter: SentenceSplitter = None): 

5482 super(HUNER_CHEMICAL, self).__init__( 

5483 entity_type="CHEMICAL", 

5484 sentence_splitter=sentence_splitter 

5485 ) 

5486 

5487 

5488class HUNER_DISEASE(HunerMultiCorpus): 

5489 """ 

5490 Union of all HUNER disease data sets. 

5491 """ 

5492 

5493 def __init__(self, sentence_splitter: SentenceSplitter = None): 

5494 super(HUNER_DISEASE, self).__init__( 

5495 entity_type="DISEASE", 

5496 sentence_splitter=sentence_splitter 

5497 ) 

5498 

5499 

5500class HUNER_GENE(HunerMultiCorpus): 

5501 """ 

5502 Union of all HUNER gene data sets. 

5503 """ 

5504 

5505 def __init__(self, sentence_splitter: SentenceSplitter = None): 

5506 super(HUNER_GENE, self).__init__( 

5507 entity_type="GENE", 

5508 sentence_splitter=sentence_splitter 

5509 ) 

5510 

5511 

5512class HUNER_SPECIES(HunerMultiCorpus): 

5513 """ 

5514 Union of all HUNER species data sets. 

5515 """ 

5516 

5517 def __init__(self, sentence_splitter: SentenceSplitter = None): 

5518 super(HUNER_SPECIES, self).__init__( 

5519 entity_type="SPECIES", 

5520 sentence_splitter=sentence_splitter 

5521 ) 

5522
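# Illustrative usage sketch (not part of the covered module): the union corpora above behave
# like any other flair corpus, but note that the first call tries to download and convert
# every member data set, which needs network access and can take a long time. An optional
# sentence_splitter is forwarded to every member corpus.
def _example_load_huner_union():
    from flair.datasets.biomedical import HUNER_DISEASE

    corpus = HUNER_DISEASE()  # builds the union of all HUNER disease corpora
    print(corpus)             # aggregated train/dev/test statistics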