# Coverage for /home/ubuntu/Documents/Research/mut_p1/flair/flair/datasets/biomedical.py: 22%

import logging
import inspect
import flair
import ftfy
import json
import os
import shutil
import re
import sys

from abc import ABC, abstractmethod
from collections import defaultdict, deque
from copy import copy
from lxml import etree
from lxml.etree import XMLSyntaxError
from typing import Union, Callable, Dict, List, Tuple, Iterable, Optional
from operator import attrgetter
from pathlib import Path
from warnings import warn

from flair.data import MultiCorpus
from flair.data import Tokenizer
from flair.file_utils import cached_path, Tqdm, unpack_file
from flair.datasets import ColumnCorpus, ColumnDataset
from flair.tokenization import (
    SentenceSplitter,
    SciSpacySentenceSplitter,
    NoSentenceSplitter,
    TagSentenceSplitter,
    SciSpacyTokenizer,
    NewlineSentenceSplitter,
    SpaceTokenizer,
)


DISEASE_TAG = "Disease"
CHEMICAL_TAG = "Chemical"
CELL_LINE_TAG = "CellLine"
GENE_TAG = "Gene"
SPECIES_TAG = "Species"

SENTENCE_TAG = "[__SENT__]"

logger = logging.getLogger("flair")


class Entity:
    """
    Internal class to represent entities while converting biomedical NER corpora to a standardized format
    (only used for pre-processing purposes!). Each entity consists of the char span it addresses in
    the original text as well as the type of entity (e.g. Chemical, Gene, and so on).
    """

    def __init__(self, char_span: Tuple[int, int], entity_type: str):
        assert char_span[0] < char_span[1]
        self.char_span = range(*char_span)
        self.type = entity_type

    def __str__(self):
        return f"{self.type}({self.char_span.start},{self.char_span.stop})"

    def __repr__(self):
        return str(self)

    def is_before(self, other_entity) -> bool:
        """
        Checks whether this entity is located before the given one

        :param other_entity: Entity to check
        """
        return self.char_span.stop <= other_entity.char_span.start

    def contains(self, other_entity) -> bool:
        """
        Checks whether the given entity is fully contained in this entity

        :param other_entity: Entity to check
        """
        return (
            other_entity.char_span.start >= self.char_span.start
            and other_entity.char_span.stop <= self.char_span.stop
        )

    def overlaps(self, other_entity) -> bool:
        """
        Checks whether this and the given entity overlap

        :param other_entity: Entity to check
        """
        return (
            self.char_span.start <= other_entity.char_span.start < self.char_span.stop
        ) or (self.char_span.start < other_entity.char_span.stop <= self.char_span.stop)
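

# Usage sketch (illustrative values only, not part of the original module):
# spans are half-open character ranges, so (0, 11) covers characters 0..10.
#
#   e_outer = Entity((0, 11), GENE_TAG)
#   e_inner = Entity((4, 8), GENE_TAG)
#   e_later = Entity((15, 20), DISEASE_TAG)
#   assert e_outer.contains(e_inner) and e_outer.overlaps(e_inner)
#   assert e_outer.is_before(e_later)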


class InternalBioNerDataset:
    """
    Internal class to represent a corpus and its entities.
    """

    def __init__(
        self, documents: Dict[str, str], entities_per_document: Dict[str, List[Entity]]
    ):
        self.documents = documents
        self.entities_per_document = entities_per_document


def merge_datasets(data_sets: Iterable[InternalBioNerDataset]):
    all_documents = {}
    all_entities = {}

    for ds in data_sets:
        all_documents.update(ds.documents)
        all_entities.update(ds.entities_per_document)

    return InternalBioNerDataset(
        documents=all_documents, entities_per_document=all_entities
    )


def filter_and_map_entities(
    dataset: InternalBioNerDataset, entity_type_to_canonical: Dict[str, str]
) -> InternalBioNerDataset:
    """
    :param entity_type_to_canonical: Maps each entity type in the dataset to its canonical type;
        entity types that are not present in the map are discarded
    """
    mapped_entities_per_document = {}
    for id, entities in dataset.entities_per_document.items():
        new_entities = []
        for entity in entities:
            if entity.type in entity_type_to_canonical:
                new_entity = copy(entity)
                new_entity.type = entity_type_to_canonical[entity.type]
                new_entities.append(new_entity)
            else:
                logger.debug(f"Skip entity type {entity.type}")
        mapped_entities_per_document[id] = new_entities

    return InternalBioNerDataset(
        documents=dataset.documents, entities_per_document=mapped_entities_per_document
    )
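

# Usage sketch (with a hypothetical mini-dataset): map the corpus-specific
# type "protein" onto the canonical GENE_TAG and silently drop everything else.
#
#   ds = InternalBioNerDataset(
#       documents={"d1": "BRCA1 causes cancer"},
#       entities_per_document={
#           "d1": [Entity((0, 5), "protein"), Entity((13, 19), "disease")]
#       },
#   )
#   ds = filter_and_map_entities(ds, {"protein": GENE_TAG})
#   # -> only the BRCA1 annotation survives, retyped as "Gene"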


def filter_nested_entities(dataset: InternalBioNerDataset) -> None:
    num_entities_before = sum(len(x) for x in dataset.entities_per_document.values())

    for document_id, entities in dataset.entities_per_document.items():
        # Uses a dynamic programming approach to calculate the maximum independent set
        # of the interval graph, with the sum of all entity lengths as secondary key
        dp_array = [
            (0, 0, 0, None)
        ]  # position_end, number of entities, sum of all entity lengths, last entity
        for entity in sorted(entities, key=lambda x: x.char_span.stop):
            i = len(dp_array) - 1
            while dp_array[i][0] > entity.char_span.start:
                i -= 1
            if dp_array[i][1] + 1 > dp_array[-1][1] or (
                dp_array[i][1] + 1 == dp_array[-1][1]
                and dp_array[i][2] + len(entity.char_span) > dp_array[-1][2]
            ):
                dp_array += [
                    (
                        entity.char_span.stop,
                        dp_array[i][1] + 1,
                        dp_array[i][2] + len(entity.char_span),
                        entity,
                    )
                ]
            else:
                dp_array += [dp_array[-1]]

        independent_set = []
        p = dp_array[-1][0]
        for dp_entry in dp_array[::-1]:
            if dp_entry[3] is None:
                break
            if dp_entry[0] <= p:
                independent_set += [dp_entry[3]]
                p -= len(dp_entry[3].char_span)

        dataset.entities_per_document[document_id] = independent_set

    num_entities_after = sum(len(x) for x in dataset.entities_per_document.values())
    if num_entities_before != num_entities_after:
        removed = num_entities_before - num_entities_after
        warn(
            f"Corpus modified by filtering nested entities. Removed {removed} entities."
        )
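

# Usage sketch (illustrative): of two overlapping annotations on the same
# document, only the maximum-independent-set member is kept (ties broken by
# total span length), and the modification is reported via a warning.
#
#   ds = InternalBioNerDataset(
#       documents={"d1": "acute myeloid leukemia"},
#       entities_per_document={
#           "d1": [Entity((0, 22), DISEASE_TAG), Entity((6, 22), DISEASE_TAG)]
#       },
#   )
#   filter_nested_entities(ds)  # keeps only the longer (0, 22) entity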


def bioc_to_internal(bioc_file: Path):
    """
    Helper function to parse corpora that are given in BioC format. See

    http://bioc.sourceforge.net/

    for details.
    """
    tree = etree.parse(str(bioc_file))
    texts_per_document = {}
    entities_per_document = {}
    documents = tree.xpath(".//document")

    all_entities = 0
    non_matching = 0

    for document in Tqdm.tqdm(documents, desc="Converting to internal"):
        document_id = document.xpath("./id")[0].text
        texts = []
        entities = []

        for passage in document.xpath("passage"):
            passage_texts = passage.xpath("text/text()")
            if len(passage_texts) == 0:
                continue
            text = passage_texts[0]

            passage_offset = int(
                passage.xpath("./offset/text()")[0]
            )  # from BioC annotation

            # calculate the offset without the current text
            # because we stick all passages of a document together
            document_text = " ".join(texts)
            document_offset = len(document_text)

            texts.append(text)
            document_text += " " + text

            for annotation in passage.xpath(".//annotation"):

                entity_types = [
                    i.text.replace(" ", "_")
                    for i in annotation.xpath("./infon")
                    if i.attrib["key"] in {"type", "class"}
                ]

                start = (
                    int(annotation.xpath("./location")[0].get("offset"))
                    - passage_offset
                )
                # TODO For split entities we also annotate everything in between, which might be a bad idea?
                final_length = int(annotation.xpath("./location")[-1].get("length"))
                final_offset = (
                    int(annotation.xpath("./location")[-1].get("offset"))
                    - passage_offset
                )
                if final_length <= 0:
                    continue
                end = final_offset + final_length

                start += document_offset
                end += document_offset

                true_entity = annotation.xpath(".//text")[0].text
                annotated_entity = " ".join(texts)[start:end]

                # Try to fix incorrect annotations
                if annotated_entity.lower() != true_entity.lower():
                    max_shift = min(3, len(true_entity))
                    for i in range(max_shift):
                        index = annotated_entity.lower().find(
                            true_entity[0 : max_shift - i].lower()
                        )
                        if index != -1:
                            start += index
                            end += index
                            break

                annotated_entity = " ".join(texts)[start:end]
                if annotated_entity.lower() != true_entity.lower():
                    non_matching += 1

                all_entities += 1

                for entity_type in entity_types:
                    entities.append(Entity((start, end), entity_type))

        texts_per_document[document_id] = " ".join(texts)
        entities_per_document[document_id] = entities

    # print(
    #     f"Found {non_matching} non-matching entities ({non_matching/all_entities}%) in {bioc_file}"
    # )

    return InternalBioNerDataset(
        documents=texts_per_document, entities_per_document=entities_per_document
    )
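

# Minimal shape of the BioC XML this parser expects (sketch, not a complete
# document; only the elements read above are shown):
#
#   <collection>
#     <document>
#       <id>PMID-1</id>
#       <passage>
#         <offset>0</offset>
#         <text>BRCA1 is a tumor suppressor.</text>
#         <annotation>
#           <infon key="type">Gene</infon>
#           <location offset="0" length="5"/>
#           <text>BRCA1</text>
#         </annotation>
#       </passage>
#     </document>
#   </collection>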


def brat_to_internal(corpus_dir: Path, ann_file_suffixes=None) -> InternalBioNerDataset:
    """
    Helper function to parse corpora that are annotated using BRAT. See

    https://brat.nlplab.org/

    for details.
    """
    if ann_file_suffixes is None:
        ann_file_suffixes = [".ann"]

    text_files = list(corpus_dir.glob("*.txt"))
    documents = {}
    entities_per_document = defaultdict(list)
    for text_file in text_files:
        document_text = open(str(text_file), encoding="utf8").read().strip()
        document_id = text_file.stem

        for suffix in ann_file_suffixes:
            with open(str(text_file.with_suffix(suffix)), "r", encoding="utf8") as ann_file:
                for line in ann_file:
                    fields = line.strip().split("\t")

                    # Ignore empty lines or relation annotations
                    if not fields or len(fields) <= 2:
                        continue

                    ent_type, char_start, char_end = fields[1].split()
                    start = int(char_start)
                    end = int(char_end)

                    # FIX annotation of whitespaces (necessary for PDR)
                    while document_text[start:end].startswith(" "):
                        start += 1

                    while document_text[start:end].endswith(" "):
                        end -= 1

                    entities_per_document[document_id].append(
                        Entity(char_span=(start, end), entity_type=ent_type)
                    )

                    assert document_text[start:end].strip() == fields[2].strip()

        documents[document_id] = document_text

    return InternalBioNerDataset(
        documents=documents, entities_per_document=dict(entities_per_document)
    )
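

# Shape of a BRAT entity line as consumed above (sketch): three tab-separated
# fields, with the entity type and character span in the middle field.
#
#   T1<TAB>Disease 10 22<TAB>heart attack
#
# Relation annotations (e.g. "R1<TAB>Cause Arg1:T1 Arg2:T2") have at most two
# tab-separated fields and are skipped by the len(fields) <= 2 check.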


class CoNLLWriter:
    """
    Class which implements the output CoNLL file generation for corpora given as instances of
    :class:`InternalBioNerDataset`.
    """

    def __init__(self, sentence_splitter: SentenceSplitter):
        """
        :param sentence_splitter: Implementation of :class:`SentenceSplitter` which
            segments the text into sentences and tokens
        """
        self.sentence_splitter = sentence_splitter

    def process_dataset(
        self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path
    ):
        self.write_to_conll(datasets["train"], out_dir / "train.conll")
        self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
        self.write_to_conll(datasets["test"], out_dir / "test.conll")

    def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path):
        os.makedirs(str(output_file.parent), exist_ok=True)
        filter_nested_entities(dataset)

        with output_file.open("w", encoding="utf8") as f:
            for document_id in Tqdm.tqdm(
                dataset.documents.keys(),
                total=len(dataset.documents),
                desc="Converting to CoNLL",
            ):
                document_text = ftfy.fix_text(dataset.documents[document_id])
                document_text = re.sub(
                    r"[\u2000-\u200B]", " ", document_text
                )  # replace unicode space characters!
                document_text = document_text.replace(
                    "\xa0", " "
                )  # replace non-break space

                entities = deque(
                    sorted(
                        dataset.entities_per_document[document_id],
                        key=attrgetter("char_span.start", "char_span.stop"),
                    )
                )
                current_entity = entities.popleft() if entities else None

                sentences = self.sentence_splitter.split(document_text)

                for sentence in sentences:
                    in_entity = False
                    sentence_had_tokens = False

                    for flair_token in sentence.tokens:
                        token = flair_token.text.strip()
                        offset = sentence.start_pos + flair_token.start_pos

                        if current_entity and offset >= current_entity.char_span.stop:
                            in_entity = False

                            # One token may contain multiple entities -> dequeue all of them
                            while (
                                current_entity
                                and offset >= current_entity.char_span.stop
                            ):
                                current_entity = (
                                    entities.popleft() if entities else None
                                )

                        if current_entity and offset in current_entity.char_span:
                            if not in_entity:
                                tag = "B-" + current_entity.type
                                in_entity = True
                            else:
                                tag = "I-" + current_entity.type
                        else:
                            tag = "O"
                            in_entity = False

                        whitespace_after = "+" if flair_token.whitespace_after else "-"
                        if len(token) > 0:
                            f.write(" ".join([token, tag, whitespace_after]) + "\n")
                            sentence_had_tokens = True

                    if sentence_had_tokens:
                        f.write("\n")
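

# Output format sketch: one token per line with its BIO tag and a "+"/"-"
# marker for trailing whitespace; sentences are separated by blank lines.
# For a document "BRCA1 mutations cause cancer." with a Gene annotation on
# "BRCA1", the written file would look like:
#
#   BRCA1 B-Gene +
#   mutations O +
#   cause O +
#   cancer O -
#   . O -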


class HunerDataset(ColumnCorpus, ABC):
    """
    Base class for HUNER datasets.

    Every subclass has to implement the following methods:
      - `to_internal`, which reads the complete data set (incl. train, dev, test) and returns the corpus
        as InternalBioNerDataset
      - `split_url`, which returns the base url (i.e. without '.train', '.dev', '.test') to the HUNER split files

    For further information see:
      - Weber et al.: 'HUNER: improving biomedical NER with pretraining'
        https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext
      - HUNER github repository:
        https://github.com/hu-ner/huner
    """

    @abstractmethod
    def to_internal(self, data_folder: Path) -> InternalBioNerDataset:
        raise NotImplementedError()

    @staticmethod
    @abstractmethod
    def split_url() -> str:
        raise NotImplementedError()

    def get_corpus_sentence_splitter(self) -> Optional[SentenceSplitter]:
        """
        If the corpus has a pre-defined sentence splitting, then this method returns
        the sentence splitter to be used to reconstruct the original splitting.
        If the corpus has no pre-defined sentence splitting, None will be returned.
        """
        return None

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
            segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        self.sentence_splitter = self.get_corpus_sentence_splitter()
        if not self.sentence_splitter:
            self.sentence_splitter = (
                sentence_splitter if sentence_splitter else SciSpacySentenceSplitter()
            )
        else:
            if sentence_splitter:
                warn(
                    f"The corpus {self.__class__.__name__} has a pre-defined sentence splitting, "
                    f"thus only the tokenizer of the given sentence splitter is used"
                )
                self.sentence_splitter.tokenizer = sentence_splitter.tokenizer

        # Create tokenization-dependent CoNLL files. This is necessary to prevent
        # caching issues (e.g. loading the same corpus with different sentence splitters)
        train_file = data_folder / f"{self.sentence_splitter.name}_train.conll"
        dev_file = data_folder / f"{self.sentence_splitter.name}_dev.conll"
        test_file = data_folder / f"{self.sentence_splitter.name}_test.conll"

        if not (train_file.exists() and dev_file.exists() and test_file.exists()):
            splits_dir = data_folder / "splits"
            os.makedirs(splits_dir, exist_ok=True)

            writer = CoNLLWriter(sentence_splitter=self.sentence_splitter)
            internal_dataset = self.to_internal(data_folder)

            train_data = self.get_subset(internal_dataset, "train", splits_dir)
            writer.write_to_conll(train_data, train_file)

            dev_data = self.get_subset(internal_dataset, "dev", splits_dir)
            writer.write_to_conll(dev_data, dev_file)

            test_data = self.get_subset(internal_dataset, "test", splits_dir)
            writer.write_to_conll(test_data, test_file)

        super(HunerDataset, self).__init__(
            data_folder=data_folder,
            train_file=train_file.name,
            dev_file=dev_file.name,
            test_file=test_file.name,
            column_format=columns,
            tag_to_bioes="ner",
            in_memory=in_memory,
        )

    def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path):
        split_file = cached_path(f"{self.split_url()}.{split}", split_dir)

        with split_file.open(encoding="utf8") as f:
            ids = [l.strip() for l in f if l.strip()]
            ids = sorted(id_ for id_ in ids if id_ in dataset.documents)

        return InternalBioNerDataset(
            documents={k: dataset.documents[k] for k in ids},
            entities_per_document={k: dataset.entities_per_document[k] for k in ids},
        )
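

# Subclass sketch (hypothetical corpus with a made-up split URL): a concrete
# HUNER dataset only needs the split-file base URL and the conversion to
# InternalBioNerDataset; downloading, splitting and CoNLL writing are inherited.
#
#   class HUNER_GENE_MY_CORPUS(HunerDataset):
#       @staticmethod
#       def split_url() -> str:
#           return "https://example.org/splits/my_corpus"
#
#       def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
#           corpus = brat_to_internal(data_dir)  # assuming BRAT annotations
#           return filter_and_map_entities(corpus, {"Protein": GENE_TAG})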


class BIO_INFER(ColumnCorpus):
    """
    Original BioInfer corpus

    For further information see Pyysalo et al.:
    BioInfer: a corpus for information extraction in the biomedical domain
    https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-50
    """

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        train_file = data_folder / "train.conll"

        if not train_file.exists():
            corpus_folder = self.download_dataset(data_folder)
            corpus_data = self.parse_dataset(corpus_folder)

            sentence_splitter = NoSentenceSplitter(tokenizer=SpaceTokenizer())

            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
            conll_writer.write_to_conll(corpus_data, train_file)

        super(BIO_INFER, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )

    @classmethod
    def download_dataset(cls, data_dir: Path) -> Path:
        data_url = "http://mars.cs.utu.fi/BioInfer/files/BioInfer_corpus_1.1.1.zip"
        data_path = cached_path(data_url, data_dir)
        unpack_file(data_path, data_dir)

        return data_dir / "BioInfer_corpus_1.1.1.xml"

    @classmethod
    def parse_dataset(cls, original_file: Path):
        documents = {}
        entities_per_document = {}

        tree = etree.parse(str(original_file))
        sentence_elems = tree.xpath("//sentence")
        for sentence_id, sentence in enumerate(sentence_elems):
            sentence_id = str(sentence_id)
            token_id_to_span = {}
            sentence_text = ""
            entities_per_document[sentence_id] = []

            for token in sentence.xpath(".//token"):
                token_text = "".join(token.xpath(".//subtoken/@text"))
                token_id = ".".join(token.attrib["id"].split(".")[1:])

                if not sentence_text:
                    token_id_to_span[token_id] = (0, len(token_text))
                    sentence_text = token_text
                else:
                    token_id_to_span[token_id] = (
                        len(sentence_text) + 1,
                        len(token_text) + len(sentence_text) + 1,
                    )
                    sentence_text += " " + token_text
            documents[sentence_id] = sentence_text

            entities = [
                e for e in sentence.xpath(".//entity") if not e.attrib["type"].isupper()
            ]  # all caps entity type apparently marks event trigger

            for entity in entities:
                token_nums = []
                entity_character_starts = []
                entity_character_ends = []

                for subtoken in entity.xpath(".//nestedsubtoken"):
                    token_id_parts = subtoken.attrib["id"].split(".")
                    token_id = ".".join(token_id_parts[1:3])

                    token_nums.append(int(token_id_parts[2]))
                    entity_character_starts.append(token_id_to_span[token_id][0])
                    entity_character_ends.append(token_id_to_span[token_id][1])

                if token_nums and entity_character_starts and entity_character_ends:
                    entity_tokens = list(
                        zip(token_nums, entity_character_starts, entity_character_ends)
                    )

                    start_token = entity_tokens[0]
                    last_entity_token = entity_tokens[0]
                    for entity_token in entity_tokens[1:]:
                        if entity_token[0] - 1 != last_entity_token[0]:
                            entities_per_document[sentence_id].append(
                                Entity(
                                    char_span=(start_token[1], last_entity_token[2]),
                                    entity_type=entity.attrib["type"],
                                )
                            )
                            start_token = entity_token

                        last_entity_token = entity_token

                    if start_token:
                        entities_per_document[sentence_id].append(
                            Entity(
                                char_span=(start_token[1], last_entity_token[2]),
                                entity_type=entity.attrib["type"],
                            )
                        )

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )


class HUNER_GENE_BIO_INFER(HunerDataset):
    """
    HUNER version of the BioInfer corpus containing only gene/protein annotations
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bioinfer"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        original_file = BIO_INFER.download_dataset(data_dir)
        corpus = BIO_INFER.parse_dataset(original_file)

        entity_type_mapping = {
            "Individual_protein": GENE_TAG,
            "Gene/protein/RNA": GENE_TAG,
            "Gene": GENE_TAG,
            "DNA_family_or_group": GENE_TAG,
        }

        return filter_and_map_entities(corpus, entity_type_mapping)


class JNLPBA(ColumnCorpus):
    """
    Original corpus of the JNLPBA shared task.

    For further information see Kim et al.:
    Introduction to the Bio-Entity Recognition Task at JNLPBA
    https://www.aclweb.org/anthology/W04-1213.pdf
    """

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        train_file = data_folder / "train.conll"
        test_file = data_folder / "test.conll"

        if not (train_file.exists() and test_file.exists()):
            download_dir = data_folder / "original"
            os.makedirs(download_dir, exist_ok=True)

            train_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz"
            train_data_path = cached_path(train_data_url, download_dir)
            unpack_file(train_data_path, download_dir)

            test_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz"
            test_data_path = cached_path(test_data_url, download_dir)
            unpack_file(test_data_path, download_dir)

            train_file = download_dir / "Genia4ERtask2.iob2"
            shutil.copy(train_file, data_folder / "train.conll")

            test_file = download_dir / "Genia4EReval2.iob2"
            shutil.copy(test_file, data_folder / "test.conll")

        super(JNLPBA, self).__init__(
            data_folder,
            columns,
            tag_to_bioes="ner",
            in_memory=in_memory,
            comment_symbol="#",
        )
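

# Usage sketch (hypothetical call; paths follow the defaults above): the
# shared-task data is downloaded on first use and then loaded as a ColumnCorpus.
#
#   corpus = JNLPBA()
#   print(corpus)  # train/test splits read from {cache_root}/datasets/jnlpba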


class HunerJNLPBA(object):
    @classmethod
    def download_and_prepare_train(
        cls, data_folder: Path, sentence_tag: str
    ) -> InternalBioNerDataset:
        train_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz"
        train_data_path = cached_path(train_data_url, data_folder)
        unpack_file(train_data_path, data_folder)

        train_input_file = data_folder / "Genia4ERtask2.iob2"
        return cls.read_file(train_input_file, sentence_tag)

    @classmethod
    def download_and_prepare_test(
        cls, data_folder: Path, sentence_tag: str
    ) -> InternalBioNerDataset:
        test_data_url = "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz"
        test_data_path = cached_path(test_data_url, data_folder)
        unpack_file(test_data_path, data_folder)

        test_input_file = data_folder / "Genia4EReval2.iob2"
        return cls.read_file(test_input_file, sentence_tag)

    @classmethod
    def read_file(
        cls, input_iob_file: Path, sentence_tag: str
    ) -> InternalBioNerDataset:
        documents = {}
        entities_per_document = defaultdict(list)

        with open(str(input_iob_file), "r", encoding="utf8") as file_reader:
            document_id = None
            document_text = None

            entities = []
            entity_type = None
            entity_start = 0

            for line in file_reader:
                line = line.strip()
                if line[:3] == "###":
                    if not (document_id is None and document_text is None):
                        documents[document_id] = document_text
                        entities_per_document[document_id] = entities

                    document_id = line.split(":")[-1]
                    document_text = None

                    entities = []
                    entity_type = None
                    entity_start = 0

                    next(file_reader)
                    continue

                if line:
                    parts = line.split()
                    token = parts[0].strip()
                    tag = parts[1].strip()

                    if tag.startswith("B-"):
                        if entity_type is not None:
                            entities.append(
                                Entity((entity_start, len(document_text)), entity_type)
                            )

                        entity_start = len(document_text) + 1 if document_text else 0
                        entity_type = tag[2:]

                    elif tag == "O" and entity_type is not None:
                        entities.append(
                            Entity((entity_start, len(document_text)), entity_type)
                        )
                        entity_type = None

                    document_text = (
                        document_text + " " + token if document_text else token
                    )

                else:
                    document_text += sentence_tag

            # Edge case: the file ends while an entity is still open
            if entity_type is not None:
                entities.append(
                    Entity((entity_start, len(document_text)), entity_type)
                )

            # Last document in file
            if not (document_id is None and document_text is None):
                documents[document_id] = document_text
                entities_per_document[document_id] = entities

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )
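

# Shape of the IOB2 input consumed by read_file (sketch): "###"-prefixed
# MEDLINE headers separate abstracts (the line after a header is skipped),
# blank lines separate sentences, and each token line carries a
# whitespace-separated token and tag.
#
#   ### MEDLINE:95338244
#
#   IL-2 B-protein
#   gene I-protein
#   expression O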


class HUNER_GENE_JNLPBA(HunerDataset):
    """
    HUNER version of the JNLPBA corpus containing gene annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia"

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        orig_folder = data_dir / "original"
        os.makedirs(str(orig_folder), exist_ok=True)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        train_data = HunerJNLPBA.download_and_prepare_train(orig_folder, sentence_separator)
        train_data = filter_and_map_entities(train_data, {"protein": GENE_TAG})

        test_data = HunerJNLPBA.download_and_prepare_test(orig_folder, sentence_separator)
        test_data = filter_and_map_entities(test_data, {"protein": GENE_TAG})

        return merge_datasets([train_data, test_data])


class HUNER_CELL_LINE_JNLPBA(HunerDataset):
    """
    HUNER version of the JNLPBA corpus containing cell line annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/genia"

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        download_folder = data_dir / "original"
        os.makedirs(str(download_folder), exist_ok=True)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        train_data = HunerJNLPBA.download_and_prepare_train(download_folder, sentence_separator)
        train_data = filter_and_map_entities(train_data, {"cell_line": CELL_LINE_TAG})

        test_data = HunerJNLPBA.download_and_prepare_test(download_folder, sentence_separator)
        test_data = filter_and_map_entities(test_data, {"cell_line": CELL_LINE_TAG})

        return merge_datasets([train_data, test_data])


class CELL_FINDER(ColumnCorpus):
    """
    Original CellFinder corpus containing cell line, species and gene annotations.

    For further information see Neves et al.:
    Annotating and evaluating text for stem cell research
    https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which segments
            the text into sentences and tokens.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        if sentence_splitter is None:
            sentence_splitter = SciSpacySentenceSplitter()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        train_file = data_folder / f"{sentence_splitter.name}_train.conll"
        if not train_file.exists():
            train_corpus = self.download_and_prepare(data_folder)

            writer = CoNLLWriter(sentence_splitter=sentence_splitter)
            writer.write_to_conll(train_corpus, train_file)

        super(CELL_FINDER, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )

    @classmethod
    def download_and_prepare(cls, data_folder: Path) -> InternalBioNerDataset:
        data_url = "https://www.informatik.hu-berlin.de/de/forschung/gebiete/wbi/resources/cellfinder/cellfinder1_brat.tar.gz"
        data_path = cached_path(data_url, data_folder)
        unpack_file(data_path, data_folder)

        return cls.read_folder(data_folder)

    @classmethod
    def read_folder(cls, data_folder: Path) -> InternalBioNerDataset:
        ann_files = list(data_folder.glob("*.ann"))
        documents = {}
        entities_per_document = defaultdict(list)
        for ann_file in ann_files:
            with ann_file.open(encoding="utf8") as f_ann, ann_file.with_suffix(".txt").open(encoding="utf8") as f_txt:
                document_text = f_txt.read().strip()

                document_id = ann_file.stem
                documents[document_id] = document_text

                for line in f_ann:
                    fields = line.strip().split("\t")
                    if not fields:
                        continue
                    ent_type, char_start, char_end = fields[1].split()
                    entities_per_document[document_id].append(
                        Entity(
                            char_span=(int(char_start), int(char_end)),
                            entity_type=ent_type,
                        )
                    )

                    assert document_text[int(char_start) : int(char_end)] == fields[2]

        return InternalBioNerDataset(
            documents=documents, entities_per_document=dict(entities_per_document)
        )


class HUNER_CELL_LINE_CELL_FINDER(HunerDataset):
    """
    HUNER version of the CellFinder corpus containing only cell line annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_cellline"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        data = CELL_FINDER.download_and_prepare(data_dir)
        data = filter_and_map_entities(data, {"CellLine": CELL_LINE_TAG})

        return data


class HUNER_SPECIES_CELL_FINDER(HunerDataset):
    """
    HUNER version of the CellFinder corpus containing only species annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_species"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        data = CELL_FINDER.download_and_prepare(data_dir)
        data = filter_and_map_entities(data, {"Species": SPECIES_TAG})

        return data


class HUNER_GENE_CELL_FINDER(HunerDataset):
    """
    HUNER version of the CellFinder corpus containing only gene annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cellfinder_protein"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        data = CELL_FINDER.download_and_prepare(data_dir)
        data = filter_and_map_entities(data, {"GeneProtein": GENE_TAG})

        return data


class MIRNA(ColumnCorpus):
    """
    Original miRNA corpus.

    For further information see Bagewadi et al.:
    Detecting miRNA Mentions and Relations in Biomedical Literature
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param sentence_splitter: Implementation of :class:`SentenceSplitter` that segments
            a document into sentences and tokens, defaults to scispacy
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        sentence_separator = " "
        if sentence_splitter is None:
            sentence_separator = SENTENCE_TAG
            sentence_splitter = TagSentenceSplitter(
                tag=sentence_separator, tokenizer=SciSpacyTokenizer()
            )

        train_file = data_folder / f"{sentence_splitter.name}_train.conll"
        test_file = data_folder / f"{sentence_splitter.name}_test.conll"

        if not (train_file.exists() and test_file.exists()):
            download_folder = data_folder / "original"
            os.makedirs(str(download_folder), exist_ok=True)

            writer = CoNLLWriter(sentence_splitter=sentence_splitter)

            train_corpus = self.download_and_prepare_train(
                download_folder, sentence_separator
            )
            writer.write_to_conll(train_corpus, train_file)

            test_corpus = self.download_and_prepare_test(
                download_folder, sentence_separator
            )
            writer.write_to_conll(test_corpus, test_file)

        super(MIRNA, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )

    @classmethod
    def download_and_prepare_train(cls, data_folder: Path, sentence_separator: str):
        data_url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-Train-Corpus.xml"
        data_path = cached_path(data_url, data_folder)

        return cls.parse_file(data_path, "train", sentence_separator)

    @classmethod
    def download_and_prepare_test(cls, data_folder: Path, sentence_separator: str):
        data_url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-Test-Corpus.xml"
        data_path = cached_path(data_url, data_folder)

        return cls.parse_file(data_path, "test", sentence_separator)

    @classmethod
    def parse_file(
        cls, input_file: Path, split: str, sentence_separator: str
    ) -> InternalBioNerDataset:
        tree = etree.parse(str(input_file))

        documents = {}
        entities_per_document = {}

        for document in tree.xpath(".//document"):
            document_id = document.get("id") + "-" + split
            entities = []

            document_text = ""
            for sentence in document.xpath(".//sentence"):
                if document_text:
                    document_text += sentence_separator

                sentence_offset = len(document_text)
                document_text += sentence.get("text")

                for entity in sentence.xpath(".//entity"):
                    start, end = entity.get("charOffset").split("-")
                    entities.append(
                        Entity(
                            (
                                sentence_offset + int(start),
                                sentence_offset + int(end) + 1,
                            ),
                            entity.get("type"),
                        )
                    )

            documents[document_id] = document_text
            entities_per_document[document_id] = entities

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )
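

# Shape of the miRNA corpus XML consumed by parse_file (sketch; only the
# attributes read above are shown): sentence text is an attribute, and
# charOffset holds inclusive character positions relative to the sentence,
# hence the "+ 1" when building the half-open Entity span.
#
#   <document id="miRNA-corp-doc-42">
#     <sentence text="miR-21 regulates PTEN.">
#       <entity charOffset="17-20" type="Genes/Proteins"/>
#     </sentence>
#   </document>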


class HunerMiRNAHelper(object):
    @staticmethod
    def get_mirna_subset(
        dataset: InternalBioNerDataset, split_url: str, split_dir: Path
    ):
        split_file = cached_path(split_url, split_dir)

        with split_file.open(encoding="utf8") as f:
            ids = [l.strip() for l in f if l.strip()]
            ids = [id + "-train" for id in ids] + [id + "-test" for id in ids]
            ids = sorted(id_ for id_ in ids if id_ in dataset.documents)

        return InternalBioNerDataset(
            documents={k: dataset.documents[k] for k in ids},
            entities_per_document={k: dataset.entities_per_document[k] for k in ids},
        )


class HUNER_GENE_MIRNA(HunerDataset):
    """
    HUNER version of the miRNA corpus containing protein / gene annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA"

    def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path):
        # The HUNER split files contain no information on whether a given id originates
        # from the train or test file of the original corpus - so we have to adapt the
        # corpus splitting here
        return HunerMiRNAHelper.get_mirna_subset(
            dataset, f"{self.split_url()}.{split}", split_dir
        )

    def get_corpus_sentence_splitter(self):
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        download_folder = data_dir / "original"
        os.makedirs(str(download_folder), exist_ok=True)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator)
        train_data = filter_and_map_entities(train_data, {"Genes/Proteins": GENE_TAG})

        test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator)
        test_data = filter_and_map_entities(test_data, {"Genes/Proteins": GENE_TAG})

        return merge_datasets([train_data, test_data])


class HUNER_SPECIES_MIRNA(HunerDataset):
    """
    HUNER version of the miRNA corpus containing species annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA"

    def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path):
        # The HUNER split files contain no information on whether a given id originates
        # from the train or test file of the original corpus - so we have to adapt the
        # corpus splitting here
        return HunerMiRNAHelper.get_mirna_subset(
            dataset, f"{self.split_url()}.{split}", split_dir
        )

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        download_folder = data_dir / "original"
        os.makedirs(str(download_folder), exist_ok=True)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator)
        train_data = filter_and_map_entities(train_data, {"Species": SPECIES_TAG})

        test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator)
        test_data = filter_and_map_entities(test_data, {"Species": SPECIES_TAG})

        return merge_datasets([train_data, test_data])


class HUNER_DISEASE_MIRNA(HunerDataset):
    """
    HUNER version of the miRNA corpus containing disease annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/miRNA"

    def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path):
        # The HUNER split files contain no information on whether a given id originates
        # from the train or test file of the original corpus - so we have to adapt the
        # corpus splitting here
        return HunerMiRNAHelper.get_mirna_subset(
            dataset, f"{self.split_url()}.{split}", split_dir
        )

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        download_folder = data_dir / "original"
        os.makedirs(str(download_folder), exist_ok=True)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        train_data = MIRNA.download_and_prepare_train(download_folder, sentence_separator)
        train_data = filter_and_map_entities(train_data, {"Diseases": DISEASE_TAG})

        test_data = MIRNA.download_and_prepare_test(download_folder, sentence_separator)
        test_data = filter_and_map_entities(test_data, {"Diseases": DISEASE_TAG})

        return merge_datasets([train_data, test_data])


class KaewphanCorpusHelper:
    """Helper class for the corpora from Kaewphan et al., i.e. CLL and Gellus"""

    @staticmethod
    def download_cll_dataset(data_folder: Path):
        data_url = "http://bionlp-www.utu.fi/cell-lines/CLL_corpus.tar.gz"
        data_path = cached_path(data_url, data_folder)
        unpack_file(data_path, data_folder)

    @staticmethod
    def prepare_and_save_dataset(nersuite_folder: Path, output_file: Path):
        writer = open(str(output_file), "w", encoding="utf8")
        out_newline = False

        for file in os.listdir(str(nersuite_folder)):
            if not file.endswith(".nersuite"):
                continue

            annotations = []
            with open(os.path.join(str(nersuite_folder), file), "r", encoding="utf8") as reader:
                for line in reader.readlines():
                    columns = line.split("\t")
                    annotations.append(columns[:4])

            num_annotations = len(annotations)
            for i, annotation in enumerate(annotations):
                if len(annotation) == 1:
                    assert annotation[0] == "\n"
                    if not out_newline:
                        writer.write("\n")
                    out_newline = True
                    continue

                has_whitespace = "+"

                next_annotation = (
                    annotations[i + 1]
                    if (i + 1) < num_annotations and len(annotations[i + 1]) > 1
                    else None
                )
                if next_annotation and next_annotation[1] == annotation[2]:
                    has_whitespace = "-"

                writer.write(
                    " ".join([annotation[3], annotation[0], has_whitespace]) + "\n"
                )
                out_newline = False

            if not out_newline:
                writer.write("\n")
                out_newline = True

        writer.close()

    @staticmethod
    def download_gellus_dataset(data_folder: Path):
        data_url = "http://bionlp-www.utu.fi/cell-lines/Gellus_corpus.tar.gz"
        data_path = cached_path(data_url, data_folder)
        unpack_file(data_path, data_folder)

    @staticmethod
    def read_dataset(
        nersuite_folder: Path, sentence_separator: str
    ) -> InternalBioNerDataset:
        documents = {}
        entities_per_document = {}
        for file in os.listdir(str(nersuite_folder)):
            if not file.endswith(".nersuite"):
                continue

            document_id = file.replace(".nersuite", "")

            with open(os.path.join(str(nersuite_folder), file), "r", encoding="utf8") as reader:
                document_text = ""
                entities = []

                entity_start = None
                entity_type = None

                for line in reader.readlines():
                    line = line.strip()
                    if line:
                        columns = line.split("\t")
                        tag = columns[0]
                        token = columns[3]
                        if tag.startswith("B-"):
                            if entity_type is not None:
                                entities.append(
                                    Entity(
                                        (entity_start, len(document_text)), entity_type
                                    )
                                )

                            entity_start = (
                                len(document_text) + 1 if document_text else 0
                            )
                            entity_type = tag[2:]

                        elif tag == "O" and entity_type is not None:
                            entities.append(
                                Entity((entity_start, len(document_text)), entity_type)
                            )
                            entity_type = None

                        document_text = (
                            document_text + " " + token if document_text else token
                        )
                    else:
                        # Close entities that are still open at a sentence boundary
                        if entity_type is not None:
                            entities.append(
                                Entity((entity_start, len(document_text)), entity_type)
                            )
                        document_text += sentence_separator

                if document_text.endswith(sentence_separator):
                    document_text = document_text[: -len(sentence_separator)]

                documents[document_id] = document_text
                entities_per_document[document_id] = entities

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )
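

# Shape of a .nersuite file as consumed above (sketch): tab-separated columns
# with the IOB tag first and the token in the fourth column; the character
# offsets in columns 2-3 let prepare_and_save_dataset detect missing
# whitespace between adjacent tokens (next start == current end).
#
#   B-CL<TAB>10<TAB>14<TAB>HeLa
#   O<TAB>15<TAB>20<TAB>cells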


class CLL(ColumnCorpus):
    """
    Original CLL corpus containing cell line annotations.

    For further information, see Kaewphan et al.:
    Cell line name recognition in support of the identification of synthetic lethality in cancer from text
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/
    """

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        train_file = data_folder / "train.conll"

        if not train_file.exists():
            KaewphanCorpusHelper.download_cll_dataset(data_folder)

            nersuite_folder = data_folder / "CLL-1.0.2" / "nersuite"
            KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_folder, train_file)

        super(CLL, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )


class HUNER_CELL_LINE_CLL(HunerDataset):
    """
    HUNER version of the CLL corpus containing cell line annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cll"

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        KaewphanCorpusHelper.download_cll_dataset(data_dir)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        nersuite_folder = data_dir / "CLL-1.0.2" / "nersuite"
        orig_dataset = KaewphanCorpusHelper.read_dataset(nersuite_folder, sentence_separator)

        return filter_and_map_entities(orig_dataset, {"CL": CELL_LINE_TAG})


class GELLUS(ColumnCorpus):
    """
    Original Gellus corpus containing cell line annotations.

    For further information, see Kaewphan et al.:
    Cell line name recognition in support of the identification of synthetic lethality in cancer from text
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/
    """

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        train_file = data_folder / "train.conll"
        dev_file = data_folder / "dev.conll"
        test_file = data_folder / "test.conll"

        if not (train_file.exists() and dev_file.exists() and test_file.exists()):
            KaewphanCorpusHelper.download_gellus_dataset(data_folder)

            nersuite_train = data_folder / "GELLUS-1.0.3" / "nersuite" / "train"
            KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_train, train_file)

            nersuite_dev = data_folder / "GELLUS-1.0.3" / "nersuite" / "devel"
            KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_dev, dev_file)

            nersuite_test = data_folder / "GELLUS-1.0.3" / "nersuite" / "test"
            KaewphanCorpusHelper.prepare_and_save_dataset(nersuite_test, test_file)

        super(GELLUS, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )


class HUNER_CELL_LINE_GELLUS(HunerDataset):
    """
    HUNER version of the Gellus corpus containing cell line annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/gellus"

    def get_corpus_sentence_splitter(self) -> SentenceSplitter:
        return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        KaewphanCorpusHelper.download_gellus_dataset(data_dir)

        sentence_separator = " "
        if isinstance(self.sentence_splitter, TagSentenceSplitter):
            sentence_separator = self.sentence_splitter.tag

        splits = []
        for folder in ["train", "devel", "test"]:
            nersuite_folder = data_dir / "GELLUS-1.0.3" / "nersuite" / folder
            splits.append(
                KaewphanCorpusHelper.read_dataset(nersuite_folder, sentence_separator)
            )

        full_dataset = merge_datasets(splits)
        return filter_and_map_entities(full_dataset, {"Cell-line-name": CELL_LINE_TAG})


class LOCTEXT(ColumnCorpus):
    """
    Original LOCTEXT corpus containing species annotations.

    For further information see Cejuela et al.:
    LocText: relation extraction of protein localizations to assist database curation
    https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2021-9
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param sentence_splitter: Custom implementation of :class:`SentenceSplitter`
            that segments a document into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        if sentence_splitter is None:
            sentence_splitter = SciSpacySentenceSplitter()

        train_file = data_folder / f"{sentence_splitter.name}_train.conll"

        if not train_file.exists():
            self.download_dataset(data_folder)
            full_dataset = self.parse_dataset(data_folder)

            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
            conll_writer.write_to_conll(full_dataset, train_file)

        super(LOCTEXT, self).__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )

    @staticmethod
    def download_dataset(data_dir: Path):
        data_url = "http://pubannotation.org/downloads/LocText-annotations.tgz"
        data_path = cached_path(data_url, data_dir)
        unpack_file(data_path, data_dir)

    @staticmethod
    def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
        loctext_json_folder = data_dir / "LocText"

        entity_type_mapping = {
            "go": "protein",
            "uniprot": "protein",
            "taxonomy": "species",
        }

        documents = {}
        entities_per_document = {}

        for file in os.listdir(str(loctext_json_folder)):
            document_id = file[: -len(".json")] if file.endswith(".json") else file
            entities = []

            with open(os.path.join(str(loctext_json_folder), file), "r", encoding="utf8") as f_in:
                data = json.load(f_in)
                document_text = data["text"].strip()
                document_text = document_text.replace("\n", " ")

                if "denotations" in data.keys():
                    for ann in data["denotations"]:
                        start = int(ann["span"]["begin"])
                        end = int(ann["span"]["end"])

                        original_entity_type = ann["obj"].split(":")[0]
                        if original_entity_type not in entity_type_mapping:
                            continue

                        entity_type = entity_type_mapping[original_entity_type]
                        entities.append(Entity((start, end), entity_type))

                documents[document_id] = document_text
                entities_per_document[document_id] = entities

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )
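

# Shape of a PubAnnotation JSON file as consumed by parse_dataset (sketch;
# see http://pubannotation.org/): the "obj" prefix before ":" selects the
# entity type via entity_type_mapping above.
#
#   {
#     "text": "p53 localizes to the nucleus in human cells.",
#     "denotations": [
#       {"span": {"begin": 0, "end": 3}, "obj": "uniprot:P04637"},
#       {"span": {"begin": 32, "end": 37}, "obj": "taxonomy:9606"}
#     ]
#   }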


class HUNER_SPECIES_LOCTEXT(HunerDataset):
    """
    HUNER version of the LocText corpus containing species annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        LOCTEXT.download_dataset(data_dir)
        dataset = LOCTEXT.parse_dataset(data_dir)

        return filter_and_map_entities(dataset, {"species": SPECIES_TAG})


class HUNER_GENE_LOCTEXT(HunerDataset):
    """
    HUNER version of the LocText corpus containing protein annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/loctext"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        LOCTEXT.download_dataset(data_dir)
        dataset = LOCTEXT.parse_dataset(data_dir)

        return filter_and_map_entities(dataset, {"protein": GENE_TAG})
1753class CHEMDNER(ColumnCorpus):
1754 """
1755 Original corpus of the CHEMDNER shared task.
1757 For further information see Krallinger et al.:
1758 The CHEMDNER corpus of chemicals and drugs and its annotation principles
1759 https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2
1760 """
1762 default_dir = flair.cache_root / "datasets" / "CHEMDNER"
1764 def __init__(
1765 self,
1766 base_path: Union[str, Path] = None,
1767 in_memory: bool = True,
1768 sentence_splitter: SentenceSplitter = None,
1769 ):
1770 """
1771 :param base_path: Path to the corpus on your machine
1772 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1773 :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
1774 segments documents into sentences and tokens
1775 """
1777 if type(base_path) == str:
1778 base_path: Path = Path(base_path)
1780 # column format
1781 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
1783 # this dataset name
1784 dataset_name = self.__class__.__name__.lower()
1786 # default dataset folder is the cache root
1787 if not base_path:
1788 # download file is huge => make default_dir visible so that derivative
1789 # corpora can all use the same download file
1790 data_folder = self.default_dir
1791 else:
1792 data_folder = base_path / dataset_name
1794 if sentence_splitter is None:
1795 sentence_splitter = SciSpacySentenceSplitter()
1797 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
1798 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
1799 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
1801 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
1802 download_dir = data_folder / "original"
1803 os.makedirs(download_dir, exist_ok=True)
1804 self.download_dataset(download_dir)
1806 train_data = bioc_to_internal(
1807 download_dir / "chemdner_corpus" / "training.bioc.xml"
1808 )
1809 dev_data = bioc_to_internal(
1810 download_dir / "chemdner_corpus" / "development.bioc.xml"
1811 )
1812 test_data = bioc_to_internal(
1813 download_dir / "chemdner_corpus" / "evaluation.bioc.xml"
1814 )
1816 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
1818 conll_writer.write_to_conll(train_data, train_file)
1819 conll_writer.write_to_conll(dev_data, dev_file)
1820 conll_writer.write_to_conll(test_data, test_file)
1822 super(CHEMDNER, self).__init__(
1823 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
1824 )
1826 @staticmethod
1827 def download_dataset(data_dir: Path):
1828 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2014/chemdner_corpus.tar.gz"
1829 data_path = cached_path(data_url, data_dir)
1830 unpack_file(data_path, data_dir)
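
# Hedged usage sketch (not part of the original file): constructing the
# corpus with an explicit sentence splitter. Note that the first call
# downloads the full CHEMDNER archive, so this is for illustration only.
def _demo_chemdner_usage():
    corpus = CHEMDNER(sentence_splitter=SciSpacySentenceSplitter())
    print(corpus)  # summary with train/dev/test sizes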
1833class HUNER_CHEMICAL_CHEMDNER(HunerDataset):
1834 """
1835 HUNER version of the CHEMDNER corpus containing chemical annotations.
1836 """
1838 def __init__(self, *args, download_folder=None, **kwargs):
1839 self.download_folder = download_folder or CHEMDNER.default_dir / "original"
1840 super().__init__(*args, **kwargs)
1842 @staticmethod
1843 def split_url() -> str:
1844 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner"
1846 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
1847 os.makedirs(str(self.download_folder), exist_ok=True)
1848 CHEMDNER.download_dataset(self.download_folder)
1849 train_data = bioc_to_internal(
1850 self.download_folder / "chemdner_corpus" / "training.bioc.xml"
1851 )
1852 dev_data = bioc_to_internal(
1853 self.download_folder / "chemdner_corpus" / "development.bioc.xml"
1854 )
1855 test_data = bioc_to_internal(
1856 self.download_folder / "chemdner_corpus" / "evaluation.bioc.xml"
1857 )
1858 all_data = merge_datasets([train_data, dev_data, test_data])
1859 all_data = filter_and_map_entities(
1860 all_data,
1861 {
1862 "ABBREVIATION": CHEMICAL_TAG,
1863 "FAMILY": CHEMICAL_TAG,
1864 "FORMULA": CHEMICAL_TAG,
1865 "IDENTIFIER": CHEMICAL_TAG,
1866 "MULTIPLE": CHEMICAL_TAG,
1867 "NO_CLASS": CHEMICAL_TAG,
1868 "SYSTEMATIC": CHEMICAL_TAG,
1869 "TRIVIAL": CHEMICAL_TAG,
1870 },
1871 )
1873 return all_data
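
# Illustrative sketch (added for exposition): what the mapping above does.
# Every CHEMDNER mention class collapses onto the single CHEMICAL_TAG and,
# as the calls in this module suggest, unmapped types are dropped by
# filter_and_map_entities. Document text and spans below are invented.
def _demo_chemdner_entity_mapping():
    dataset = InternalBioNerDataset(
        documents={"d1": "Aspirin and BRCA1"},
        entities_per_document={
            "d1": [Entity((0, 7), "TRIVIAL"), Entity((12, 17), "GENE")]
        },
    )
    mapped = filter_and_map_entities(dataset, {"TRIVIAL": CHEMICAL_TAG})
    print(mapped.entities_per_document["d1"])  # [Chemical(0,7)]; the GENE mention is dropped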
1876class IEPA(ColumnCorpus):
1877 """
1878 IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/
1879 (the original download location now returns HTTP 404)
1881 For further information see Ding, Berleant, Nettleton, Wurtele:
1882 Mining MEDLINE: abstracts, sentences, or phrases?
1883 https://www.ncbi.nlm.nih.gov/pubmed/11928487
1884 """
1886 def __init__(
1887 self,
1888 base_path: Union[str, Path] = None,
1889 in_memory: bool = True,
1890 tokenizer: Tokenizer = None,
1891 ):
1892 """
1893 :param base_path: Path to the corpus on your machine
1894 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1895 :param tokenizer: Custom implementation of :class:`Tokenizer` which
1896 segments sentences into tokens (default :class:`SciSpacyTokenizer`)
1897 """
1899 if type(base_path) == str:
1900 base_path: Path = Path(base_path)
1902 # column format
1903 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
1905 # this dataset name
1906 dataset_name = self.__class__.__name__.lower()
1908 # default dataset folder is the cache root
1909 if not base_path:
1910 base_path = flair.cache_root / "datasets"
1911 data_folder = base_path / dataset_name
1913 if tokenizer is None:
1914 tokenizer = SciSpacyTokenizer()
1916 sentence_splitter = NewlineSentenceSplitter(tokenizer=tokenizer)
1918 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
1920 if not (train_file.exists()):
1921 download_dir = data_folder / "original"
1922 os.makedirs(download_dir, exist_ok=True)
1923 self.download_dataset(download_dir)
1925 all_data = bioc_to_internal(download_dir / "iepa_bioc.xml")
1927 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
1928 conll_writer.write_to_conll(all_data, train_file)
1930 super(IEPA, self).__init__(
1931 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
1932 )
1934 @staticmethod
1935 def download_dataset(data_dir: Path):
1936 data_url = (
1937 "http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/iepa_bioc.xml.zip"
1938 )
1939 data_path = cached_path(data_url, data_dir)
1940 unpack_file(data_path, data_dir)
1943class HUNER_GENE_IEPA(HunerDataset):
1944 """
1945 HUNER version of the IEPA corpus containing gene annotations.
1946 """
1948 def __init__(self, *args, **kwargs):
1949 super().__init__(*args, **kwargs)
1951 @staticmethod
1952 def split_url() -> str:
1953 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/iepa"
1955 def get_corpus_sentence_splitter(self) -> SentenceSplitter:
1956 return NewlineSentenceSplitter(tokenizer=SciSpacyTokenizer())
1958 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
1959 os.makedirs(str(data_dir), exist_ok=True)
1960 IEPA.download_dataset(data_dir)
1962 all_data = bioc_to_internal(data_dir / "iepa_bioc.xml")
1963 all_data = filter_and_map_entities(all_data, {"Protein": GENE_TAG})
1965 return all_data
1968class LINNEAUS(ColumnCorpus):
1969 """
1970 Original LINNAEUS corpus containing species annotations.
1972 For further information see Gerner et al.:
1973 LINNAEUS: a species name identification system for biomedical literature
1974 https://www.ncbi.nlm.nih.gov/pubmed/20149233
1975 """
1977 def __init__(
1978 self,
1979 base_path: Union[str, Path] = None,
1980 in_memory: bool = True,
1981 tokenizer: Tokenizer = None,
1982 ):
1983 """
1984 :param base_path: Path to the corpus on your machine
1985 :param in_memory: If True, keeps dataset in memory giving speedups in training.
1986 :param tokenizer: Custom implementation of :class:`Tokenizer` which segments
1987 sentences into tokens (default :class:`SciSpacyTokenizer`)
1988 """
1990 if type(base_path) == str:
1991 base_path: Path = Path(base_path)
1993 # column format
1994 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
1996 # this dataset name
1997 dataset_name = self.__class__.__name__.lower()
1999 # default dataset folder is the cache root
2000 if not base_path:
2001 base_path = flair.cache_root / "datasets"
2002 data_folder = base_path / dataset_name
2004 if tokenizer is None:
2005 tokenizer = SciSpacyTokenizer()
2007 sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer)
2009 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2011 if not (train_file.exists()):
2012 dataset = self.download_and_parse_dataset(data_folder)
2014 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2015 conll_writer.write_to_conll(dataset, train_file)
2017 super(LINNEAUS, self).__init__(
2018 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2019 )
2021 @staticmethod
2022 def download_and_parse_dataset(data_dir: Path):
2023 data_url = "https://iweb.dl.sourceforge.net/project/linnaeus/Corpora/manual-corpus-species-1.0.tar.gz"
2024 data_path = cached_path(data_url, data_dir)
2025 unpack_file(data_path, data_dir)
2027 documents = {}
2028 entities_per_document = defaultdict(list)
2030 # Read texts
2031 texts_directory = data_dir / "manual-corpus-species-1.0" / "txt"
2032 for filename in os.listdir(str(texts_directory)):
2033 document_id = os.path.splitext(filename)[0]  # str.strip(".txt") removes characters, not the suffix
2035 with open(os.path.join(str(texts_directory), filename), "r", encoding="utf8") as file:
2036 documents[document_id] = file.read().strip()
2038 # Read annotations
2039 tag_file = data_dir / "manual-corpus-species-1.0" / "filtered_tags.tsv"
2040 with open(str(tag_file), "r", encoding="utf8") as file:
2041 next(file) # Ignore header row
2043 for line in file:
2044 if not line:
2045 continue
2047 document_id, start, end, text = line.strip().split("\t")[1:5]
2048 start, end = int(start), int(end)
2050 entities_per_document[document_id].append(
2051 Entity((start, end), SPECIES_TAG)
2052 )
2054 document_text = documents[document_id]
2055 if document_text[start:end] != text:
2056 raise AssertionError(f"Annotation mismatch in document {document_id}: expected {text!r}")
2058 return InternalBioNerDataset(
2059 documents=documents, entities_per_document=entities_per_document
2060 )
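
# Illustrative sketch (added for exposition): one filtered_tags.tsv row in
# the column layout the parser above assumes (document id, start, end and
# surface text in columns 1-4). The row itself is invented.
def _demo_linnaeus_tag_row():
    line = "species:9606\tpmcA1234\t10\t22\thomo sapiens\tcomment"
    document_id, start, end, text = line.strip().split("\t")[1:5]
    print(document_id, int(start), int(end), text)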
2063class HUNER_SPECIES_LINNEAUS(HunerDataset):
2064 """
2065 HUNER version of the LINNAEUS corpus containing species annotations.
2066 """
2068 def __init__(self, *args, **kwargs):
2069 super().__init__(*args, **kwargs)
2071 @staticmethod
2072 def split_url() -> str:
2073 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/linneaus"
2075 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2076 return LINNEAUS.download_and_parse_dataset(data_dir)
2079class CDR(ColumnCorpus):
2080 """
2081 CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR-Corpus
2083 For further information see Li et al.:
2084 BioCreative V CDR task corpus: a resource for chemical disease relation extraction
2085 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/
2086 """
2088 def __init__(
2089 self,
2090 base_path: Union[str, Path] = None,
2091 in_memory: bool = True,
2092 sentence_splitter: SentenceSplitter = None,
2093 ):
2094 """
2095 :param base_path: Path to the corpus on your machine
2096 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2097 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
2098 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2099 """
2101 if type(base_path) == str:
2102 base_path: Path = Path(base_path)
2104 # column format
2105 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2107 # this dataset name
2108 dataset_name = self.__class__.__name__.lower()
2110 # default dataset folder is the cache root
2111 if not base_path:
2112 base_path = flair.cache_root / "datasets"
2113 data_folder = base_path / dataset_name
2115 if sentence_splitter is None:
2116 sentence_splitter = SciSpacySentenceSplitter()
2118 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2119 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
2120 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
2122 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
2123 download_dir = data_folder / "original"
2124 os.makedirs(download_dir, exist_ok=True)
2125 self.download_dataset(download_dir)
2127 train_data = bioc_to_internal(
2128 download_dir
2129 / "CDR_Data"
2130 / "CDR.Corpus.v010516"
2131 / "CDR_TrainingSet.BioC.xml"
2132 )
2133 dev_data = bioc_to_internal(
2134 download_dir
2135 / "CDR_Data"
2136 / "CDR.Corpus.v010516"
2137 / "CDR_DevelopmentSet.BioC.xml"
2138 )
2139 test_data = bioc_to_internal(
2140 download_dir
2141 / "CDR_Data"
2142 / "CDR.Corpus.v010516"
2143 / "CDR_TestSet.BioC.xml"
2144 )
2146 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2147 conll_writer.write_to_conll(train_data, train_file)
2148 conll_writer.write_to_conll(dev_data, dev_file)
2149 conll_writer.write_to_conll(test_data, test_file)
2151 super(CDR, self).__init__(
2152 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2153 )
2155 @staticmethod
2156 def download_dataset(data_dir: Path):
2157 data_url = (
2158 "https://github.com/JHnlp/BioCreative-V-CDR-Corpus/raw/master/CDR_Data.zip"
2159 )
2160 data_path = cached_path(data_url, data_dir)
2161 unpack_file(data_path, data_dir)
2164class HUNER_DISEASE_CDR(HunerDataset):
2165 """
2166 HUNER version of the CDR corpus containing disease annotations.
2167 """
2169 def __init__(self, *args, **kwargs):
2170 super().__init__(*args, **kwargs)
2172 @staticmethod
2173 def split_url() -> str:
2174 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRDisease"
2176 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2177 os.makedirs(str(data_dir), exist_ok=True)
2178 CDR.download_dataset(data_dir)
2179 train_data = bioc_to_internal(
2180 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml"
2181 )
2182 dev_data = bioc_to_internal(
2183 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml"
2184 )
2185 test_data = bioc_to_internal(
2186 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml"
2187 )
2188 all_data = merge_datasets([train_data, dev_data, test_data])
2189 all_data = filter_and_map_entities(all_data, {"Disease": DISEASE_TAG})
2191 return all_data
2194class HUNER_CHEMICAL_CDR(HunerDataset):
2195 """
2196 HUNER version of the CDR corpus containing chemical annotations.
2197 """
2199 def __init__(self, *args, **kwargs):
2200 super().__init__(*args, **kwargs)
2202 @staticmethod
2203 def split_url() -> str:
2204 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/CDRChem"
2206 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2207 os.makedirs(str(data_dir), exist_ok=True)
2208 CDR.download_dataset(data_dir)
2209 train_data = bioc_to_internal(
2210 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TrainingSet.BioC.xml"
2211 )
2212 dev_data = bioc_to_internal(
2213 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_DevelopmentSet.BioC.xml"
2214 )
2215 test_data = bioc_to_internal(
2216 data_dir / "CDR_Data" / "CDR.Corpus.v010516" / "CDR_TestSet.BioC.xml"
2217 )
2218 all_data = merge_datasets([train_data, dev_data, test_data])
2219 all_data = filter_and_map_entities(all_data, {"Chemical": CHEMICAL_TAG})
2221 return all_data
2224class VARIOME(ColumnCorpus):
2225 """
2226 Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip
2228 For further information see Verspoor et al.:
2229 Annotating the biomedical literature for the human variome
2230 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3676157/
2231 """
2233 def __init__(
2234 self,
2235 base_path: Union[str, Path] = None,
2236 in_memory: bool = True,
2237 sentence_splitter: SentenceSplitter = None,
2238 ):
2239 """
2240 :param base_path: Path to the corpus on your machine
2241 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2242 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
2243 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2244 """
2246 if type(base_path) == str:
2247 base_path: Path = Path(base_path)
2249 # column format
2250 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2252 # this dataset name
2253 dataset_name = self.__class__.__name__.lower()
2255 # default dataset folder is the cache root
2256 if not base_path:
2257 base_path = flair.cache_root / "datasets"
2258 data_folder = base_path / dataset_name
2260 if sentence_splitter is None:
2261 sentence_splitter = SciSpacySentenceSplitter()
2263 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2265 if not (train_file.exists()):
2266 download_dir = data_folder / "original"
2267 os.makedirs(download_dir, exist_ok=True)
2268 self.download_dataset(download_dir)
2270 all_data = self.parse_corpus(download_dir / "hvp_bioc.xml")
2272 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2273 conll_writer.write_to_conll(all_data, train_file)
2275 super(VARIOME, self).__init__(
2276 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2277 )
2279 @staticmethod
2280 def download_dataset(data_dir: Path):
2281 data_url = (
2282 "http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip"
2283 )
2284 data_path = cached_path(data_url, data_dir)
2285 unpack_file(data_path, data_dir)
2287 @staticmethod
2288 def parse_corpus(corpus_xml: Path) -> InternalBioNerDataset:
2289 corpus = bioc_to_internal(corpus_xml)
2291 cleaned_documents = {}
2292 cleaned_entities_per_document = {}
2294 for document_id, document_text in corpus.documents.items():  # avoid shadowing the built-in `id`
2295 entities = corpus.entities_per_document[document_id]
2296 original_length = len(document_text)
2298 text_cleaned = document_text.replace("** IGNORE LINE **\n", "")
2299 offset = original_length - len(text_cleaned)
2301 if offset != 0:
2302 new_entities = []
2303 for entity in entities:
2304 new_start = entity.char_span.start - offset
2305 new_end = entity.char_span.stop - offset
2307 new_entities.append(Entity((new_start, new_end), entity.type))
2309 orig_text = document_text[
2310 entity.char_span.start : entity.char_span.stop
2311 ]
2312 new_text = text_cleaned[new_start:new_end]
2313 assert orig_text == new_text
2315 entities = new_entities
2316 document_text = text_cleaned
2318 cleaned_documents[document_id] = document_text
2319 cleaned_entities_per_document[document_id] = entities
2321 return InternalBioNerDataset(
2322 documents=cleaned_documents,
2323 entities_per_document=cleaned_entities_per_document,
2324 )
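
# Illustrative sketch (added for exposition) of the offset arithmetic in
# parse_corpus above: removing the "** IGNORE LINE **\n" marker shifts the
# annotation spans left by the number of removed characters. The document
# text and span below are invented.
def _demo_variome_offset_shift():
    text = "** IGNORE LINE **\nBRCA1 mutation"
    cleaned = text.replace("** IGNORE LINE **\n", "")
    offset = len(text) - len(cleaned)  # 18 removed characters
    entity = Entity((18, 23), "gene")  # "BRCA1" in the original text
    new_start = entity.char_span.start - offset
    new_end = entity.char_span.stop - offset
    assert cleaned[new_start:new_end] == "BRCA1"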
2327class HUNER_GENE_VARIOME(HunerDataset):
2328 """
2329 HUNER version of the Variome corpus containing gene annotations.
2330 """
2332 def __init__(self, *args, **kwargs):
2333 super().__init__(*args, **kwargs)
2335 @staticmethod
2336 def split_url() -> str:
2337 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_gene"
2339 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2340 os.makedirs(str(data_dir), exist_ok=True)
2341 VARIOME.download_dataset(data_dir)
2342 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml")
2343 all_data = filter_and_map_entities(all_data, {"gene": GENE_TAG})
2345 return all_data
2348class HUNER_DISEASE_VARIOME(HunerDataset):
2349 """
2350 HUNER version of the Variome corpus containing disease annotations.
2351 """
2353 def __init__(self, *args, **kwargs):
2354 super().__init__(*args, **kwargs)
2356 @staticmethod
2357 def split_url() -> str:
2358 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_disease"
2360 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2361 os.makedirs(str(data_dir), exist_ok=True)
2362 VARIOME.download_dataset(data_dir)
2363 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml")
2364 all_data = filter_and_map_entities(
2365 all_data, {"Disorder": DISEASE_TAG, "disease": DISEASE_TAG}
2366 )
2368 return all_data
2371class HUNER_SPECIES_VARIOME(HunerDataset):
2372 """
2373 HUNER version of the Variome corpus containing species annotations.
2374 """
2376 def __init__(self, *args, **kwargs):
2377 super().__init__(*args, **kwargs)
2379 @staticmethod
2380 def split_url() -> str:
2381 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/variome_species"
2383 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2384 os.makedirs(str(data_dir), exist_ok=True)
2385 VARIOME.download_dataset(data_dir)
2386 all_data = VARIOME.parse_corpus(data_dir / "hvp_bioc.xml")
2387 all_data = filter_and_map_entities(all_data, {"Living_Beings": SPECIES_TAG})
2389 return all_data
2392class NCBI_DISEASE(ColumnCorpus):
2393 """
2394 Original NCBI disease corpus containing disease annotations.
2396 For further information see Dogan et al.:
2397 NCBI disease corpus: a resource for disease name recognition and concept normalization
2398 https://www.ncbi.nlm.nih.gov/pubmed/24393765
2399 """
2401 def __init__(
2402 self,
2403 base_path: Union[str, Path] = None,
2404 in_memory: bool = True,
2405 sentence_splitter: SentenceSplitter = None,
2406 ):
2407 """
2408 :param base_path: Path to the corpus on your machine
2409 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2410 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
2411 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2412 """
2414 if type(base_path) == str:
2415 base_path: Path = Path(base_path)
2417 # column format
2418 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2420 # this dataset name
2421 dataset_name = self.__class__.__name__.lower()
2423 # default dataset folder is the cache root
2424 if not base_path:
2425 base_path = flair.cache_root / "datasets"
2426 data_folder = base_path / dataset_name
2428 if sentence_splitter is None:
2429 sentence_splitter = SciSpacySentenceSplitter()
2431 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2432 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
2433 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
2435 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
2436 orig_folder = self.download_corpus(data_folder)
2438 train_data = self.parse_input_file(orig_folder / "NCBItrainset_patched.txt")
2439 dev_data = self.parse_input_file(orig_folder / "NCBIdevelopset_corpus.txt")
2440 test_data = self.parse_input_file(orig_folder / "NCBItestset_corpus.txt")
2442 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2443 conll_writer.write_to_conll(train_data, train_file)
2444 conll_writer.write_to_conll(dev_data, dev_file)
2445 conll_writer.write_to_conll(test_data, test_file)
2447 super(NCBI_DISEASE, self).__init__(
2448 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2449 )
2451 @classmethod
2452 def download_corpus(cls, data_dir: Path) -> Path:
2453 original_folder = data_dir / "original"
2454 os.makedirs(str(original_folder), exist_ok=True)
2456 data_urls = [
2457 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBItrainset_corpus.zip",
2458 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBIdevelopset_corpus.zip",
2459 "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBItestset_corpus.zip",
2460 ]
2462 for url in data_urls:
2463 data_path = cached_path(url, original_folder)
2464 unpack_file(data_path, original_folder)
2466 # We need to apply a patch to correct the original training file
2467 orig_train_file = original_folder / "NCBItrainset_corpus.txt"
2468 patched_train_file = original_folder / "NCBItrainset_patched.txt"
2469 cls.patch_training_file(orig_train_file, patched_train_file)
2471 return original_folder
2473 @staticmethod
2474 def patch_training_file(orig_train_file: Path, patched_file: Path):
2475 patch_lines = {
2476 3249: '10923035\t711\t761\tgeneralized epilepsy and febrile seizures " plus "\tSpecificDisease\tD004829+D003294\n'
2477 }
2478 with open(str(orig_train_file), "r", encoding="utf8") as in_file:  # do not shadow the built-in `input`
2479 with open(str(patched_file), "w", encoding="utf8") as output:
2480 line_no = 1
2482 for line in in_file:
2483 output.write(
2484 patch_lines[line_no] if line_no in patch_lines else line
2485 )
2486 line_no += 1
2488 @staticmethod
2489 def parse_input_file(input_file: Path):
2490 documents = {}
2491 entities_per_document = {}
2493 with open(str(input_file), "r", encoding="utf8") as file:
2494 document_id = None
2495 document_text = None
2496 entities = []
2498 c = 1
2499 for line in file:
2500 line = line.strip()
2501 if not line:
2502 if document_id and document_text:
2503 documents[document_id] = document_text
2504 entities_per_document[document_id] = entities
2506 document_id, document_text, entities = None, None, []
2507 c = 1
2508 continue
2509 if c == 1:
2510 # Article title
2511 document_text = line.split("|")[2] + " "
2512 document_id = line.split("|")[0]
2513 elif c == 2:
2514 # Article abstract
2515 document_text += line.split("|")[2]
2516 else:
2517 # Entity annotations
2518 columns = line.split("\t")
2519 start = int(columns[1])
2520 end = int(columns[2])
2521 entity_text = columns[3]
2523 assert document_text[start:end] == entity_text
2524 entities.append(Entity((start, end), DISEASE_TAG))
2525 c += 1
2527 if c != 1 and document_id and document_text:
2528 documents[document_id] = document_text
2529 entities_per_document[document_id] = entities
2531 return InternalBioNerDataset(
2532 documents=documents, entities_per_document=entities_per_document
2533 )
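
# Illustrative sketch (added for exposition) of the PubTator-style record
# parsed above: line 1 is "pmid|t|title", line 2 is "pmid|a|abstract", and
# the following lines are tab-separated mentions with offsets into
# "title + ' ' + abstract". The record below is invented.
def _demo_ncbi_disease_record():
    title_line = "1234|t|Epilepsy study"
    abstract_line = "1234|a|We study epilepsy."
    mention_line = "1234\t24\t32\tepilepsy\tSpecificDisease\tD004827"
    document_text = title_line.split("|")[2] + " " + abstract_line.split("|")[2]
    columns = mention_line.split("\t")
    start, end = int(columns[1]), int(columns[2])
    assert document_text[start:end] == columns[3]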
2536class HUNER_DISEASE_NCBI(HunerDataset):
2537 """
2538 HUNER version of the NCBI corpus containing disease annotations.
2539 """
2541 def __init__(self, *args, **kwargs):
2542 super().__init__(*args, **kwargs)
2544 @staticmethod
2545 def split_url() -> str:
2546 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/ncbi"
2548 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2549 orig_folder = NCBI_DISEASE.download_corpus(data_dir)
2551 train_data = NCBI_DISEASE.parse_input_file(
2552 orig_folder / "NCBItrainset_patched.txt"
2553 )
2554 dev_data = NCBI_DISEASE.parse_input_file(
2555 orig_folder / "NCBIdevelopset_corpus.txt"
2556 )
2557 test_data = NCBI_DISEASE.parse_input_file(
2558 orig_folder / "NCBItestset_corpus.txt"
2559 )
2561 return merge_datasets([train_data, dev_data, test_data])
2564class ScaiCorpus(ColumnCorpus):
2565 """Base class to support the SCAI chemicals and disease corpora"""
2567 def __init__(
2568 self,
2569 base_path: Union[str, Path] = None,
2570 in_memory: bool = True,
2571 sentence_splitter: SentenceSplitter = None,
2572 ):
2573 """
2574 :param base_path: Path to the corpus on your machine
2575 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2576 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
2577 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2578 """
2580 if type(base_path) == str:
2581 base_path: Path = Path(base_path)
2583 # column format
2584 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2586 # this dataset name
2587 dataset_name = self.__class__.__name__.lower()
2589 # default dataset folder is the cache root
2590 if not base_path:
2591 base_path = flair.cache_root / "datasets"
2592 data_folder = base_path / dataset_name
2594 if sentence_splitter is None:
2595 sentence_splitter = SciSpacySentenceSplitter()
2597 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2599 if not (train_file.exists()):
2600 dataset_file = self.download_corpus(data_folder)
2601 train_data = self.parse_input_file(dataset_file)
2603 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2604 conll_writer.write_to_conll(train_data, train_file)
2606 super(ScaiCorpus, self).__init__(
2607 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2608 )
2610 def download_corpus(self, data_folder: Path) -> Path:
2611 raise NotImplementedError()
2613 @staticmethod
2614 def parse_input_file(input_file: Path):
2615 documents = {}
2616 entities_per_document = {}
2618 with open(str(input_file), "r", encoding="iso-8859-1") as file:
2619 document_id = None
2620 document_text = None
2621 entities = []
2622 entity_type, entity_start = None, None
2624 for line in file:
2625 line = line.strip()
2626 if not line:
2627 continue
2629 if line[:3] == "###":
2630 # Edge case: last token starts a new entity
2631 if entity_type is not None:
2632 entities.append(
2633 Entity((entity_start, len(document_text)), entity_type)
2634 )
2635 entity_type = None  # reset so the open entity does not leak into the next document
2636 if not (document_id is None and document_text is None):
2637 documents[document_id] = document_text
2638 entities_per_document[document_id] = entities
2640 document_id = line.strip("#").strip()
2641 document_text = None
2642 entities = []
2643 else:
2644 columns = line.strip().split("\t")
2645 token = columns[0].strip()
2646 tag = columns[4].strip().split("|")[1]
2648 if tag.startswith("B-"):
2649 if entity_type is not None:
2650 entities.append(
2651 Entity((entity_start, len(document_text)), entity_type)
2652 )
2654 entity_start = len(document_text) + 1 if document_text else 0
2655 entity_type = tag[2:]
2657 elif tag == "O" and entity_type is not None:
2658 entities.append(
2659 Entity((entity_start, len(document_text)), entity_type)
2660 )
2661 entity_type = None
2663 document_text = (
2664 document_text + " " + token if document_text else token
2665 )
2667 return InternalBioNerDataset(
2668 documents=documents, entities_per_document=entities_per_document
2669 )
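
# Illustrative sketch (added for exposition): one SCAI IOB line in the shape
# parse_input_file above expects, i.e. the token in column 0 and the entity
# tag in the second "|"-separated field of column 4. The line and the filler
# columns are invented.
def _demo_scai_iob_line():
    line = "aspirin\t_\t_\t_\tfiller|B-TRIVIAL"
    columns = line.strip().split("\t")
    token = columns[0].strip()
    tag = columns[4].strip().split("|")[1]
    print(token, tag)  # aspirin B-TRIVIAL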
2672class SCAI_CHEMICALS(ScaiCorpus):
2673 """
2674 Original SCAI chemicals corpus containing chemical annotations.
2676 For further information see Kolářik et al.:
2677 Chemical Names: Terminological Resources and Corpora Annotation
2678 https://pub.uni-bielefeld.de/record/2603498
2679 """
2681 def __init__(self, *args, **kwargs):
2682 super().__init__(*args, **kwargs)
2684 def download_corpus(self, data_dir: Path) -> Path:
2685 return self.perform_corpus_download(data_dir)
2687 @staticmethod
2688 def perform_corpus_download(data_dir: Path) -> Path:
2689 original_directory = data_dir / "original"
2690 os.makedirs(str(original_directory), exist_ok=True)
2692 url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/Corpora-for-Chemical-Entity-Recognition/chemicals-test-corpus-27-04-2009-v3_iob.gz"
2693 data_path = cached_path(url, original_directory)
2694 corpus_file = original_directory / "chemicals-test-corpus-27-04-2009-v3.iob"
2695 unpack_file(data_path, corpus_file)
2697 return corpus_file
2700class SCAI_DISEASE(ScaiCorpus):
2701 """
2702 Original SCAI disease corpus containing disease annotations.
2704 For further information see Gurulingappa et al.:
2705 An Empirical Evaluation of Resources for the Identification of Diseases and Adverse Effects in Biomedical Literature
2706 https://pub.uni-bielefeld.de/record/2603398
2707 """
2709 def __init__(self, *args, **kwargs):
2710 super().__init__(*args, **kwargs)
2712 def download_corpus(self, data_dir: Path) -> Path:
2713 return self.perform_corpus_download(data_dir)
2715 @staticmethod
2716 def perform_corpus_download(data_dir: Path) -> Path:
2717 original_directory = data_dir / "original"
2718 os.makedirs(str(original_directory), exist_ok=True)
2720 url = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/Disease-ae-corpus.iob"
2721 data_path = cached_path(url, original_directory)
2723 return data_path
2726class HUNER_CHEMICAL_SCAI(HunerDataset):
2727 """
2728 HUNER version of the SCAI chemicals corpus containing chemical annotations.
2729 """
2731 def __init__(self, *args, **kwargs):
2732 super().__init__(*args, **kwargs)
2734 @staticmethod
2735 def split_url() -> str:
2736 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_chemicals"
2738 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2739 original_file = SCAI_CHEMICALS.perform_corpus_download(data_dir)
2740 corpus = ScaiCorpus.parse_input_file(original_file)
2742 # Map all entities to chemicals
2743 entity_mapping = {
2744 "FAMILY": CHEMICAL_TAG,
2745 "TRIVIALVAR": CHEMICAL_TAG,
2746 "PARTIUPAC": CHEMICAL_TAG,
2747 "TRIVIAL": CHEMICAL_TAG,
2748 "ABBREVIATION": CHEMICAL_TAG,
2749 "IUPAC": CHEMICAL_TAG,
2750 "MODIFIER": CHEMICAL_TAG,
2751 "SUM": CHEMICAL_TAG,
2752 }
2754 return filter_and_map_entities(corpus, entity_mapping)
2757class HUNER_DISEASE_SCAI(HunerDataset):
2758 """
2759 HUNER version of the SCAI disease corpus containing disease annotations.
2760 """
2762 def __init__(self, *args, **kwargs):
2763 super().__init__(*args, **kwargs)
2765 @staticmethod
2766 def split_url() -> str:
2767 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/scai_disease"
2769 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2770 original_file = SCAI_DISEASE.perform_corpus_download(data_dir)
2771 corpus = ScaiCorpus.parse_input_file(original_file)
2773 # Map all entities to disease
2774 entity_mapping = {"DISEASE": DISEASE_TAG, "ADVERSE": DISEASE_TAG}
2776 return filter_and_map_entities(corpus, entity_mapping)
2779class OSIRIS(ColumnCorpus):
2780 """
2781 Original OSIRIS corpus containing variation and gene annotations.
2783 For further information see Furlong et al.:
2784 Osiris v1.2: a named entity recognition system for sequence variants of genes in biomedical literature
2785 https://www.ncbi.nlm.nih.gov/pubmed/18251998
2786 """
2788 def __init__(
2789 self,
2790 base_path: Union[str, Path] = None,
2791 in_memory: bool = True,
2792 sentence_splitter: SentenceSplitter = None,
2793 load_original_unfixed_annotation: bool = False,
2794 ):
2795 """
2796 :param base_path: Path to the corpus on your machine
2797 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2798 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which
2799 segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2800 :param load_original_unfixed_annotation: The original annotation of Osiris
2801 erroneously annotates two sentences as a protein. Set to True if you don't
2802 want the fixed version.
2803 """
2805 if type(base_path) == str:
2806 base_path: Path = Path(base_path)
2808 # column format
2809 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2811 # this dataset name
2812 dataset_name = self.__class__.__name__.lower()
2814 # default dataset folder is the cache root
2815 if not base_path:
2816 base_path = flair.cache_root / "datasets"
2817 data_folder = base_path / dataset_name
2819 if sentence_splitter is None:
2820 sentence_splitter = SciSpacySentenceSplitter()
2822 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2824 if not (train_file.exists()):
2825 corpus_folder = self.download_dataset(data_folder)
2826 corpus_data = self.parse_dataset(
2827 corpus_folder, fix_annotation=not load_original_unfixed_annotation
2828 )
2830 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2831 conll_writer.write_to_conll(corpus_data, train_file)
2833 super(OSIRIS, self).__init__(
2834 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2835 )
2837 @classmethod
2838 def download_dataset(cls, data_dir: Path) -> Path:
2839 url = "http://ibi.imim.es/OSIRIScorpusv02.tar"
2840 data_path = cached_path(url, data_dir)
2841 unpack_file(data_path, data_dir)
2843 return data_dir / "OSIRIScorpusv02"
2845 @classmethod
2846 def parse_dataset(cls, corpus_folder: Path, fix_annotation=True):
2847 documents = {}
2848 entities_per_document = {}
2850 input_files = [
2851 file
2852 for file in os.listdir(str(corpus_folder))
2853 if file.endswith(".txt") and not file.startswith("README")
2854 ]
2855 for text_file in input_files:
2857 with open(os.path.join(str(corpus_folder), text_file), encoding="utf8") as text_reader:
2858 document_text = text_reader.read()
2859 if not document_text:
2860 continue
2862 article_parts = document_text.split("\n\n")
2863 document_id = article_parts[0]
2864 text_offset = document_text.find(article_parts[1])
2865 document_text = (article_parts[1] + " " + article_parts[2]).strip()
2867 with open(os.path.join(str(corpus_folder), text_file + ".ann"), encoding="utf8") as ann_file:
2868 entities = []
2870 tree = etree.parse(ann_file)
2871 for annotation in tree.xpath(".//Annotation"):
2872 entity_type = annotation.get("type")
2873 if entity_type == "file":
2874 continue
2876 start, end = annotation.get("span").split("..")
2877 start, end = int(start), int(end)
2879 if (
2880 fix_annotation
2881 and text_file == "article46.txt"
2882 and start == 289
2883 and end == 644
2884 ):
2885 end = 295
2887 entities.append(
2888 Entity((start - text_offset, end - text_offset), entity_type)
2889 )
2891 documents[document_id] = document_text
2892 entities_per_document[document_id] = entities
2894 return InternalBioNerDataset(
2895 documents=documents, entities_per_document=entities_per_document
2896 )
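
# Illustrative sketch (added for exposition) of the OSIRIS span attribute
# handled above: spans are serialized as "start..end" offsets into the raw
# file and re-based against the offset of the article body. Values invented.
def _demo_osiris_span():
    span_attr = "289..295"
    text_offset = 10
    start, end = (int(position) for position in span_attr.split(".."))
    print(start - text_offset, end - text_offset)  # 279 285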
2899class HUNER_GENE_OSIRIS(HunerDataset):
2900 """
2901 HUNER version of the OSIRIS corpus containing (only) gene annotations.
2903 """
2905 def __init__(self, *args, **kwargs):
2906 super().__init__(*args, **kwargs)
2908 @staticmethod
2909 def split_url() -> str:
2910 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/osiris"
2912 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
2913 original_file = OSIRIS.download_dataset(data_dir)
2914 corpus = OSIRIS.parse_dataset(original_file)
2916 entity_type_mapping = {"ge": GENE_TAG}
2917 return filter_and_map_entities(corpus, entity_type_mapping)
2920class S800(ColumnCorpus):
2921 """
2922 S800 corpus
2923 For further information see Pafilis et al.:
2924 The SPECIES and ORGANISMS Resources for Fast and Accurate Identification of Taxonomic Names in Text
2925 http://www.plosone.org/article/info:doi%2F10.1371%2Fjournal.pone.0065390
2926 """
2928 def __init__(
2929 self,
2930 base_path: Union[str, Path] = None,
2931 in_memory: bool = True,
2932 sentence_splitter: SentenceSplitter = None,
2933 ):
2934 """
2935 :param base_path: Path to the corpus on your machine
2936 :param in_memory: If True, keeps dataset in memory giving speedups in training.
2937 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
2938 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
2939 """
2941 if type(base_path) == str:
2942 base_path: Path = Path(base_path)
2944 # column format
2945 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
2947 # this dataset name
2948 dataset_name = self.__class__.__name__.lower()
2950 # default dataset folder is the cache root
2951 if not base_path:
2952 base_path = flair.cache_root / "datasets"
2953 data_folder = base_path / dataset_name
2955 if sentence_splitter is None:
2956 sentence_splitter = SciSpacySentenceSplitter()
2958 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
2960 if not (train_file.exists()):
2961 download_dir = data_folder / "original"
2962 os.makedirs(download_dir, exist_ok=True)
2963 self.download_dataset(download_dir)
2965 all_data = self.parse_dataset(download_dir)
2967 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
2968 conll_writer.write_to_conll(all_data, train_file)
2970 super(S800, self).__init__(
2971 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
2972 )
2974 @staticmethod
2975 def download_dataset(data_dir: Path):
2976 data_url = "https://species.jensenlab.org/files/S800-1.0.tar.gz"
2977 data_path = cached_path(data_url, data_dir)
2978 unpack_file(data_path, data_dir)
2980 @staticmethod
2981 def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
2982 entities_per_document = defaultdict(list)
2983 texts_per_document = {}
2984 with (data_dir / "S800.tsv").open(encoding="utf8") as f:
2985 for line in f:
2986 fields = line.strip().split("\t")
2987 if not fields:
2988 continue
2989 fname, pmid = fields[1].split(":")
2990 start, end = int(fields[2]), int(fields[3])
2992 if start == end:
2993 continue
2995 entities_per_document[fname].append(Entity((start, end), "Species"))
2997 for fname in entities_per_document:
2998 with (data_dir / "abstracts" / fname).with_suffix(".txt").open(encoding="utf8") as f:
2999 texts_per_document[fname] = f.read()
3001 return InternalBioNerDataset(
3002 documents=texts_per_document, entities_per_document=entities_per_document
3003 )
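
# Illustrative sketch (added for exposition): one S800.tsv row in the shape
# parse_dataset above assumes, with "<abstract file name>:<pmid>" in column 1
# and the character span in columns 2 and 3. The row itself is invented.
def _demo_s800_row():
    line = "9606\tmedline_0001:12345678\t17\t29\tHomo sapiens"
    fields = line.strip().split("\t")
    fname, pmid = fields[1].split(":")
    start, end = int(fields[2]), int(fields[3])
    print(fname, pmid, (start, end))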
3006class HUNER_SPECIES_S800(HunerDataset):
3007 """
3008 HUNER version of the S800 corpus containing species annotations.
3009 """
3010 def __init__(self, *args, **kwargs):
3011 super().__init__(*args, **kwargs)
3013 @staticmethod
3014 def split_url() -> str:
3015 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/s800"
3017 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
3018 S800.download_dataset(data_dir)
3019 data = S800.parse_dataset(data_dir)
3020 data = filter_and_map_entities(data, {"Species": SPECIES_TAG})
3022 return data
3025class GPRO(ColumnCorpus):
3026 """
3027 Original GPRO corpus containing gene annotations.
3029 For further information see:
3030 https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/
3031 """
3033 def __init__(
3034 self,
3035 base_path: Union[str, Path] = None,
3036 in_memory: bool = True,
3037 sentence_splitter: SentenceSplitter = None,
3038 ):
3039 """
3040 :param base_path: Path to the corpus on your machine
3041 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3042 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
3043 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3044 """
3046 if type(base_path) == str:
3047 base_path: Path = Path(base_path)
3049 # column format
3050 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3052 # this dataset name
3053 dataset_name = self.__class__.__name__.lower()
3055 # default dataset folder is the cache root
3056 if not base_path:
3057 base_path = flair.cache_root / "datasets"
3058 data_folder = base_path / dataset_name
3060 if sentence_splitter is None:
3061 sentence_splitter = SciSpacySentenceSplitter()
3063 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
3064 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
3066 if not (train_file.exists() and dev_file.exists()):
3067 train_folder = self.download_train_corpus(data_folder)
3068 train_text_file = train_folder / "chemdner_patents_train_text.txt"
3069 train_ann_file = train_folder / "chemdner_gpro_gold_standard_train_v02.tsv"
3070 train_data = self.parse_input_file(train_text_file, train_ann_file)
3072 dev_folder = self.download_dev_corpus(data_folder)
3073 dev_text_file = dev_folder / "chemdner_patents_development_text.txt"
3074 dev_ann_file = dev_folder / "chemdner_gpro_gold_standard_development.tsv"
3075 dev_data = self.parse_input_file(dev_text_file, dev_ann_file)
3077 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3078 conll_writer.write_to_conll(train_data, train_file)
3079 conll_writer.write_to_conll(dev_data, dev_file)
3081 super(GPRO, self).__init__(
3082 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3083 )
3085 @classmethod
3086 def download_train_corpus(cls, data_dir: Path) -> Path:
3087 corpus_dir = data_dir / "original"
3088 os.makedirs(str(corpus_dir), exist_ok=True)
3090 train_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/gpro_training_set_v02.tar.gz"
3091 data_path = cached_path(train_url, corpus_dir)
3092 unpack_file(data_path, corpus_dir)
3094 return corpus_dir / "gpro_training_set_v02"
3096 @classmethod
3097 def download_dev_corpus(cls, data_dir) -> Path:
3098 corpus_dir = data_dir / "original"
3099 os.makedirs(str(corpus_dir), exist_ok=True)
3101 dev_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/gpro_development_set.tar.gz"
3102 data_path = cached_path(dev_url, corpus_dir)
3103 unpack_file(data_path, corpus_dir)
3105 return corpus_dir / "gpro_development_set"
3107 @staticmethod
3108 def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset:
3109 documents = {}
3110 entities_per_document = {}
3112 document_title_length = {}
3114 with open(str(text_file), "r", encoding="utf8") as text_reader:
3115 for line in text_reader:
3116 if not line:
3117 continue
3119 document_id, title, abstract = line.split("\t")
3120 documents[document_id] = title + " " + abstract
3121 document_title_length[document_id] = len(title) + 1
3123 entities_per_document[document_id] = []
3125 with open(str(ann_file), "r", encoding="utf8") as ann_reader:
3126 for line in ann_reader:
3127 if not line:
3128 continue
3130 columns = line.split("\t")
3131 document_id = columns[0]
3132 start, end = int(columns[2]), int(columns[3])
3134 if columns[1] == "A":
3135 start = start + document_title_length[document_id]
3136 end = end + document_title_length[document_id]
3138 entities_per_document[document_id].append(
3139 Entity((start, end), GENE_TAG)
3140 )
3142 document_text = documents[document_id]
3143 assert columns[4] == document_text[start:end]
3145 return InternalBioNerDataset(
3146 documents=documents, entities_per_document=entities_per_document
3147 )
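
# Illustrative sketch (added for exposition) of the offset correction in
# parse_input_file above: GPRO anchors abstract mentions (code "A") relative
# to the abstract, so they are shifted by len(title) + 1 once title and
# abstract are joined with a single space. All strings are invented.
def _demo_gpro_abstract_offset():
    title, abstract = "Kinase patents", "EGFR is mentioned here."
    document = title + " " + abstract
    start, end = 0, 4  # "EGFR" relative to the abstract
    shift = len(title) + 1
    assert document[start + shift : end + shift] == "EGFR"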
3150class HUNER_GENE_GPRO(HunerDataset):
3151 """
3152 HUNER version of the GPRO corpus containing gene annotations.
3153 """
3155 def __init__(self, *args, **kwargs):
3156 super().__init__(*args, **kwargs)
3158 @staticmethod
3159 def split_url() -> str:
3160 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/gpro"
3162 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
3163 train_folder = GPRO.download_train_corpus(data_dir)
3164 train_text_file = train_folder / "chemdner_patents_train_text.txt"
3165 train_ann_file = train_folder / "chemdner_gpro_gold_standard_train_v02.tsv"
3166 train_data = GPRO.parse_input_file(train_text_file, train_ann_file)
3168 dev_folder = GPRO.download_dev_corpus(data_dir)
3169 dev_text_file = dev_folder / "chemdner_patents_development_text.txt"
3170 dev_ann_file = dev_folder / "chemdner_gpro_gold_standard_development.tsv"
3171 dev_data = GPRO.parse_input_file(dev_text_file, dev_ann_file)
3173 return merge_datasets([train_data, dev_data])
3176class DECA(ColumnCorpus):
3177 """
3178 Original DECA corpus containing gene annotations.
3180 For further information see Wang et al.:
3181 Disambiguating the species of biomedical named entities using natural language parsers
3182 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/
3183 """
3185 def __init__(
3186 self,
3187 base_path: Union[str, Path] = None,
3188 in_memory: bool = True,
3189 sentence_splitter: SentenceSplitter = None,
3190 ):
3191 """
3192 :param base_path: Path to the corpus on your machine
3193 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3194 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
3195 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3196 """
3198 if type(base_path) == str:
3199 base_path: Path = Path(base_path)
3201 # column format
3202 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3204 # this dataset name
3205 dataset_name = self.__class__.__name__.lower()
3207 # default dataset folder is the cache root
3208 if not base_path:
3209 base_path = flair.cache_root / "datasets"
3210 data_folder = base_path / dataset_name
3212 if sentence_splitter is None:
3213 sentence_splitter = SciSpacySentenceSplitter()
3215 train_file = data_folder / "train.conll"
3217 if not train_file.exists():
3218 corpus_dir = self.download_corpus(data_folder)
3219 text_dir = corpus_dir / "text"
3220 gold_file = corpus_dir / "gold.txt"
3222 corpus_data = self.parse_corpus(text_dir, gold_file)
3223 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3224 conll_writer.write_to_conll(corpus_data, train_file)
3226 super(DECA, self).__init__(
3227 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3228 )
3230 @classmethod
3231 def download_corpus(cls, data_dir: Path) -> Path:
3232 url = "http://www.nactem.ac.uk/deca/species_corpus_0.2.tar.gz"
3233 data_path = cached_path(url, data_dir)
3234 unpack_file(data_path, data_dir)
3236 return data_dir / "species_corpus_0.2"
3238 @staticmethod
3239 def parse_corpus(text_dir: Path, gold_file: Path) -> InternalBioNerDataset:
3240 documents = {}
3241 entities_per_document = {}
3243 text_files = [
3244 file for file in os.listdir(str(text_dir)) if not file.startswith(".")
3245 ]
3247 for file in text_files:
3248 document_id = os.path.splitext(file)[0]  # str.strip(".txt") removes characters, not the suffix
3249 with open(os.path.join(str(text_dir), file), "r", encoding="utf8") as text_file:
3250 documents[document_id] = text_file.read().strip()
3251 entities_per_document[document_id] = []
3253 with open(str(gold_file), "r", encoding="utf8") as gold_reader:
3254 for line in gold_reader:
3255 if not line:
3256 continue
3257 columns = line.strip().split("\t")
3259 document_id = os.path.splitext(columns[0])[0]
3260 start, end = int(columns[1]), int(columns[2])
3262 entities_per_document[document_id].append(
3263 Entity((start, end), GENE_TAG)
3264 )
3266 document_text = documents[document_id]
3267 assert document_text[start:end] == columns[3]
3269 return InternalBioNerDataset(
3270 documents=documents, entities_per_document=entities_per_document
3271 )
3274class HUNER_GENE_DECA(HunerDataset):
3275 """
3276 HUNER version of the DECA corpus containing gene annotations.
3277 """
3279 def __init__(self, *args, **kwargs):
3280 super().__init__(*args, **kwargs)
3282 @staticmethod
3283 def split_url() -> str:
3284 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/deca"
3286 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
3287 corpus_dir = DECA.download_corpus(data_dir)
3288 text_dir = corpus_dir / "text"
3289 gold_file = corpus_dir / "gold.txt"
3291 return DECA.parse_corpus(text_dir, gold_file)
3294class FSU(ColumnCorpus):
3295 """
3296 Original FSU corpus containing protein and derived annotations.
3298 For further information see Hahn et al.:
3299 A proposal for a configurable silver standard
3300 https://www.aclweb.org/anthology/W10-1838/
3301 """
3303 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
3304 """
3305 :param base_path: Path to the corpus on your machine
3306 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3307 """
3309 if type(base_path) == str:
3310 base_path: Path = Path(base_path)
3312 # column format
3313 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3315 # this dataset name
3316 dataset_name = self.__class__.__name__.lower()
3318 # default dataset folder is the cache root
3319 if not base_path:
3320 base_path = flair.cache_root / "datasets"
3321 data_folder = base_path / dataset_name
3323 sentence_splitter = TagSentenceSplitter(
3324 tag=SENTENCE_TAG, tokenizer=SpaceTokenizer()
3325 )
3326 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
3328 if not train_file.exists():
3329 corpus_dir = self.download_corpus(data_folder)
3330 corpus_data = self.parse_corpus(corpus_dir, SENTENCE_TAG)
3332 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3333 conll_writer.write_to_conll(corpus_data, train_file)
3335 super(FSU, self).__init__(
3336 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3337 )
3339 @classmethod
3340 def download_corpus(cls, data_dir: Path) -> Path:
3341 url = "https://julielab.de/downloads/resources/fsu_prge_release_v1_0.tgz"
3342 data_path = cached_path(url, data_dir)
3343 unpack_file(data_path, data_dir, mode="targz")
3345 return data_dir / "fsu-prge-release-v1.0"
3347 @staticmethod
3348 def parse_corpus(
3349 corpus_dir: Path, sentence_separator: str
3350 ) -> InternalBioNerDataset:
3351 documents = {}
3352 entities_per_document = {}
3354 for subcorpus in corpus_dir.iterdir():
3355 if not subcorpus.is_dir():
3356 continue
3357 for doc in (subcorpus / "mmax").iterdir():
3358 if not doc.is_dir():
3359 continue
3360 try:
3361 with open(doc / "Basedata" / "Basedata.xml", "r", encoding="utf8") as word_f:
3362 word_tree = etree.parse(word_f)
3363 with open(doc / "Markables" / "sentence.xml", "r", encoding="utf8") as sentence_f:
3364 sentence_tree = etree.parse(sentence_f).getroot()
3365 with open(doc / "Markables" / "proteins.xml", "r", encoding="utf8") as protein_f:
3366 protein_tree = etree.parse(protein_f).getroot()
3367 with open(doc / "Basedata.uri", "r", encoding="utf8") as id_f:
3368 document_id = id_f.read().strip()
3369 except FileNotFoundError:
3370 # Incomplete article
3371 continue
3372 except XMLSyntaxError:
3373 # Invalid XML syntax
3374 continue
3376 word_to_id = {}
3377 words = []
3378 for i, token in enumerate(word_tree.xpath(".//word")):
3379 words += [token.text]
3380 word_to_id[token.get("id")] = i
3381 word_pos = [(0, 0) for _ in words]
3383 sentences_id_span = sorted(
3384 [
3385 (int(sentence.get("id").split("_")[-1]), sentence.get("span"))
3386 for sentence in sentence_tree
3387 ]
3388 )
3390 sentences = []
3391 for j, sentence in enumerate(sentences_id_span):
3392 tmp_sentence = []
3393 akt_pos = 0
3394 start = word_to_id[sentence[1].split("..")[0]]
3395 end = word_to_id[sentence[1].split("..")[1]]
3396 for i in range(start, end + 1):
3397 tmp_sentence += [words[i]]
3398 word_pos[i] = (j, akt_pos)
3399 akt_pos += len(words[i]) + 1
3400 sentences += [tmp_sentence]
3402 pre_entities = [[] for _ in sentences]
3403 for protein in protein_tree:
3404 for span in protein.get("span").split(","):
3405 start = word_to_id[span.split("..")[0]]
3406 end = word_to_id[span.split("..")[-1]]
3407 pre_entities[word_pos[start][0]] += [
3408 (
3409 word_pos[start][1],
3410 word_pos[end][1] + len(words[end]),
3411 protein.get("proteins"),
3412 )
3413 ]
3415 sentences = [" ".join(sentence) for sentence in sentences]
3416 document = sentence_separator.join(sentences)
3418 entities = []
3419 sent_offset = 0
3420 for sentence, sent_entities in zip(sentences, pre_entities):
3421 entities += [
3422 Entity(
3423 (entity[0] + sent_offset, entity[1] + sent_offset),
3424 entity[2],
3425 )
3426 for entity in sent_entities
3427 ]
3428 sent_offset += len(sentence) + len(sentence_separator)
3430 documents[document_id] = document
3431 entities_per_document[document_id] = entities
3433 return InternalBioNerDataset(
3434 documents=documents, entities_per_document=entities_per_document
3435 )
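
# Illustrative sketch (added for exposition) of the offset bookkeeping in
# parse_corpus above: per-sentence entity spans become document-level spans
# by accumulating the lengths of all preceding sentences plus the sentence
# separator. Sentences and spans below are invented.
def _demo_fsu_sentence_offsets():
    sentences = ["p53 binds DNA", "BRCA1 too"]
    spans_per_sentence = [[(0, 3)], [(0, 5)]]  # "p53" and "BRCA1"
    sent_offset = 0
    for sentence, spans in zip(sentences, spans_per_sentence):
        for start, end in spans:
            print(start + sent_offset, end + sent_offset)
        sent_offset += len(sentence) + len(SENTENCE_TAG)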
3438class HUNER_GENE_FSU(HunerDataset):
3439 """
3440 HUNER version of the FSU corpus containing (only) gene annotations.
3441 """
3443 def __init__(self, *args, **kwargs):
3444 super().__init__(*args, **kwargs)
3446 @staticmethod
3447 def split_url() -> str:
3448 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/fsu"
3450 def get_corpus_sentence_splitter(self) -> SentenceSplitter:
3451 return TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=SciSpacyTokenizer())
3453 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
3454 corpus_dir = FSU.download_corpus(data_dir)
3456 sentence_separator = " "
3457 if isinstance(self.sentence_splitter, TagSentenceSplitter):
3458 sentence_separator = self.sentence_splitter.tag
3460 corpus = FSU.parse_corpus(corpus_dir, sentence_separator)
3462 entity_type_mapping = {
3463 "protein": GENE_TAG,
3464 "protein_familiy_or_group": GENE_TAG,
3465 "protein_complex": GENE_TAG,
3466 "protein_variant": GENE_TAG,
3467 "protein_enum": GENE_TAG,
3468 }
3469 return filter_and_map_entities(corpus, entity_type_mapping)
3472class CRAFT(ColumnCorpus):
3473 """
3474 Original CRAFT corpus (version 2.0) containing all but the coreference and sections/typography annotations.
3476 For further information see Bada et al.:
3477 Concept annotation in the CRAFT corpus
3478 https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161
3479 """
3481 def __init__(
3482 self,
3483 base_path: Union[str, Path] = None,
3484 in_memory: bool = True,
3485 sentence_splitter: SentenceSplitter = None,
3486 ):
3487 """
3488 :param base_path: Path to the corpus on your machine
3489 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3490 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
3491 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3492 """
3494 if type(base_path) == str:
3495 base_path: Path = Path(base_path)
3497 # column format
3498 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3500 # this dataset name
3501 dataset_name = self.__class__.__name__.lower()
3503 # default dataset folder is the cache root
3504 if not base_path:
3505 base_path = flair.cache_root / "datasets"
3506 data_folder = base_path / dataset_name
3508 if sentence_splitter is None:
3509 sentence_splitter = SciSpacySentenceSplitter()
3511 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
3513 if not train_file.exists():
3514 corpus_dir = self.download_corpus(data_folder)
3515 corpus_data = self.parse_corpus(corpus_dir)
3517 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3518 conll_writer.write_to_conll(corpus_data, train_file)
3520 super(CRAFT, self).__init__(
3521 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3522 )
3524 @classmethod
3525 def download_corpus(cls, data_dir: Path) -> Path:
3526 url = "http://sourceforge.net/projects/bionlp-corpora/files/CRAFT/v2.0/craft-2.0.tar.gz/download"
3527 data_path = cached_path(url, data_dir)
3528 unpack_file(data_path, data_dir, mode="targz")
3530 return data_dir / "craft-2.0"
3532 @staticmethod
3533 def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset:
3534 documents = {}
3535 entities_per_document = {}
3537 text_dir = corpus_dir / "articles" / "txt"
3538 document_texts = [doc for doc in text_dir.iterdir() if doc.name[-4:] == ".txt"]
3539 annotation_dirs = [
3540 path
3541 for path in (corpus_dir / "xml").iterdir()
3542 if path.name not in ["sections-and-typography", "coreference"]
3543 ]
3545 for doc in Tqdm.tqdm(document_texts, desc="Converting to internal"):
3546 document_id = doc.name.split(".")[0]
3548 with open(doc, "r", encoding="utf8") as f_txt:
3549 documents[document_id] = f_txt.read()
3551 entities = []
3553 for annotation_dir in annotation_dirs:
3554 with open(
3555 annotation_dir / (doc.name + ".annotations.xml"), "r", encoding="utf8"
3556 ) as f_ann:
3557 ann_tree = etree.parse(f_ann)
3558 for annotation in ann_tree.xpath("//annotation"):
3559 for span in annotation.xpath("span"):
3560 start = int(span.get("start"))
3561 end = int(span.get("end"))
3562 entities += [Entity((start, end), annotation_dir.name)]
3564 entities_per_document[document_id] = entities
3566 return InternalBioNerDataset(
3567 documents=documents, entities_per_document=entities_per_document
3568 )
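# For reference, a sketch of the annotation XML that parse_corpus expects,
# inferred from the xpath queries above (the real files contain further
# elements, which are ignored here):
#
#     <annotations>
#       <annotation>
#         <span start="17" end="28" />
#       </annotation>
#       ...
#     </annotations>
#
# Every <span> becomes one Entity whose type is the name of the annotation
# directory it was read from (one directory per ontology, e.g. CHEBI).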
3571class BIOSEMANTICS(ColumnCorpus):
3572 """
3573 Original Biosemantics corpus.
3575 For further information see Akhondi et al.:
3576 Annotated chemical patent corpus: a gold standard for text mining
3577 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4182036/
3578 """
3580 def __init__(
3581 self,
3582 base_path: Union[str, Path] = None,
3583 in_memory: bool = True,
3584 sentence_splitter: SentenceSplitter = None,
3585 ):
3586 """
3587 :param base_path: Path to the corpus on your machine
3588 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3589 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
3590 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3591 """
3592 if type(base_path) == str:
3593 base_path: Path = Path(base_path)
3595 # column format
3596 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3598 # this dataset name
3599 dataset_name = self.__class__.__name__.lower()
3601 # default dataset folder is the cache root
3602 if not base_path:
3603 base_path = flair.cache_root / "datasets"
3604 data_folder = base_path / dataset_name
3606 if sentence_splitter is None:
3607 sentence_splitter = SciSpacySentenceSplitter()
3609 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
        if not train_file.exists():
3612 corpus_dir = self.download_dataset(data_folder)
3613 full_dataset = self.parse_dataset(corpus_dir)
3615 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3616 conll_writer.write_to_conll(full_dataset, train_file)
3618 super(BIOSEMANTICS, self).__init__(
3619 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3620 )
3622 @staticmethod
3623 def download_dataset(data_dir: Path) -> Path:
3624 data_url = "http://biosemantics.erasmusmc.nl/PatentCorpus/Patent_Corpus.rar"
3625 data_path = cached_path(data_url, data_dir)
3626 unpack_file(data_path, data_dir)
3628 return data_dir / "Patent_Corpus"
3630 @staticmethod
3631 def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
3632 base_folder = data_dir / "Full_set"
3634 dirs = [
3635 file
3636 for file in os.listdir(str(base_folder))
3637 if os.path.isdir(os.path.join(str(base_folder), file))
3638 ]
3640 text_files = []
3641 for directory in dirs:
3642 text_files += [
3643 os.path.join(str(base_folder), directory, file)
3644 for file in os.listdir(os.path.join(str(base_folder), directory))
3645 if file[-4:] == ".txt"
3646 ]
3647 text_files = sorted(text_files)
3649 documents = {}
3650 entities_per_document = {}
3652 for text_file in sorted(text_files):
3653 document_id = os.path.basename(text_file).split("_")[0]
3654 with open(text_file, "r", encoding="utf8") as file_reader:
3655 file_text = file_reader.read().replace("\n", " ")
3657 offset = 0
3658 document_text = ""
3659 if document_id in documents:
3660 document_text = documents[document_id] + " "
3661 offset = len(document_text)
3663 tmp_document_text = document_text + file_text
3665 entities = []
3666 dirty_file = False
3667 with open(text_file[:-4] + ".ann", encoding="utf8") as file_reader:
3668 for line in file_reader:
3669 if line[-1] == "\n":
3670 line = line[:-1]
3671 if not line:
3672 continue
3674 columns = line.split("\t")
3675 mid = columns[1].split()
3676 # if len(mid) != 3:
3677 # continue
3679 entity_type, start, end = mid[0], mid[1], mid[2]
3680 start, end = int(start.split(";")[0]), int(end.split(";")[0])
3682 if start == end:
3683 continue
3685 # Try to fix entity offsets
3686 if tmp_document_text[offset + start : offset + end] != columns[2]:
3687 alt_text = tmp_document_text[
3688 offset + start : offset + start + len(columns[2])
3689 ]
3690 if alt_text == columns[2]:
3691 end = start + len(columns[2])
3693 if file_text[start:end] != columns[2]:
3694 dirty_file = True
3695 continue
3697 if tmp_document_text[offset + start : offset + end] != columns[2]:
3698 dirty_file = True
3699 continue
3701 entities.append(Entity((offset + start, offset + end), entity_type))
3703 if not dirty_file:
3704 documents[document_id] = tmp_document_text
3705 if document_id in entities_per_document:
3706 entities_per_document[document_id] += entities
3707 else:
3708 entities_per_document[document_id] = entities
3710 return InternalBioNerDataset(
3711 documents=documents, entities_per_document=entities_per_document
3712 )
3715class BC2GM(ColumnCorpus):
3716 """
3717 Original BioCreative-II-GM corpus containing gene annotations.
3719 For further information see Smith et al.:
3720 Overview of BioCreative II gene mention recognition
3721 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/
3722 """
3724 def __init__(
3725 self,
3726 base_path: Union[str, Path] = None,
3727 in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
3729 ):
3730 """
3731 :param base_path: Path to the corpus on your machine
3732 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3733 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
3734 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3735 """
3736 if type(base_path) == str:
3737 base_path: Path = Path(base_path)
3739 # column format
3740 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3742 # this dataset name
3743 dataset_name = self.__class__.__name__.lower()
3745 # default dataset folder is the cache root
3746 if not base_path:
3747 base_path = flair.cache_root / "datasets"
3748 data_folder = base_path / dataset_name
3750 if sentence_splitter is None:
3751 sentence_splitter = SciSpacySentenceSplitter()
3753 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
3754 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
3756 if not (train_file.exists() and test_file.exists()):
3757 data_folder = self.download_dataset(data_folder)
3758 train_data = self.parse_train_dataset(data_folder)
3759 test_data = self.parse_test_dataset(data_folder)
3761 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3762 conll_writer.write_to_conll(train_data, train_file)
3763 conll_writer.write_to_conll(test_data, test_file)
3765 super(BC2GM, self).__init__(
3766 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3767 )
3769 @staticmethod
3770 def download_dataset(data_dir: Path) -> Path:
3771 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2011/bc2GMtrain_1.1.tar.gz"
3772 data_path = cached_path(data_url, data_dir)
3773 unpack_file(data_path, data_dir)
3775 data_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2011/bc2GMtest_1.0.tar.gz"
3776 data_path = cached_path(data_url, data_dir)
3777 unpack_file(data_path, data_dir)
3779 return data_dir
3781 @classmethod
3782 def parse_train_dataset(cls, data_folder: Path) -> InternalBioNerDataset:
3783 train_text_file = data_folder / "bc2geneMention" / "train" / "train.in"
3784 train_ann_file = data_folder / "bc2geneMention" / "train" / "GENE.eval"
3786 return cls.parse_dataset(train_text_file, train_ann_file)
3788 @classmethod
3789 def parse_test_dataset(cls, data_folder: Path) -> InternalBioNerDataset:
3790 test_text_file = data_folder / "BC2GM" / "test" / "test.in"
3791 test_ann_file = data_folder / "BC2GM" / "test" / "GENE.eval"
3793 return cls.parse_dataset(test_text_file, test_ann_file)
3795 @staticmethod
3796 def parse_dataset(text_file: Path, ann_file: Path) -> InternalBioNerDataset:
3797 documents = {}
3798 entities_per_document = {}
3800 with open(str(text_file), "r", encoding="utf8") as text_file_reader:
3801 for line in text_file_reader:
3802 line = line.strip()
3803 offset = line.find(" ")
3804 document_id = line[:offset]
3805 document_text = line[offset + 1 :]
3806 documents[document_id] = document_text
3807 entities_per_document[document_id] = []
3809 with open(str(ann_file), "r", encoding="utf8") as ann_file_reader:
3810 for line in ann_file_reader:
3811 columns = line.strip().split("|")
3812 document_id = columns[0]
3813 document_text = documents[document_id]
3815 start_idx, end_idx = [int(i) for i in columns[1].split()]
3817 non_whitespaces_chars = 0
3818 new_start_idx = None
3819 new_end_idx = None
3820 for i, char in enumerate(document_text):
3821 if char != " ":
3822 non_whitespaces_chars += 1
3823 if new_start_idx is None and non_whitespaces_chars == start_idx + 1:
3824 new_start_idx = i
3825 if non_whitespaces_chars == end_idx + 1:
3826 new_end_idx = i + 1
3827 break
3829 mention_text = document_text[new_start_idx:new_end_idx]
3830 if mention_text != columns[2] and mention_text.startswith("/"):
3831 # There is still one illegal annotation in the file ..
3832 new_start_idx += 1
3834 entities_per_document[document_id].append(
3835 Entity((new_start_idx, new_end_idx), GENE_TAG)
3836 )
3838 assert document_text[new_start_idx:new_end_idx] == columns[2]
3840 return InternalBioNerDataset(
3841 documents=documents, entities_per_document=entities_per_document
3842 )
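# Worked example for the offset conversion above (illustrative): the GENE.eval
# offsets count non-whitespace characters only. In the sentence
#
#     "the p53 gene"
#
# the mention "p53" is annotated as (3, 5), because 'p' and '3' are its 4th
# and 6th non-whitespace characters (indices 3 and 5 when counting only
# non-space characters). The loop above maps this back to the true character
# span (4, 7), so that document_text[4:7] == "p53".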
3845class HUNER_GENE_BC2GM(HunerDataset):
3846 """
3847 HUNER version of the BioCreative-II-GM corpus containing gene annotations.
3848 """
3850 def __init__(self, *args, **kwargs):
3851 super().__init__(
3852 *args, **kwargs,
3853 )
3855 @staticmethod
3856 def split_url() -> str:
3857 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bc2gm"
3859 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
3860 data_dir = BC2GM.download_dataset(data_dir)
3861 train_data = BC2GM.parse_train_dataset(data_dir)
3862 test_data = BC2GM.parse_test_dataset(data_dir)
3864 return merge_datasets([train_data, test_data])
3867class CEMP(ColumnCorpus):
3868 """
3869 Original CEMP corpus containing chemical annotations.
3871 For further information see:
3872 https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/cemp-detailed-task-description/
3873 """
3875 def __init__(
3876 self,
3877 base_path: Union[str, Path] = None,
3878 in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
3880 ):
3881 """
3882 :param base_path: Path to the corpus on your machine
3883 :param in_memory: If True, keeps dataset in memory giving speedups in training.
3884 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
3885 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
3886 """
3888 if type(base_path) == str:
3889 base_path: Path = Path(base_path)
3891 # column format
3892 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
3894 # this dataset name
3895 dataset_name = self.__class__.__name__.lower()
3897 # default dataset folder is the cache root
3898 if not base_path:
3899 base_path = flair.cache_root / "datasets"
3900 data_folder = base_path / dataset_name
3902 if sentence_splitter is None:
3903 sentence_splitter = SciSpacySentenceSplitter()
3905 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
3906 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
3908 if not (train_file.exists() and dev_file.exists()):
3909 train_folder = self.download_train_corpus(data_folder)
3910 train_text_file = train_folder / "chemdner_patents_train_text.txt"
3911 train_ann_file = train_folder / "chemdner_cemp_gold_standard_train.tsv"
3912 train_data = self.parse_input_file(train_text_file, train_ann_file)
3914 dev_folder = self.download_dev_corpus(data_folder)
3915 dev_text_file = dev_folder / "chemdner_patents_development_text.txt"
3916 dev_ann_file = (
3917 dev_folder / "chemdner_cemp_gold_standard_development_v03.tsv"
3918 )
3919 dev_data = self.parse_input_file(dev_text_file, dev_ann_file)
3921 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
3922 conll_writer.write_to_conll(train_data, train_file)
3923 conll_writer.write_to_conll(dev_data, dev_file)
3925 super(CEMP, self).__init__(
3926 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
3927 )
3929 @classmethod
3930 def download_train_corpus(cls, data_dir: Path) -> Path:
3931 corpus_dir = data_dir / "original"
3932 os.makedirs(str(corpus_dir), exist_ok=True)
3934 train_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/cemp_training_set.tar.gz"
3935 data_path = cached_path(train_url, corpus_dir)
3936 unpack_file(data_path, corpus_dir)
3938 return corpus_dir / "cemp_training_set"
3940 @classmethod
3941 def download_dev_corpus(cls, data_dir) -> Path:
3942 corpus_dir = data_dir / "original"
3943 os.makedirs(str(corpus_dir), exist_ok=True)
3945 dev_url = "https://biocreative.bioinformatics.udel.edu/media/store/files/2015/cemp_development_set_v03.tar.gz"
3946 data_path = cached_path(dev_url, corpus_dir)
3947 unpack_file(data_path, corpus_dir)
3949 return corpus_dir / "cemp_development_set_v03"
3951 @staticmethod
3952 def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset:
3953 documents = {}
3954 entities_per_document = {}
3955 document_abstract_length = {}
3957 with open(str(text_file), "r", encoding="utf8") as text_reader:
3958 for line in text_reader:
                if not line.strip():
                    continue
3962 document_id, title, abstract = line.split("\t")
3964 # Abstract first, title second to prevent issues with sentence splitting
3965 documents[document_id] = abstract + " " + title
3966 document_abstract_length[document_id] = len(abstract) + 1
3968 entities_per_document[document_id] = []
3970 with open(str(ann_file), "r", encoding="utf8") as ann_reader:
3971 for line in ann_reader:
                if not line.strip():
                    continue
3975 columns = line.split("\t")
3976 document_id = columns[0]
3977 start, end = int(columns[2]), int(columns[3])
3979 if columns[1] == "T":
3980 start = start + document_abstract_length[document_id]
3981 end = end + document_abstract_length[document_id]
3983 entities_per_document[document_id].append(
3984 Entity((start, end), columns[5].strip())
3985 )
3987 document_text = documents[document_id]
3988 assert columns[4] == document_text[start:end]
3990 return InternalBioNerDataset(
3991 documents=documents, entities_per_document=entities_per_document
3992 )
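# Note on the offset arithmetic above (illustrative): the documents are stored
# as "<abstract> <title>" (abstract first), while the gold-standard offsets
# are relative to the title ("T" in column 1) or, presumably, the abstract
# otherwise. Title annotations are therefore shifted by len(abstract) + 1:
#
#     documents[doc_id] = "Some abstract. A patent title"
#     # abstract offsets start at index 0; title offset 0 maps to index 15,
#     # since len("Some abstract.") + 1 == 15.
#
# The final assert re-checks every reconstructed span against the gold
# surface form in column 4.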
3995class HUNER_CHEMICAL_CEMP(HunerDataset):
3996 """
3997 HUNER version of the CEMP corpus containing chemical annotations.
3998 """
4000 def __init__(self, *args, **kwargs):
4001 super().__init__(*args, **kwargs)
4003 @staticmethod
4004 def split_url() -> str:
4005 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/cemp"
4007 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
4008 train_folder = CEMP.download_train_corpus(data_dir)
4009 train_text_file = train_folder / "chemdner_patents_train_text.txt"
4010 train_ann_file = train_folder / "chemdner_cemp_gold_standard_train.tsv"
4011 train_data = CEMP.parse_input_file(train_text_file, train_ann_file)
4013 dev_folder = CEMP.download_dev_corpus(data_dir)
4014 dev_text_file = dev_folder / "chemdner_patents_development_text.txt"
4015 dev_ann_file = dev_folder / "chemdner_cemp_gold_standard_development_v03.tsv"
4016 dev_data = CEMP.parse_input_file(dev_text_file, dev_ann_file)
4018 dataset = merge_datasets([train_data, dev_data])
4019 entity_type_mapping = {
4020 x: CHEMICAL_TAG
4021 for x in [
4022 "ABBREVIATION",
4023 "FAMILY",
4024 "FORMULA",
4025 "IDENTIFIERS",
4026 "MULTIPLE",
4027 "SYSTEMATIC",
4028 "TRIVIAL",
4029 ]
4030 }
4031 return filter_and_map_entities(dataset, entity_type_mapping)
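# What the mapping above does (filter_and_map_entities is defined earlier in
# this module): entities whose type occurs as a key are kept and relabelled
# with the mapped tag, while all other entity types are dropped. For CEMP this
# collapses the seven chemical mention subtypes into the single coarse label
# "Chemical".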
4034class CHEBI(ColumnCorpus):
4035 """
4036 Original CHEBI corpus containing all annotations.
4038 For further information see Shardlow et al.:
4039 A New Corpus to Support Text Mining for the Curation of Metabolites in the ChEBI Database
4040 http://www.lrec-conf.org/proceedings/lrec2018/pdf/229.pdf
4041 """
4043 def __init__(
4044 self,
4045 base_path: Union[str, Path] = None,
4046 in_memory: bool = True,
4047 sentence_splitter: SentenceSplitter = None,
4048 annotator: int = 0,
4049 ):
4050 """
4051 :param base_path: Path to the corpus on your machine
4052 :param in_memory: If True, keeps dataset in memory giving speedups in training.
4053 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
4054 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
4055 :param annotator: The abstracts have been annotated by two annotators, which can be
4056 selected by choosing annotator 1 or 2. If annotator is 0, the union of both annotations is used.
4057 """
4058 if type(base_path) == str:
4059 base_path: Path = Path(base_path)
4061 # column format
4062 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
4064 # this dataset name
4065 dataset_name = self.__class__.__name__.lower()
4067 # default dataset folder is the cache root
4068 if not base_path:
4069 base_path = flair.cache_root / "datasets"
4070 data_folder = base_path / dataset_name
4072 if sentence_splitter is None:
4073 sentence_splitter = SciSpacySentenceSplitter()
4075 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
        if not train_file.exists():
4078 corpus_dir = self.download_dataset(data_folder)
4079 full_dataset = self.parse_dataset(corpus_dir, annotator=annotator)
4081 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
4082 conll_writer.write_to_conll(full_dataset, train_file)
4084 super(CHEBI, self).__init__(
4085 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4086 )
4088 @staticmethod
4089 def download_dataset(data_dir: Path) -> Path:
4090 data_url = "http://www.nactem.ac.uk/chebi/ChEBI.zip"
4091 data_path = cached_path(data_url, data_dir)
4092 unpack_file(data_path, data_dir)
4094 return data_dir / "ChEBI"
4096 @staticmethod
4097 def parse_dataset(data_dir: Path, annotator: int) -> InternalBioNerDataset:
4098 abstract_folder = data_dir / "abstracts"
4099 fulltext_folder = data_dir / "fullpapers"
4101 if annotator == 0:
4102 annotation_dirs = ["Annotator1", "Annotator2"]
        elif annotator in (1, 2):
4104 annotation_dirs = [f"Annotator{annotator}"]
4105 else:
4106 raise ValueError("Invalid value for annotator")
4108 documents = {}
4109 entities_per_document = {}
4111 abstract_ids = [
4112 x.name[:-4]
4113 for x in (abstract_folder / annotation_dirs[0]).iterdir()
4114 if x.name[-4:] == ".txt"
4115 ]
4116 fulltext_ids = [
4117 x.name[:-4] for x in fulltext_folder.iterdir() if x.name[-4:] == ".txt"
4118 ]
4120 for abstract_id in abstract_ids:
4121 abstract_id_output = abstract_id + "_A"
4122 with open(
4123 abstract_folder / annotation_dirs[0] / f"{abstract_id}.txt", "r", encoding="utf8"
4124 ) as f:
4125 documents[abstract_id_output] = f.read()
            entities = []
            for annotation_dir in annotation_dirs:
                with open(
                    abstract_folder / annotation_dir / f"{abstract_id}.ann",
                    "r",
                    encoding="utf8",
                ) as f:
                    # Union across annotators: with annotator=0 both
                    # directories are read and their entities are combined,
                    # matching the constructor documentation.
                    entities.extend(CHEBI.get_entities(f))
            entities_per_document[abstract_id_output] = entities
4134 for fulltext_id in fulltext_ids:
4135 fulltext_id_output = fulltext_id + "_F"
4136 with open(fulltext_folder / f"{fulltext_id}.txt", "r", encoding="utf8") as f:
4137 documents[fulltext_id_output] = f.read()
4139 with open(fulltext_folder / f"{fulltext_id}.ann", "r", encoding="utf8") as f:
4140 entities = CHEBI.get_entities(f)
4141 entities_per_document[fulltext_id_output] = entities
4143 return InternalBioNerDataset(
4144 documents=documents, entities_per_document=entities_per_document
4145 )
4147 @staticmethod
4148 def get_entities(f):
4149 entities = []
4150 for line in f:
4151 if not line.strip() or line[0] != "T":
4152 continue
4153 parts = line.split("\t")[1].split()
4154 entity_type = parts[0]
4155 char_offsets = " ".join(parts[1:])
4156 for start_end in char_offsets.split(";"):
4157 start, end = start_end.split(" ")
4158 entities += [Entity((int(start), int(end)), entity_type)]
4160 return entities
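# Example of a brat-style standoff line handled by get_entities (illustrative,
# using a discontinuous span; "<TAB>" stands for a tab character):
#
#     T1<TAB>Chemical 10 24;30 35<TAB>some surface form
#
# Only text-bound annotations (id starting with "T") are parsed; the line
# above yields two entities, Chemical(10,24) and Chemical(30,35).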
4163class HUNER_CHEMICAL_CHEBI(HunerDataset):
4164 """
4165 HUNER version of the CHEBI corpus containing chemical annotations.
4166 """
4168 def __init__(self, *args, **kwargs):
4169 super().__init__(*args, **kwargs)
4171 @staticmethod
4172 def split_url() -> str:
4173 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new"
4175 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset:
4176 corpus_dir = CHEBI.download_dataset(data_dir)
4177 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator)
4178 entity_type_mapping = {"Chemical": CHEMICAL_TAG}
4179 return filter_and_map_entities(dataset, entity_type_mapping)
4182class HUNER_GENE_CHEBI(HunerDataset):
4183 """
4184 HUNER version of the CHEBI corpus containing gene annotations.
4185 """
4187 def __init__(self, *args, **kwargs):
4188 super().__init__(*args, **kwargs)
4190 @staticmethod
4191 def split_url() -> str:
4192 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new"
4194 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset:
4195 corpus_dir = CHEBI.download_dataset(data_dir)
4196 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator)
4197 entity_type_mapping = {"Protein": GENE_TAG}
4198 return filter_and_map_entities(dataset, entity_type_mapping)
4201class HUNER_SPECIES_CHEBI(HunerDataset):
4202 """
4203 HUNER version of the CHEBI corpus containing species annotations.
4204 """
4206 def __init__(self, *args, **kwargs):
4207 super().__init__(*args, **kwargs)
4209 @staticmethod
4210 def split_url() -> str:
4211 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chebi_new"
4213 def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDataset:
4214 corpus_dir = CHEBI.download_dataset(data_dir)
4215 dataset = CHEBI.parse_dataset(corpus_dir, annotator=annotator)
4216 entity_type_mapping = {"Species": SPECIES_TAG}
4217 return filter_and_map_entities(dataset, entity_type_mapping)
4220class BioNLPCorpus(ColumnCorpus):
4221 """
4222 Base class for corpora from BioNLP event extraction shared tasks
4224 For further information see:
4225 http://2013.bionlp-st.org/Intro
4226 """
4228 def __init__(
4229 self,
4230 base_path: Union[str, Path] = None,
4231 in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
4233 ):
4234 """
4235 :param base_path: Path to the corpus on your machine
4236 :param in_memory: If True, keeps dataset in memory giving speedups in training.
4237 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents
4238 into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
4239 """
4241 if type(base_path) == str:
4242 base_path: Path = Path(base_path)
4244 # column format
4245 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
4247 # this dataset name
4248 dataset_name = self.__class__.__name__.lower()
4250 # default dataset folder is the cache root
4251 if not base_path:
4252 base_path = flair.cache_root / "datasets"
4253 data_folder = base_path / dataset_name
4255 if sentence_splitter is None:
4256 sentence_splitter = SciSpacySentenceSplitter()
4258 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
4259 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
4260 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
4262 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4263 train_folder, dev_folder, test_folder = self.download_corpus(
4264 data_folder / "original"
4265 )
4267 train_data = self.parse_input_files(train_folder)
4268 dev_data = self.parse_input_files(dev_folder)
4269 test_data = self.parse_input_files(test_folder)
4271 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
4272 conll_writer.write_to_conll(train_data, train_file)
4273 conll_writer.write_to_conll(dev_data, dev_file)
4274 conll_writer.write_to_conll(test_data, test_file)
4276 super(BioNLPCorpus, self).__init__(
4277 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4278 )
4280 @staticmethod
4281 @abstractmethod
    def download_corpus(data_folder: Path) -> Tuple[Path, Path, Path]:
4283 pass
4285 @staticmethod
4286 def parse_input_files(input_folder: Path) -> InternalBioNerDataset:
4287 documents = {}
4288 entities_per_document = {}
4290 for txt_file in input_folder.glob("*.txt"):
4291 name = txt_file.with_suffix("").name
4292 a1_file = txt_file.with_suffix(".a1")
4294 with txt_file.open(encoding="utf8") as f:
4295 documents[name] = f.read()
4297 with a1_file.open(encoding="utf8") as ann_reader:
4298 entities = []
4300 for line in ann_reader:
4301 fields = line.strip().split("\t")
4302 if fields[0].startswith("T"):
4303 ann_type, start, end = fields[1].split()
4304 entities.append(
4305 Entity(
4306 char_span=(int(start), int(end)), entity_type=ann_type
4307 )
4308 )
4309 entities_per_document[name] = entities
4311 return InternalBioNerDataset(
4312 documents=documents, entities_per_document=entities_per_document
4313 )
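# Illustrative .a1 line consumed by parse_input_files (BioNLP standoff format;
# "<TAB>" stands for a tab character):
#
#     T3<TAB>Gene_or_gene_product 232 239<TAB>MIG6
#
# which becomes Entity(char_span=(232, 239), entity_type="Gene_or_gene_product").
# Lines with other id prefixes are skipped.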
4316class BIONLP2013_PC(BioNLPCorpus):
4317 """
4318 Corpus of the BioNLP'2013 Pathway Curation shared task
4320 For further information see Ohta et al.
4321 Overview of the pathway curation (PC) task of bioNLP shared task 2013.
4322 https://www.aclweb.org/anthology/W13-2009/
4323 """
4325 @staticmethod
4326 def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
4327 train_url = (
4328 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_training_data.tar.gz"
4329 )
4330 dev_url = (
4331 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_development_data.tar.gz"
4332 )
4333 test_url = "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_PC_test_data.tar.gz"
4335 cached_path(train_url, download_folder)
4336 cached_path(dev_url, download_folder)
4337 cached_path(test_url, download_folder)
4339 unpack_file(
4340 download_folder / "BioNLP-ST_2013_PC_training_data.tar.gz",
4341 download_folder,
4342 keep=False,
4343 )
4344 unpack_file(
4345 download_folder / "BioNLP-ST_2013_PC_development_data.tar.gz",
4346 download_folder,
4347 keep=False,
4348 )
4349 unpack_file(
4350 download_folder / "BioNLP-ST_2013_PC_test_data.tar.gz",
4351 download_folder,
4352 keep=False,
4353 )
4355 train_folder = download_folder / "BioNLP-ST_2013_PC_training_data"
4356 dev_folder = download_folder / "BioNLP-ST_2013_PC_development_data"
4357 test_folder = download_folder / "BioNLP-ST_2013_PC_test_data"
4359 return train_folder, dev_folder, test_folder
4362class BIONLP2013_CG(BioNLPCorpus):
4363 """
4364 Corpus of the BioNLP'2013 Cancer Genetics shared task
4366 For further information see Pyysalo, Ohta & Ananiadou 2013
4367 Overview of the Cancer Genetics (CG) task of BioNLP Shared Task 2013
4368 https://www.aclweb.org/anthology/W13-2008/
4369 """
4371 @staticmethod
4372 def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
4373 train_url = (
4374 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_training_data.tar.gz"
4375 )
4376 dev_url = (
4377 "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_development_data.tar.gz"
4378 )
4379 test_url = "http://2013.bionlp-st.org/tasks/BioNLP-ST_2013_CG_test_data.tar.gz"
4381 download_folder = download_folder / "original"
4383 cached_path(train_url, download_folder)
4384 cached_path(dev_url, download_folder)
4385 cached_path(test_url, download_folder)
4387 unpack_file(
4388 download_folder / "BioNLP-ST_2013_CG_training_data.tar.gz",
4389 download_folder,
4390 keep=False,
4391 )
4392 unpack_file(
4393 download_folder / "BioNLP-ST_2013_CG_development_data.tar.gz",
4394 download_folder,
4395 keep=False,
4396 )
4397 unpack_file(
4398 download_folder / "BioNLP-ST_2013_CG_test_data.tar.gz",
4399 download_folder,
4400 keep=False,
4401 )
4403 train_folder = download_folder / "BioNLP-ST_2013_CG_training_data"
4404 dev_folder = download_folder / "BioNLP-ST_2013_CG_development_data"
4405 test_folder = download_folder / "BioNLP-ST_2013_CG_test_data"
4407 return train_folder, dev_folder, test_folder
4410class ANAT_EM(ColumnCorpus):
4411 """
4412 Corpus for anatomical named entity mention recognition.
4414 For further information see Pyysalo and Ananiadou:
4415 Anatomical entity mention recognition at literature scale
4416 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957068/
4417 http://nactem.ac.uk/anatomytagger/#AnatEM
4418 """
4420 def __init__(
4421 self,
4422 base_path: Union[str, Path] = None,
4423 in_memory: bool = True,
4424 tokenizer: Tokenizer = None,
4425 ):
4426 """
4427 :param base_path: Path to the corpus on your machine
4428 :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param tokenizer: Implementation of :class:`Tokenizer` which segments
            sentences into tokens (default :class:`SciSpacyTokenizer`)
4431 """
4432 if type(base_path) == str:
4433 base_path: Path = Path(base_path)
4435 # column format
4436 columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}
4438 # this dataset name
4439 dataset_name = self.__class__.__name__.lower()
4441 # default dataset folder is the cache root
4442 if not base_path:
4443 base_path = flair.cache_root / "datasets"
4444 data_folder = base_path / dataset_name
4446 if tokenizer is None:
4447 tokenizer = SciSpacyTokenizer()
4449 sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer)
4451 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
4452 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
4453 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
4455 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4456 corpus_folder = self.download_corpus(data_folder)
4458 train_data = self.parse_input_files(
4459 corpus_folder / "nersuite" / "train", SENTENCE_TAG
4460 )
4461 dev_data = self.parse_input_files(
4462 corpus_folder / "nersuite" / "devel", SENTENCE_TAG
4463 )
4464 test_data = self.parse_input_files(
4465 corpus_folder / "nersuite" / "test", SENTENCE_TAG
4466 )
4468 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
4469 conll_writer.write_to_conll(train_data, train_file)
4470 conll_writer.write_to_conll(dev_data, dev_file)
4471 conll_writer.write_to_conll(test_data, test_file)
4473 super(ANAT_EM, self).__init__(
4474 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4475 )
    @staticmethod
    def download_corpus(data_folder: Path) -> Path:
4480 corpus_url = "http://nactem.ac.uk/anatomytagger/AnatEM-1.0.2.tar.gz"
4481 corpus_archive = cached_path(corpus_url, data_folder)
4483 unpack_file(
4484 corpus_archive, data_folder, keep=True, mode="targz",
4485 )
4487 return data_folder / "AnatEM-1.0.2"
4489 @staticmethod
4490 def parse_input_files(
4491 input_dir: Path, sentence_separator: str
4492 ) -> InternalBioNerDataset:
4493 documents = {}
4494 entities_per_document = {}
4496 input_files = [
4497 file
4498 for file in os.listdir(str(input_dir))
4499 if file.endswith(".nersuite") and not file.startswith("._")
4500 ]
4502 for input_file in input_files:
4503 document_id = input_file.replace(".nersuite", "")
4504 document_text = ""
4506 entities = []
4507 entity_type = None
4508 entity_start = None
4510 sent_offset = 0
4511 last_offset = 0
            input_file = open(str(input_dir / input_file), "r", encoding="utf8")
            for line in input_file:
4515 line = line.strip()
4516 if line:
4517 tag, start, end, word, _, _, _ = line.split("\t")
4519 start = int(start) + sent_offset
4520 end = int(end) + sent_offset
4522 document_text += " " * (start - last_offset)
4523 document_text += word
4525 if tag.startswith("B-"):
4526 if entity_type is not None:
4527 entities.append(
4528 Entity((entity_start, last_offset), entity_type)
4529 )
4531 entity_start = start
4532 entity_type = tag[2:]
4534 elif tag == "O" and entity_type is not None:
4535 entities.append(
4536 Entity((entity_start, last_offset), entity_type)
4537 )
4538 entity_type = None
4540 last_offset = end
4542 assert word == document_text[start:end]
4544 else:
4545 document_text += sentence_separator
4546 sent_offset += len(sentence_separator)
4547 last_offset += len(sentence_separator)
            input_file.close()

            documents[document_id] = document_text
4550 entities_per_document[document_id] = entities
4552 return InternalBioNerDataset(
4553 documents=documents, entities_per_document=entities_per_document
4554 )
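# The .nersuite files parsed above are tab-separated with seven columns per
# token line; empty lines mark sentence boundaries. An illustrative token
# line (only the first four columns are used here):
#
#     B-Cell<TAB>1442<TAB>1447<TAB>cells<TAB>...
#
# The offsets refer to the original document text, which does not contain the
# artificial sentence separator, so sent_offset compensates for every inserted
# separator tag.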
4557class BioBertHelper(ColumnCorpus):
4558 """
4559 Helper class to convert corpora and the respective train, dev and test split
4560 used by BioBERT.
4562 For further details see Lee et al.:
4563 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4564 https://github.com/dmis-lab/biobert
4565 """
4567 @staticmethod
4568 def download_corpora(download_dir: Path):
4569 from google_drive_downloader import GoogleDriveDownloader as gdd
4571 gdd.download_file_from_google_drive(
4572 file_id="1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh",
4573 dest_path=str(download_dir / "NERdata.zip"),
4574 unzip=True,
4575 )
    @staticmethod
    def convert_and_write(download_folder, data_folder, tag_type):
        data_folder.mkdir(parents=True, exist_ok=True)

        # The three splits differ only in their file names, so one shared
        # loop converts all of them.
        for input_name, output_name in (
            ("train.tsv", "train.conll"),
            ("devel.tsv", "dev.conll"),
            ("test.tsv", "test.conll"),
        ):
            with (download_folder / input_name).open(encoding="utf8") as f_in, (
                data_folder / output_name
            ).open("w", encoding="utf8") as f_out:
                for line in f_in:
                    if not line.strip():
                        f_out.write("\n")
                        continue

                    token, tag = line.strip().split("\t")
                    if tag != "O":
                        tag = tag + "-" + tag_type
                    f_out.write(f"{token} {tag}\n")
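# Illustrative conversion performed by convert_and_write: the BioBERT data
# ships plain "B"/"I"/"O" tags, which are specialised with the corpus-level
# entity type, e.g. for tag_type="Chemical":
#
#     aspirin<TAB>B    becomes    aspirin B-Chemical
#     .<TAB>O          becomes    . O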
4618class BIOBERT_CHEMICAL_BC4CHEMD(ColumnCorpus):
4619 """
4620 BC4CHEMD corpus with chemical annotations as used in the evaluation
4621 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4624 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4625 https://github.com/dmis-lab/biobert
4626 """
4628 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4629 columns = {0: "text", 1: "ner"}
4630 # this dataset name
4631 dataset_name = self.__class__.__name__.lower()
4633 # default dataset folder is the cache root
4634 if not base_path:
4635 base_path = flair.cache_root / "datasets"
4637 data_folder = base_path / dataset_name
4639 train_file = data_folder / "train.conll"
4640 dev_file = data_folder / "dev.conll"
4641 test_file = data_folder / "test.conll"
4643 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4644 common_path = base_path / "biobert_common"
4645 if not (common_path / "BC4CHEMD").exists():
4646 BioBertHelper.download_corpora(common_path)
4648 BioBertHelper.convert_and_write(
4649 common_path / "BC4CHEMD", data_folder, tag_type=CHEMICAL_TAG
4650 )
4651 super(BIOBERT_CHEMICAL_BC4CHEMD, self).__init__(
4652 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4653 )
4656class BIOBERT_GENE_BC2GM(ColumnCorpus):
4657 """
    BC2GM corpus with gene annotations as used in the evaluation
4659 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4662 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4663 https://github.com/dmis-lab/biobert
4664 """
4666 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4667 columns = {0: "text", 1: "ner"}
4668 # this dataset name
4669 dataset_name = self.__class__.__name__.lower()
4671 # default dataset folder is the cache root
4672 if not base_path:
4673 base_path = flair.cache_root / "datasets"
4675 data_folder = base_path / dataset_name
4677 train_file = data_folder / "train.conll"
4678 dev_file = data_folder / "dev.conll"
4679 test_file = data_folder / "test.conll"
4681 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4682 common_path = base_path / "biobert_common"
4683 if not (common_path / "BC2GM").exists():
4684 BioBertHelper.download_corpora(common_path)
4685 BioBertHelper.convert_and_write(
4686 common_path / "BC2GM", data_folder, tag_type=GENE_TAG
4687 )
4688 super(BIOBERT_GENE_BC2GM, self).__init__(
4689 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4690 )
4693class BIOBERT_GENE_JNLPBA(ColumnCorpus):
4694 """
4695 JNLPBA corpus with gene annotations as used in the evaluation
4696 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4699 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4700 https://github.com/dmis-lab/biobert
4701 """
4703 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4704 columns = {0: "text", 1: "ner"}
4705 # this dataset name
4706 dataset_name = self.__class__.__name__.lower()
4708 # default dataset folder is the cache root
4709 if not base_path:
4710 base_path = flair.cache_root / "datasets"
4712 data_folder = base_path / dataset_name
4714 train_file = data_folder / "train.conll"
4715 dev_file = data_folder / "dev.conll"
4716 test_file = data_folder / "test.conll"
4718 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4719 common_path = base_path / "biobert_common"
4720 if not (common_path / "JNLPBA").exists():
4721 BioBertHelper.download_corpora(common_path)
4722 BioBertHelper.convert_and_write(
4723 common_path / "JNLPBA", data_folder, tag_type=GENE_TAG
4724 )
4725 super(BIOBERT_GENE_JNLPBA, self).__init__(
4726 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4727 )
4730class BIOBERT_CHEMICAL_BC5CDR(ColumnCorpus):
4731 """
4732 BC5CDR corpus with chemical annotations as used in the evaluation
4733 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4736 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4737 https://github.com/dmis-lab/biobert
4738 """
4740 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4741 columns = {0: "text", 1: "ner"}
4742 # this dataset name
4743 dataset_name = self.__class__.__name__.lower()
4745 # default dataset folder is the cache root
4746 if not base_path:
4747 base_path = flair.cache_root / "datasets"
4749 data_folder = base_path / dataset_name
4751 train_file = data_folder / "train.conll"
4752 dev_file = data_folder / "dev.conll"
4753 test_file = data_folder / "test.conll"
4755 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4756 common_path = base_path / "biobert_common"
4757 if not (common_path / "BC5CDR-chem").exists():
4758 BioBertHelper.download_corpora(common_path)
4759 BioBertHelper.convert_and_write(
4760 common_path / "BC5CDR-chem", data_folder, tag_type=CHEMICAL_TAG
4761 )
4762 super(BIOBERT_CHEMICAL_BC5CDR, self).__init__(
4763 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4764 )
4767class BIOBERT_DISEASE_BC5CDR(ColumnCorpus):
4768 """
4769 BC5CDR corpus with disease annotations as used in the evaluation
4770 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4773 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4774 https://github.com/dmis-lab/biobert
4775 """
4777 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4778 columns = {0: "text", 1: "ner"}
4779 # this dataset name
4780 dataset_name = self.__class__.__name__.lower()
4782 # default dataset folder is the cache root
4783 if not base_path:
4784 base_path = flair.cache_root / "datasets"
4786 data_folder = base_path / dataset_name
4788 train_file = data_folder / "train.conll"
4789 dev_file = data_folder / "dev.conll"
4790 test_file = data_folder / "test.conll"
4792 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4793 common_path = base_path / "biobert_common"
4794 if not (common_path / "BC5CDR-disease").exists():
4795 BioBertHelper.download_corpora(common_path)
4796 BioBertHelper.convert_and_write(
4797 common_path / "BC5CDR-disease", data_folder, tag_type=DISEASE_TAG
4798 )
4799 super(BIOBERT_DISEASE_BC5CDR, self).__init__(
4800 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4801 )
4804class BIOBERT_DISEASE_NCBI(ColumnCorpus):
4805 """
4806 NCBI disease corpus as used in the evaluation of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4809 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4810 https://github.com/dmis-lab/biobert
4811 """
4813 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4814 columns = {0: "text", 1: "ner"}
4815 # this dataset name
4816 dataset_name = self.__class__.__name__.lower()
4818 # default dataset folder is the cache root
4819 if not base_path:
4820 base_path = flair.cache_root / "datasets"
4822 data_folder = base_path / dataset_name
4824 train_file = data_folder / "train.conll"
4825 dev_file = data_folder / "dev.conll"
4826 test_file = data_folder / "test.conll"
4828 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4829 common_path = base_path / "biobert_common"
4830 if not (common_path / "NCBI-disease").exists():
4831 BioBertHelper.download_corpora(common_path)
4832 BioBertHelper.convert_and_write(
4833 common_path / "NCBI-disease", data_folder, tag_type=DISEASE_TAG
4834 )
4835 super(BIOBERT_DISEASE_NCBI, self).__init__(
4836 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4837 )
4840class BIOBERT_SPECIES_LINNAEUS(ColumnCorpus):
4841 """
    LINNAEUS corpus with species annotations as used in the evaluation
4843 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4846 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4847 https://github.com/dmis-lab/biobert
4848 """
4850 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4851 columns = {0: "text", 1: "ner"}
4852 # this dataset name
4853 dataset_name = self.__class__.__name__.lower()
4855 # default dataset folder is the cache root
4856 if not base_path:
4857 base_path = flair.cache_root / "datasets"
4859 data_folder = base_path / dataset_name
4861 train_file = data_folder / "train.conll"
4862 dev_file = data_folder / "dev.conll"
4863 test_file = data_folder / "test.conll"
4865 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4866 common_path = base_path / "biobert_common"
4867 if not (common_path / "linnaeus").exists():
4868 BioBertHelper.download_corpora(common_path)
4869 BioBertHelper.convert_and_write(
4870 common_path / "linnaeus", data_folder, tag_type=SPECIES_TAG
4871 )
4872 super(BIOBERT_SPECIES_LINNAEUS, self).__init__(
4873 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4874 )
4877class BIOBERT_SPECIES_S800(ColumnCorpus):
4878 """
4879 S800 corpus with species annotations as used in the evaluation
4880 of BioBERT.
    For further details regarding BioBERT and its evaluation, see Lee et al.:
4883 https://academic.oup.com/bioinformatics/article/36/4/1234/5566506
4884 https://github.com/dmis-lab/biobert
4885 """
4887 def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
4888 columns = {0: "text", 1: "ner"}
4889 # this dataset name
4890 dataset_name = self.__class__.__name__.lower()
4892 # default dataset folder is the cache root
4893 if not base_path:
4894 base_path = flair.cache_root / "datasets"
4896 data_folder = base_path / dataset_name
4898 train_file = data_folder / "train.conll"
4899 dev_file = data_folder / "dev.conll"
4900 test_file = data_folder / "test.conll"
4902 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4903 common_path = base_path / "biobert_common"
4904 if not (common_path / "s800").exists():
4905 BioBertHelper.download_corpora(common_path)
4906 BioBertHelper.convert_and_write(
4907 common_path / "s800", data_folder, tag_type=SPECIES_TAG
4908 )
4909 super(BIOBERT_SPECIES_S800, self).__init__(
4910 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4911 )
4914class CRAFT_V4(ColumnCorpus):
4915 """
4916 Version 4.0.1 of the CRAFT corpus containing all but the co-reference and structural annotations.
4918 For further information see:
4919 https://github.com/UCDenver-ccp/CRAFT
4920 """
4922 def __init__(
4923 self,
4924 base_path: Union[str, Path] = None,
4925 in_memory: bool = True,
4926 sentence_splitter: SentenceSplitter = None,
4927 ):
4928 """
4929 :param base_path: Path to the corpus on your machine
4930 :param in_memory: If True, keeps dataset in memory giving speedups in training.
4931 :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments
4932 documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
4933 """
4935 if type(base_path) == str:
4936 base_path: Path = Path(base_path)
4938 # column format
4939 columns = {0: "text", 1: "ner"}
4941 # this dataset name
4942 dataset_name = self.__class__.__name__.lower()
4944 # default dataset folder is the cache root
4945 if not base_path:
4946 base_path = flair.cache_root / "datasets"
4947 data_folder = base_path / dataset_name
4949 if sentence_splitter is None:
4950 sentence_splitter = SciSpacySentenceSplitter()
4952 train_file = data_folder / f"{sentence_splitter.name}_train.conll"
4953 dev_file = data_folder / f"{sentence_splitter.name}_dev.conll"
4954 test_file = data_folder / f"{sentence_splitter.name}_test.conll"
4956 if not (train_file.exists() and dev_file.exists() and test_file.exists()):
4957 corpus_dir = self.download_corpus(data_folder)
4958 corpus_data = self.parse_corpus(corpus_dir)
            # Filter for specific entity types; by default no entities are filtered
4961 corpus_data = self.filter_entities(corpus_data)
4963 train_data, dev_data, test_data = self.prepare_splits(
4964 data_folder, corpus_data
4965 )
4967 conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
4968 conll_writer.write_to_conll(train_data, train_file)
4969 conll_writer.write_to_conll(dev_data, dev_file)
4970 conll_writer.write_to_conll(test_data, test_file)
4972 super(CRAFT_V4, self).__init__(
4973 data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
4974 )
4976 def filter_entities(self, corpus: InternalBioNerDataset) -> InternalBioNerDataset:
4977 return corpus
4979 @classmethod
4980 def download_corpus(cls, data_dir: Path) -> Path:
4981 url = "https://github.com/UCDenver-ccp/CRAFT/archive/v4.0.1.tar.gz"
4982 data_path = cached_path(url, data_dir)
4983 unpack_file(data_path, data_dir, mode="targz")
4985 return data_dir / "CRAFT-4.0.1"
4987 @staticmethod
4988 def prepare_splits(
4989 data_dir: Path, corpus: InternalBioNerDataset
4990 ) -> Tuple[InternalBioNerDataset, InternalBioNerDataset, InternalBioNerDataset]:
4991 splits_dir = data_dir / "splits"
4992 os.makedirs(str(splits_dir), exist_ok=True)
4994 # Get original HUNER splits to retrieve a list of all document ids contained in V2
4995 split_urls = [
4996 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.train",
4997 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.dev",
4998 "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft.test",
4999 ]
5001 splits = {}
5002 for url in split_urls:
5003 split_file = cached_path(url, splits_dir)
5004 with open(str(split_file), "r", encoding="utf8") as split_reader:
5005 splits[url.split(".")[-1]] = [
5006 line.strip() for line in split_reader if line.strip()
5007 ]
5009 train_documents, train_entities = {}, {}
5010 dev_documents, dev_entities = {}, {}
5011 test_documents, test_entities = {}, {}
5013 for document_id, document_text in corpus.documents.items():
5014 if document_id in splits["train"] or document_id in splits["dev"]:
5015 # train and dev split of V2 will be train in V4
5016 train_documents[document_id] = document_text
5017 train_entities[document_id] = corpus.entities_per_document[document_id]
5018 elif document_id in splits["test"]:
5019 # test split of V2 will be dev in V4
5020 dev_documents[document_id] = document_text
5021 dev_entities[document_id] = corpus.entities_per_document[document_id]
5022 else:
5023 # New documents in V4 will become test documents
5024 test_documents[document_id] = document_text
5025 test_entities[document_id] = corpus.entities_per_document[document_id]
5027 train_corpus = InternalBioNerDataset(
5028 documents=train_documents, entities_per_document=train_entities
5029 )
5030 dev_corpus = InternalBioNerDataset(
5031 documents=dev_documents, entities_per_document=dev_entities
5032 )
5033 test_corpus = InternalBioNerDataset(
5034 documents=test_documents, entities_per_document=test_entities
5035 )
5037 return train_corpus, dev_corpus, test_corpus
5039 @staticmethod
5040 def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset:
5041 documents = {}
5042 entities_per_document = {}
5044 text_dir = corpus_dir / "articles" / "txt"
5045 document_texts = [doc for doc in text_dir.iterdir() if doc.name[-4:] == ".txt"]
5046 annotation_dirs = [
5047 path
5048 for path in (corpus_dir / "concept-annotation").iterdir()
5049 if path.name not in ["sections-and-typography", "coreference"]
5050 and path.is_dir()
5051 ]
5053 for doc in Tqdm.tqdm(document_texts, desc="Converting to internal"):
5054 document_id = doc.name.split(".")[0]
5056 with open(doc, "r", encoding="utf8") as f_txt:
5057 documents[document_id] = f_txt.read()
5059 entities = []
5061 for annotation_dir in annotation_dirs:
5062 with open(
5063 annotation_dir
5064 / annotation_dir.parts[-1]
5065 / "knowtator"
5066 / (doc.name + ".knowtator.xml"),
5067 "r",
5068 encoding="utf8"
5069 ) as f_ann:
5070 ann_tree = etree.parse(f_ann)
5071 for annotation in ann_tree.xpath("//annotation"):
5072 for span in annotation.xpath("span"):
5073 start = int(span.get("start"))
5074 end = int(span.get("end"))
5075 entities += [Entity((start, end), annotation_dir.name.lower())]
5077 entities_per_document[document_id] = entities
5079 return InternalBioNerDataset(
5080 documents=documents, entities_per_document=entities_per_document
5081 )
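# Directory layout assumed by parse_corpus, as encoded in the path
# construction above: each concept type <TYPE> is annotated under
#
#     concept-annotation/<TYPE>/<TYPE>/knowtator/<doc>.txt.knowtator.xml
#
# e.g. for the CHEBI ontology: concept-annotation/CHEBI/CHEBI/knowtator/...,
# and the lower-cased directory name ("chebi") becomes the entity type that
# the HUNER_*_CRAFT_V4 classes below map to their coarse tags.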
5084class HUNER_CHEMICAL_CRAFT_V4(HunerDataset):
5085 """
5086 HUNER version of the CRAFT corpus containing (only) chemical annotations.
5087 """
5089 def __init__(self, *args, **kwargs):
5090 super().__init__(
5091 *args, **kwargs,
5092 )
5094 @staticmethod
5095 def split_url() -> str:
5096 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4"
5098 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
5099 corpus_dir = CRAFT_V4.download_corpus(data_dir)
5100 corpus = CRAFT_V4.parse_corpus(corpus_dir)
5102 entity_type_mapping = {"chebi": CHEMICAL_TAG}
5103 return filter_and_map_entities(corpus, entity_type_mapping)
5106class HUNER_GENE_CRAFT_V4(HunerDataset):
5107 """
5108 HUNER version of the CRAFT corpus containing (only) gene annotations.
5109 """
5111 def __init__(self, *args, **kwargs):
5112 super().__init__(
5113 *args, **kwargs,
5114 )
5116 @staticmethod
5117 def split_url() -> str:
5118 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4"
5120 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
5121 corpus_dir = CRAFT_V4.download_corpus(data_dir)
5122 corpus = CRAFT_V4.parse_corpus(corpus_dir)
5124 entity_type_mapping = {"pr": GENE_TAG}
5125 return filter_and_map_entities(corpus, entity_type_mapping)
5128class HUNER_SPECIES_CRAFT_V4(HunerDataset):
5129 """
5130 HUNER version of the CRAFT corpus containing (only) species annotations.
5131 """
5133 def __init__(self, *args, **kwargs):
5134 super().__init__(
5135 *args, **kwargs,
5136 )
5138 @staticmethod
5139 def split_url() -> str:
5140 return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/craft_v4"
5142 def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
5143 corpus_dir = CRAFT_V4.download_corpus(data_dir)
5144 corpus = CRAFT_V4.parse_corpus(corpus_dir)
5146 entity_type_mapping = {"ncbitaxon": SPECIES_TAG}
5147 return filter_and_map_entities(corpus, entity_type_mapping)


class HUNER_CHEMICAL_BIONLP2013_CG(HunerDataset):
    """
    HUNER version of the BioNLP 2013 Cancer Genetics (CG) corpus containing only chemical annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir)
        train_corpus = BioNLPCorpus.parse_input_files(train_dir)
        dev_corpus = BioNLPCorpus.parse_input_files(dev_dir)
        test_corpus = BioNLPCorpus.parse_input_files(test_dir)
        corpus = merge_datasets([train_corpus, dev_corpus, test_corpus])

        entity_type_mapping = {"Simple_chemical": CHEMICAL_TAG}
        return filter_and_map_entities(corpus, entity_type_mapping)


class HUNER_DISEASE_BIONLP2013_CG(HunerDataset):
    """
    HUNER version of the BioNLP 2013 Cancer Genetics (CG) corpus containing only disease annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir)
        train_corpus = BioNLPCorpus.parse_input_files(train_dir)
        dev_corpus = BioNLPCorpus.parse_input_files(dev_dir)
        test_corpus = BioNLPCorpus.parse_input_files(test_dir)
        corpus = merge_datasets([train_corpus, dev_corpus, test_corpus])

        entity_type_mapping = {"Cancer": DISEASE_TAG}
        return filter_and_map_entities(corpus, entity_type_mapping)


class HUNER_GENE_BIONLP2013_CG(HunerDataset):
    """
    HUNER version of the BioNLP 2013 Cancer Genetics (CG) corpus containing only gene annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir)
        train_corpus = BioNLPCorpus.parse_input_files(train_dir)
        dev_corpus = BioNLPCorpus.parse_input_files(dev_dir)
        test_corpus = BioNLPCorpus.parse_input_files(test_dir)
        corpus = merge_datasets([train_corpus, dev_corpus, test_corpus])

        entity_type_mapping = {"Gene_or_gene_product": GENE_TAG}
        return filter_and_map_entities(corpus, entity_type_mapping)


class HUNER_SPECIES_BIONLP2013_CG(HunerDataset):
    """
    HUNER version of the BioNLP 2013 Cancer Genetics (CG) corpus containing only species annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/bionlp2013_cg"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        train_dir, dev_dir, test_dir = BIONLP2013_CG.download_corpus(data_dir)
        train_corpus = BioNLPCorpus.parse_input_files(train_dir)
        dev_corpus = BioNLPCorpus.parse_input_files(dev_dir)
        test_corpus = BioNLPCorpus.parse_input_files(test_dir)
        corpus = merge_datasets([train_corpus, dev_corpus, test_corpus])

        entity_type_mapping = {"Organism": SPECIES_TAG}
        return filter_and_map_entities(corpus, entity_type_mapping)
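

# --- Illustrative sketch (not part of the original module) -------------------
# All HUNER wrappers above follow the same recipe: download and parse the
# underlying corpus once, then keep only the entity types of interest while
# renaming them to the shared HUNER tags. The hypothetical helper below is a
# minimal, self-contained demonstration of that mapping step on a toy
# two-entity document; the offsets refer to the string literal and were
# chosen by hand.
def _example_cg_label_mapping() -> InternalBioNerDataset:
    toy_corpus = InternalBioNerDataset(
        documents={"doc1": "BRCA1 is linked to breast cancer."},
        entities_per_document={
            "doc1": [
                Entity((0, 5), "Gene_or_gene_product"),
                Entity((19, 32), "Cancer"),
            ]
        },
    )
    # Only the "Cancer" span survives and is re-typed to the shared Disease
    # tag; the gene annotation is dropped because it is not in the mapping.
    return filter_and_map_entities(toy_corpus, {"Cancer": DISEASE_TAG})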


class AZDZ(ColumnCorpus):
    """
    Arizona Disease Corpus from the Biomedical Informatics Lab at Arizona State University.

    For further information see:
    http://diego.asu.edu/index.php
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        tokenizer: Tokenizer = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param tokenizer: Implementation of :class:`Tokenizer` which segments sentences
            into tokens (default :class:`SciSpacyTokenizer`)
        """

        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        if tokenizer is None:
            tokenizer = SciSpacyTokenizer()
        sentence_splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=tokenizer)

        train_file = data_folder / f"{sentence_splitter.name}_train.conll"

        if not train_file.exists():
            corpus_file = self.download_corpus(data_folder)
            corpus_data = self.parse_corpus(corpus_file)

            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
            conll_writer.write_to_conll(corpus_data, train_file)

        super().__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )
    @classmethod
    def download_corpus(cls, data_dir: Path) -> Path:
        url = "http://diego.asu.edu/downloads/AZDC_6-26-2009.txt"
        data_path = cached_path(url, data_dir)

        return data_path

    @staticmethod
    def parse_corpus(input_file: Path) -> InternalBioNerDataset:
        documents = {}
        entities_per_document = {}

        with open(str(input_file), "r", encoding="iso-8859-1") as azdz_reader:
            prev_document_id = None
            prev_sentence_id = None

            document_text = None
            entities = []
            offset = None

            for line in azdz_reader:
                line = line.strip()
                if not line or line.startswith("Doc Id"):
                    continue

                columns = line.split("\t")

                document_id = columns[1]  # PMID
                sentence_id = document_id + "_" + columns[2]  # PMID + sentence no

                # Flush the completed document as soon as a new PMID starts
                if document_id != prev_document_id and document_text:
                    documents[prev_document_id] = document_text
                    entities_per_document[prev_document_id] = entities

                    document_text = None
                    entities = []
                    offset = None

                if sentence_id != prev_sentence_id:
                    # The new sentence starts right after the sentence tag
                    offset = (
                        len(document_text) + len(SENTENCE_TAG) if document_text else 0
                    )
                    document_text = (
                        document_text + SENTENCE_TAG + columns[3].strip()
                        if document_text
                        else columns[3].strip()
                    )

                prev_document_id = document_id
                prev_sentence_id = sentence_id

                try:
                    start = offset + int(columns[4]) - 1
                    end = offset + int(columns[5])
                except (ValueError, IndexError):
                    # Rows without (valid) annotation offsets are skipped
                    continue

                if end == 0:
                    continue

                entities.append(Entity((start, end), DISEASE_TAG))

            # Flush the last document
            if document_text and prev_document_id:
                documents[prev_document_id] = document_text
                entities_per_document[prev_document_id] = entities

        return InternalBioNerDataset(
            documents=documents, entities_per_document=entities_per_document
        )
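

# --- Illustrative usage sketch (not part of the original module) -------------
# AZDZ is loaded like any other ColumnCorpus: on first use the raw file is
# downloaded, re-segmented on the internal SENTENCE_TAG marker and cached in
# CoNLL format. Only a train file is written to disk; missing splits are
# handled by the base Corpus class. SpaceTokenizer is shown merely as a
# lightweight alternative to the default SciSpacyTokenizer.
def _example_load_azdz():
    corpus = AZDZ(tokenizer=SpaceTokenizer())
    return len(corpus.train)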


class PDR(ColumnCorpus):
    """
    Corpus of plant-disease relations from Kim et al., consisting of named entity annotations
    for plants and diseases.

    For further information see Kim et al.:
    A corpus of plant-disease relations in the biomedical domain
    https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582
    http://gcancer.org/pdr/
    """

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        in_memory: bool = True,
        sentence_splitter: SentenceSplitter = None,
    ):
        """
        :param base_path: Path to the corpus on your machine
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param sentence_splitter: Implementation of :class:`SentenceSplitter` which
            segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
        """

        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner", 2: ColumnDataset.SPACE_AFTER_KEY}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        if sentence_splitter is None:
            sentence_splitter = SciSpacySentenceSplitter()

        train_file = data_folder / f"{sentence_splitter.name}_train.conll"

        if not train_file.exists():
            corpus_folder = self.download_corpus(data_folder)
            corpus_data = brat_to_internal(
                corpus_folder, ann_file_suffixes=[".ann", ".ann2"]
            )

            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
            conll_writer.write_to_conll(corpus_data, train_file)

        super().__init__(
            data_folder, columns, tag_to_bioes="ner", in_memory=in_memory
        )

    @classmethod
    def download_corpus(cls, data_dir: Path) -> Path:
        url = "http://gcancer.org/pdr/Plant-Disease_Corpus.tar.gz"
        data_path = cached_path(url, data_dir)
        unpack_file(data_path, data_dir)

        return data_dir / "Plant-Disease_Corpus"
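

# --- Illustrative usage sketch (not part of the original module) -------------
# PDR ships two brat annotation layers per document (".ann" and ".ann2");
# both are read and merged by brat_to_internal before the CoNLL conversion.
# Passing a sentence splitter explicitly is optional, as shown below.
def _example_load_pdr():
    corpus = PDR(sentence_splitter=SciSpacySentenceSplitter())
    first_sentence = corpus.train[0]
    return first_sentence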


class HUNER_DISEASE_PDR(HunerDataset):
    """
    PDR dataset with only disease annotations.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def split_url() -> str:
        return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/pdr"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        corpus_folder = PDR.download_corpus(data_dir)
        corpus_data = brat_to_internal(
            corpus_folder, ann_file_suffixes=[".ann", ".ann2"]
        )
        corpus_data = filter_and_map_entities(corpus_data, {"Disease": DISEASE_TAG})

        return corpus_data


class HunerMultiCorpus(MultiCorpus):
    """
    Base class to build the union of all HUNER data sets for a particular entity type.
    """

    def __init__(self, entity_type: str, sentence_splitter: SentenceSplitter = None):
        self.entity_type = entity_type

        def entity_type_predicate(member):
            return f"HUNER_{entity_type}_" in str(member) and inspect.isclass(member)

        self.huner_corpora_classes = inspect.getmembers(
            sys.modules[__name__], predicate=entity_type_predicate
        )

        self.huner_corpora = []
        for name, constructor_func in self.huner_corpora_classes:
            try:
                if not sentence_splitter:
                    corpus = constructor_func()
                else:
                    corpus = constructor_func(sentence_splitter=sentence_splitter)

                self.huner_corpora.append(corpus)
            except Exception:
                logger.error(
                    f"Can't download and prepare corpus {name}:\n{sys.exc_info()[1]}\n\n"
                )

        super().__init__(corpora=self.huner_corpora, name=f"HUNER-{entity_type}")
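

# --- Illustrative sketch (not part of the original module) -------------------
# HunerMultiCorpus finds its member corpora purely by naming convention: any
# class in this module whose name contains "HUNER_<TYPE>_" is picked up via
# reflection, so new wrapper classes join the union without explicit
# registration. The hypothetical helper below replicates that discovery step
# without instantiating (and thus downloading) anything.
def _example_discover_huner_corpora(entity_type: str = "DISEASE") -> List[str]:
    def predicate(member):
        return inspect.isclass(member) and f"HUNER_{entity_type}_" in str(member)

    return [
        name
        for name, _ in inspect.getmembers(sys.modules[__name__], predicate=predicate)
    ]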


class HUNER_CELL_LINE(HunerMultiCorpus):
    """
    Union of all HUNER cell line data sets.
    """

    def __init__(self, sentence_splitter: SentenceSplitter = None):
        super().__init__(
            entity_type="CELL_LINE", sentence_splitter=sentence_splitter
        )


class HUNER_CHEMICAL(HunerMultiCorpus):
    """
    Union of all HUNER chemical data sets.
    """

    def __init__(self, sentence_splitter: SentenceSplitter = None):
        super().__init__(
            entity_type="CHEMICAL", sentence_splitter=sentence_splitter
        )


class HUNER_DISEASE(HunerMultiCorpus):
    """
    Union of all HUNER disease data sets.
    """

    def __init__(self, sentence_splitter: SentenceSplitter = None):
        super().__init__(
            entity_type="DISEASE", sentence_splitter=sentence_splitter
        )


class HUNER_GENE(HunerMultiCorpus):
    """
    Union of all HUNER gene data sets.
    """

    def __init__(self, sentence_splitter: SentenceSplitter = None):
        super().__init__(
            entity_type="GENE", sentence_splitter=sentence_splitter
        )


class HUNER_SPECIES(HunerMultiCorpus):
    """
    Union of all HUNER species data sets.
    """

    def __init__(self, sentence_splitter: SentenceSplitter = None):
        super().__init__(
            entity_type="SPECIES", sentence_splitter=sentence_splitter
        )
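

# --- Illustrative usage sketch (not part of the original module) -------------
# Building one of the union corpora triggers download and preparation of every
# member data set on first use, which can take considerable time and disk
# space. Passing a shared sentence splitter keeps the pre-processing of all
# members consistent.
def _example_build_gene_union():
    union = HUNER_GENE(sentence_splitter=SciSpacySentenceSplitter())
    return len(union.train)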