# flair/flair/datasets/conllu.py

import logging

from pathlib import Path
from typing import List, Union, Optional, Sequence, Dict, Tuple, Any

import conllu

from flair.data import Sentence, Corpus, Token, FlairDataset, Span, RelationLabel, SpanLabel
from flair.datasets.base import find_train_dev_test_files

log = logging.getLogger("flair")

DEFAULT_FIELDS: Tuple[str, ...] = ("id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc")

DEFAULT_TOKEN_ANNOTATION_FIELDS: Tuple[str, ...] = ("lemma", "upos", "xpos", "feats", "head", "deprel")

# noinspection PyProtectedMember
DEFAULT_METADATA_PARSERS: Dict[str, conllu._MetadataParserType] = {
    **conllu.parser.DEFAULT_METADATA_PARSERS,
    "relations": lambda key, value: parse_relation_tuple_list(key, value, list_sep="|", value_sep=";"),
}

def parse_relation_tuple_list(key: str,
                              value: Optional[str] = None,
                              list_sep: str = "|",
                              value_sep: str = ";") -> Optional[Tuple[str, List[Tuple[int, int, int, int, str]]]]:
    if value is None:
        return value

    relation_tuples: List[Tuple[int, int, int, int, str]] = []
    for relation in value.split(list_sep):
        head_start, head_end, tail_start, tail_end, label = relation.split(value_sep)
        relation_tuples.append((int(head_start), int(head_end), int(tail_start), int(tail_end), label))

    return key, relation_tuples

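# Example (hypothetical values): with the default separators (list_sep="|",
# value_sep=";"), the metadata line
#   # relations = 3;4;9;12;located_in|6;7;9;12;located_in
# parses into
#   ("relations", [(3, 4, 9, 12, "located_in"), (6, 7, 9, 12, "located_in")])
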

class CoNLLUCorpus(Corpus):

    # noinspection PyProtectedMember
    def __init__(self,
                 data_folder: Union[str, Path],
                 train_file=None,
                 test_file=None,
                 dev_file=None,
                 in_memory: bool = True,
                 fields: Optional[Sequence[str]] = None,
                 token_annotation_fields: Optional[Sequence[str]] = None,
                 field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
                 metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
                 sample_missing_splits: bool = True):
53 """
54 Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data
56 Universal dependencies corpora that contain multi-word tokens are not supported yet.
57 The annotation of flair sentences with the "deps" column is not yet supported as well.
58 Please consider using the "UniversalDependenciesCorpus" instead.
60 :param data_folder: base folder with the task data
61 :param train_file: the name of the train file
62 :param test_file: the name of the test file
63 :param dev_file: the name of the dev file, if None, dev data is sampled from train
64 :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
65 :param token_annotation_fields: A subset of the fields parameter for token level annotations
66 :return: a Corpus with annotated train, dev and test data
67 """
        # find train, dev and test files if not specified
        dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        # get train data
        train = CoNLLUDataset(
            train_file,
            in_memory=in_memory,
            fields=fields,
            token_annotation_fields=token_annotation_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        )

        # get test data
        test = (
            CoNLLUDataset(
                test_file,
                in_memory=in_memory,
                fields=fields,
                token_annotation_fields=token_annotation_fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if test_file is not None
            else None
        )

        # get dev data
        dev = (
            CoNLLUDataset(
                dev_file,
                in_memory=in_memory,
                fields=fields,
                token_annotation_fields=token_annotation_fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if dev_file is not None
            else None
        )

        super(CoNLLUCorpus, self).__init__(train, dev, test, name=str(data_folder),
                                           sample_missing_splits=sample_missing_splits)

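# Usage sketch with hypothetical folder and file names, assuming CoNLL-U (Plus)
# splits on disk:
#
#   corpus = CoNLLUCorpus(
#       data_folder="resources/tasks/my_task",
#       train_file="train.conllup",
#       dev_file="dev.conllup",
#       test_file="test.conllup",
#   )
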

class CoNLLUDataset(FlairDataset):

    # noinspection PyProtectedMember
    def __init__(self,
                 path_to_conllu_file: Union[str, Path],
                 in_memory: bool = True,
                 fields: Optional[Sequence[str]] = None,
                 token_annotation_fields: Optional[Sequence[str]] = None,
                 field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
                 metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None):
124 """
125 Instantiates a column dataset in CoNLL-U (Plus) format.
127 Universal dependencies datasets that contain multi-word tokens are not supported yet.
128 The annotation of flair sentences with the "deps" column is not yet supported as well.
129 Please consider using the "UniversalDependenciesDataset" instead.
131 :param path_to_conllu_file: Path to the CoNLL-U formatted file
132 :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
133 :param token_annotation_fields: A subset of the fields parameter for token level annotations
134 """
        if isinstance(path_to_conllu_file, str):
            path_to_conllu_file = Path(path_to_conllu_file)
        assert path_to_conllu_file.exists()

        self.path_to_conllu_file = path_to_conllu_file
        self.in_memory = in_memory

        # if no fields were specified, check whether the file is CoNLL-U Plus formatted and read them from its header
        if fields is None:
            with open(str(self.path_to_conllu_file), encoding="utf-8") as file:
                fields = conllu.parser.parse_conllu_plus_fields(file)

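        # Note: a CoNLL-U Plus file declares its columns in its first line, e.g.
        #   # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        # parse_conllu_plus_fields returns these column names (lower-cased) for such
        # files and None for plain CoNLL-U files, hence the fallback below.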
        self.fields = fields or DEFAULT_FIELDS
        self.token_annotation_fields = token_annotation_fields or DEFAULT_TOKEN_ANNOTATION_FIELDS

        # validate fields and token_annotation_fields
        if not set(self.token_annotation_fields).issubset(self.fields):
            raise ValueError(f"The token annotation fields {repr(self.token_annotation_fields)} "
                             f"are not a subset of the parsed fields {repr(self.fields)}.")

        # noinspection PyProtectedMember
        augmented_default_field_parsers: Dict[str, conllu._FieldParserType] = {
            **{
                field: lambda line_, i: conllu.parser.parse_nullable_value(line_[i])
                for field in self.token_annotation_fields
            },
            **conllu.parser.DEFAULT_FIELD_PARSERS,
        }

        self.field_parsers = field_parsers or augmented_default_field_parsers
        self.metadata_parsers = metadata_parsers or DEFAULT_METADATA_PARSERS

        self.total_sentence_count: int = 0

        with open(str(self.path_to_conllu_file), encoding="utf-8") as file:

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                sentence_started = False
                while line:
                    if line.strip() == "":
                        # a blank line closes the current sentence (if one was started)
                        if sentence_started:
                            self.indices.append(position)
                        position = file.tell()
                        sentence_started = False
                    else:
                        sentence_started = True
                    line = file.readline()

                # count a final sentence only if the file does not end on a blank line,
                # so a trailing blank line does not produce a spurious empty sentence
                if sentence_started:
                    self.indices.append(position)
                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            if self.in_memory:
                self.sentences: List[Sentence] = [
                    self.token_list_to_sentence(token_list)
                    for token_list in conllu.parse_incr(
                        file,
                        fields=self.fields,
                        field_parsers=self.field_parsers,
                        metadata_parsers=self.metadata_parsers,
                    )
                ]

                # link each sentence to its predecessor and successor
                previous_sentence = None
                for sentence in self.sentences:
                    sentence._previous_sentence = previous_sentence
                    sentence._next_sentence = None
                    if previous_sentence:
                        previous_sentence._next_sentence = sentence
                    previous_sentence = sentence

                self.total_sentence_count = len(self.sentences)

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else seek to the position in the file where the sentence begins
        else:
            with open(str(self.path_to_conllu_file), encoding="utf-8") as file:
                file.seek(self.indices[index])
                token_list = next(conllu.parse_incr(file, self.fields, self.field_parsers, self.metadata_parsers))
                sentence = self.token_list_to_sentence(token_list)

        return sentence

    def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
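        """
        Converts a conllu.TokenList into a flair Sentence, carrying over token-level
        annotations, the sentence id, relation metadata and NER spans.
        """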
        sentence: Sentence = Sentence()

        # Build the sentence tokens and add the annotations.
        for conllu_token in token_list:
            token = Token(conllu_token["form"])

            for field in self.token_annotation_fields:
                field_value: Any = conllu_token[field]
                if isinstance(field_value, dict):
                    # For fields that contain key-value annotations,
                    # we add the key as label type-name and the value as the label value.
                    for key, value in field_value.items():
                        token.add_label(typename=key, value=str(value))
                else:
                    token.add_label(typename=field, value=str(field_value))

            if conllu_token.get("misc") is not None:
                space_after: Optional[str] = conllu_token["misc"].get("SpaceAfter")
                if space_after == "No":
                    token.whitespace_after = False

            sentence.add_token(token)

256 if "sentence_id" in token_list.metadata:
257 sentence.add_label("sentence_id", token_list.metadata["sentence_id"])
259 if "relations" in token_list.metadata:
260 for head_start, head_end, tail_start, tail_end, label in token_list.metadata["relations"]:
261 # head and tail span indices are 1-indexed and end index is inclusive
262 head = Span(sentence.tokens[head_start - 1: head_end])
263 tail = Span(sentence.tokens[tail_start - 1: tail_end])
265 sentence.add_complex_label("relation", RelationLabel(value=label, head=head, tail=tail))
        # determine all NER label types in the sentence and add all NER spans as sentence-level labels
        ner_label_types = []
        for token in sentence.tokens:
            for annotation in token.annotation_layers.keys():
                if annotation.startswith("ner") and annotation not in ner_label_types:
                    ner_label_types.append(annotation)

        for label_type in ner_label_types:
            spans = sentence.get_spans(label_type)
            for span in spans:
                sentence.add_complex_label("entity", label=SpanLabel(span=span, value=span.tag, score=span.score))

        return sentence

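
# Usage sketch with a hypothetical file whose sentence blocks carry the metadata
# handled above, e.g.:
#
#   # sentence_id = example-1
#   # relations = 1;1;4;4;works_for
#   1   Alice   ...
#   ...
#
#   dataset = CoNLLUDataset("data/train.conllu", in_memory=False)
#   sentence = dataset[0]  # parsed on demand from the stored file offset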