Coverage for flair/flair/datasets/conllu.py: 96%


import logging

from pathlib import Path
from typing import List, Union, Optional, Sequence, Dict, Tuple, Any

import conllu

from flair.data import Sentence, Corpus, Token, FlairDataset, Span, RelationLabel, SpanLabel
from flair.datasets.base import find_train_dev_test_files

log = logging.getLogger("flair")

DEFAULT_FIELDS: Tuple[str, ...] = ("id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc")

DEFAULT_TOKEN_ANNOTATION_FIELDS: Tuple[str, ...] = ("lemma", "upos", "xpos", "feats", "head", "deprel")

# noinspection PyProtectedMember
DEFAULT_METADATA_PARSERS: Dict[str, conllu._MetadataParserType] = {
    **conllu.parser.DEFAULT_METADATA_PARSERS,
    **{"relations": lambda key, value: parse_relation_tuple_list(key, value, list_sep="|", value_sep=";")}
}


def parse_relation_tuple_list(key: str,
                              value: Optional[str] = None,
                              list_sep: str = "|",
                              value_sep: str = ";") -> Optional[Tuple[str, List[Tuple[int, int, int, int, str]]]]:
    if value is None:
        return value

    relation_tuples: List[Tuple[int, int, int, int, str]] = []
    for relation in value.split(list_sep):
        head_start, head_end, tail_start, tail_end, label = relation.split(value_sep)
        relation_tuples.append((int(head_start), int(head_end), int(tail_start), int(tail_end), label))

    return key, relation_tuples
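
# A minimal illustration of the "relations" parser registered above (the metadata
# line is an invented example): a comment line such as
#
#   # relations = 3;4;7;7;located_in|1;2;7;7;works_for
#
# is parsed, with the default separators, into
#
#   ("relations", [(3, 4, 7, 7, "located_in"), (1, 2, 7, 7, "works_for")])
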

class CoNLLUCorpus(Corpus):

    # noinspection PyProtectedMember
    def __init__(self,
                 data_folder: Union[str, Path],
                 train_file=None,
                 test_file=None,
                 dev_file=None,
                 in_memory: bool = True,
                 fields: Optional[Sequence[str]] = None,
                 token_annotation_fields: Optional[Sequence[str]] = None,
                 field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
                 metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
                 sample_missing_splits: bool = True):
        """
        Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data.

        Universal Dependencies corpora that contain multi-word tokens are not supported yet,
        and annotating flair sentences with the "deps" column is not supported yet either.
        Please consider using the "UniversalDependenciesCorpus" instead.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; if None, dev data is sampled from train
        :param in_memory: if set to True, keeps the full dataset in memory, otherwise does disk reads
        :param fields: the CoNLL-U(-Plus) column names; if not given, they are read from the file or defaulted
        :param token_annotation_fields: a subset of the fields parameter for token-level annotations
        :param field_parsers: custom parsers for individual columns, passed through to the conllu library
        :param metadata_parsers: custom parsers for metadata comment lines, passed through to the conllu library
        :param sample_missing_splits: if True, missing dev or test splits are sampled from the train split
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        # get train data
        train = CoNLLUDataset(
            train_file,
            in_memory=in_memory,
            fields=fields,
            token_annotation_fields=token_annotation_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        )

        # get test data
        test = (
            CoNLLUDataset(
                test_file,
                in_memory=in_memory,
                fields=fields,
                token_annotation_fields=token_annotation_fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if test_file is not None
            else None
        )

        # get dev data
        dev = (
            CoNLLUDataset(
                dev_file,
                in_memory=in_memory,
                fields=fields,
                token_annotation_fields=token_annotation_fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if dev_file is not None
            else None
        )

        super(CoNLLUCorpus, self).__init__(train, dev, test, name=str(data_folder),
                                           sample_missing_splits=sample_missing_splits)
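
# Usage sketch (illustrative only; the folder and file names are assumptions):
#
#   corpus = CoNLLUCorpus(
#       data_folder="resources/tasks/example",
#       train_file="train.conllup",
#       dev_file="dev.conllup",
#       test_file="test.conllup",
#   )
#   print(corpus)
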

class CoNLLUDataset(FlairDataset):

    # noinspection PyProtectedMember
    def __init__(self,
                 path_to_conllu_file: Union[str, Path],
                 in_memory: bool = True,
                 fields: Optional[Sequence[str]] = None,
                 token_annotation_fields: Optional[Sequence[str]] = None,
                 field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
                 metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None):
        """
        Instantiates a column dataset in CoNLL-U (Plus) format.

        Universal Dependencies datasets that contain multi-word tokens are not supported yet,
        and annotating flair sentences with the "deps" column is not supported yet either.
        Please consider using the "UniversalDependenciesDataset" instead.

        :param path_to_conllu_file: path to the CoNLL-U formatted file
        :param in_memory: if set to True, keeps the full dataset in memory, otherwise does disk reads
        :param fields: the CoNLL-U(-Plus) column names; if not given, they are read from the file or defaulted
        :param token_annotation_fields: a subset of the fields parameter for token-level annotations
        :param field_parsers: custom parsers for individual columns, passed through to the conllu library
        :param metadata_parsers: custom parsers for metadata comment lines, passed through to the conllu library
        """

        if type(path_to_conllu_file) is str:
            path_to_conllu_file = Path(path_to_conllu_file)
        assert path_to_conllu_file.exists()

        self.path_to_conllu_file = path_to_conllu_file
        self.in_memory = in_memory

        # if no fields are specified, check whether the file is CoNLL-U Plus formatted and read its fields
        if fields is None:
            with open(str(self.path_to_conllu_file), encoding="utf-8") as file:
                fields = conllu.parser.parse_conllu_plus_fields(file)

        self.fields = fields or DEFAULT_FIELDS
        self.token_annotation_fields = token_annotation_fields or DEFAULT_TOKEN_ANNOTATION_FIELDS

        # validate fields and token_annotation_fields
        if not set(self.token_annotation_fields).issubset(self.fields):
            raise ValueError(f"The token annotation fields {repr(self.token_annotation_fields)} "
                             f"are not a subset of the parsed fields {repr(self.fields)}.")

        # by default, parse each token annotation field as a nullable value,
        # so that the CoNLL-U placeholder "_" is read as None
        # noinspection PyProtectedMember
        augmented_default_field_parsers: Dict[str, conllu._FieldParserType] = {
            **{
                field: lambda line_, i: conllu.parser.parse_nullable_value(line_[i])
                for field in self.token_annotation_fields
            },
            **conllu.parser.DEFAULT_FIELD_PARSERS
        }

        self.field_parsers = field_parsers or augmented_default_field_parsers
        self.metadata_parsers = metadata_parsers or DEFAULT_METADATA_PARSERS

        self.total_sentence_count: int = 0

        with open(str(self.path_to_conllu_file), encoding="utf-8") as file:

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                line = file.readline()
                position = 0
                while line:
                    line = line.strip()
                    if line == "":
                        self.indices.append(position)
                        position = file.tell()
                    line = file.readline()

                self.indices.append(position)
                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            if self.in_memory:
                self.sentences: List[Sentence] = [
                    self.token_list_to_sentence(token_list)
                    for token_list in conllu.parse_incr(
                        file,
                        fields=self.fields,
                        field_parsers=self.field_parsers,
                        metadata_parsers=self.metadata_parsers,
                    )
                ]

                # link each sentence to its predecessor and successor
                previous_sentence = None
                for sentence in self.sentences:
                    sentence._previous_sentence = previous_sentence
                    sentence._next_sentence = None
                    if previous_sentence:
                        previous_sentence._next_sentence = sentence
                    previous_sentence = sentence

                self.total_sentence_count = len(self.sentences)

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:

        # if in memory, retrieve the parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else, skip to the position in the file where the sentence begins
        else:
            with open(str(self.path_to_conllu_file), encoding="utf-8") as file:
                file.seek(self.indices[index])
                token_list = next(conllu.parse_incr(file, self.fields, self.field_parsers, self.metadata_parsers))
                sentence = self.token_list_to_sentence(token_list)

        return sentence
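
    # Random-access sketch (illustrative; the file path is an assumption). With
    # in_memory=False, __getitem__ seeks to the stored byte offset of the requested
    # sentence and parses only that sentence:
    #
    #   dataset = CoNLLUDataset("resources/tasks/example/train.conllup", in_memory=False)
    #   sentence = dataset[42]
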

    def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
        sentence: Sentence = Sentence()

        # build the sentence tokens and add the annotations
        for conllu_token in token_list:
            token = Token(conllu_token["form"])

            for field in self.token_annotation_fields:
                field_value: Any = conllu_token[field]
                if isinstance(field_value, dict):
                    # for fields that contain key-value annotations,
                    # add the key as the label type name and the value as the label value
                    for key, value in field_value.items():
                        token.add_label(typename=key, value=str(value))
                else:
                    token.add_label(typename=field, value=str(field_value))

            if conllu_token.get("misc") is not None:
                space_after: Optional[str] = conllu_token["misc"].get("SpaceAfter")
                if space_after == "No":
                    token.whitespace_after = False

            sentence.add_token(token)

        if "sentence_id" in token_list.metadata:
            sentence.add_label("sentence_id", token_list.metadata["sentence_id"])

        if "relations" in token_list.metadata:
            for head_start, head_end, tail_start, tail_end, label in token_list.metadata["relations"]:
                # head and tail span indices are 1-indexed and the end indices are inclusive
                head = Span(sentence.tokens[head_start - 1: head_end])
                tail = Span(sentence.tokens[tail_start - 1: tail_end])

                sentence.add_complex_label("relation", RelationLabel(value=label, head=head, tail=tail))

        # determine all NER label types in the sentence and add all NER spans as sentence-level labels
        ner_label_types = []
        for token in sentence.tokens:
            for annotation in token.annotation_layers.keys():
                if annotation.startswith("ner") and annotation not in ner_label_types:
                    ner_label_types.append(annotation)

        for label_type in ner_label_types:
            spans = sentence.get_spans(label_type)
            for span in spans:
                sentence.add_complex_label("entity", label=SpanLabel(span=span, value=span.tag, score=span.score))

        return sentence
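
# For orientation, a sketch of the sentence blocks this dataset consumes (an invented
# example; the column layout shown is DEFAULT_FIELDS, and real CoNLL-U Plus files
# declare their columns in a leading "# global.columns" line):
#
#   # sentence_id = 1
#   # relations = 1;1;3;3;works_for
#   1   John    John   PROPN   NNP   _   2   nsubj   _   _
#   2   visits  visit  VERB    VBZ   _   0   root    _   _
#   3   ACME    ACME   PROPN   NNP   _   2   obj     _   SpaceAfter=No
#   4   .       .      PUNCT   .     _   2   punct   _   _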