Coverage for flair/flair/hyperparameter/param_selection.py: 27%
import logging
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import Tuple, Union
import numpy as np

from hyperopt import hp, fmin, tpe

import flair.nn
from flair.data import Corpus
from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings
from flair.hyperparameter import Parameter
from flair.hyperparameter.parameter import (
    SEQUENCE_TAGGER_PARAMETERS,
    TRAINING_PARAMETERS,
    DOCUMENT_EMBEDDING_PARAMETERS,
    MODEL_TRAINER_PARAMETERS,
)
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer
from flair.training_utils import (
    EvaluationMetric,
    log_line,
    init_output_file,
    add_file_handler,
)

log = logging.getLogger("flair")


class OptimizationValue(Enum):
    DEV_LOSS = "loss"
    DEV_SCORE = "score"


class SearchSpace(object):
    def __init__(self):
        self.search_space = {}

    def add(self, parameter: Parameter, func, **kwargs):
        self.search_space[parameter.value] = func(parameter.value, **kwargs)

    def get_search_space(self):
        return hp.choice("parameters", [self.search_space])
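
# Usage sketch (kept in comments so this module stays import-safe): SearchSpace.add
# stores one hyperopt expression per Parameter, built by the given `func`
# (e.g. hp.choice, hp.uniform) under the enum's string value. The Parameter
# members and embedding names below are illustrative and assume the members
# defined in flair.hyperparameter.parameter.
#
#     from hyperopt import hp
#     from flair.embeddings import WordEmbeddings, StackedEmbeddings
#
#     search_space = SearchSpace()
#     search_space.add(
#         Parameter.EMBEDDINGS,
#         hp.choice,
#         options=[StackedEmbeddings([WordEmbeddings("glove")])],
#     )
#     search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
#     search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
#     search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.2])
#     search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])
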
class ParamSelector(object):
    def __init__(
        self,
        corpus: Corpus,
        base_path: Union[str, Path],
        max_epochs: int,
        evaluation_metric: EvaluationMetric,
        training_runs: int,
        optimization_value: OptimizationValue,
    ):
        if type(base_path) is str:
            base_path = Path(base_path)

        self.corpus = corpus
        self.max_epochs = max_epochs
        self.base_path = base_path
        self.evaluation_metric = evaluation_metric
        self.run = 1
        self.training_runs = training_runs
        self.optimization_value = optimization_value

        self.param_selection_file = init_output_file(base_path, "param_selection.txt")

    @abstractmethod
    def _set_up_model(self, params: dict) -> flair.nn.Model:
        pass

    def _objective(self, params: dict):
        log_line(log)
        log.info(f"Evaluation run: {self.run}")
        log.info(f"Evaluating parameter combination:")
        for k, v in params.items():
            if isinstance(v, Tuple):
                v = ",".join([str(x) for x in v])
            log.info(f"\t{k}: {str(v)}")
        log_line(log)

        for sent in self.corpus.get_all_sentences():
            sent.clear_embeddings()

        scores = []
        vars = []

        for i in range(0, self.training_runs):
            log_line(log)
            log.info(f"Training run: {i + 1}")

            model = self._set_up_model(params)

            training_params = {
                key: params[key] for key in params if key in TRAINING_PARAMETERS
            }
            model_trainer_parameters = {
                key: params[key] for key in params if key in MODEL_TRAINER_PARAMETERS
            }

            trainer: ModelTrainer = ModelTrainer(
                model, self.corpus, **model_trainer_parameters
            )

            result = trainer.train(
                self.base_path,
                max_epochs=self.max_epochs,
                param_selection_mode=True,
                **training_params,
            )

            # take the average over the last three scores of training
            if self.optimization_value == OptimizationValue.DEV_LOSS:
                curr_scores = result["dev_loss_history"][-3:]
            else:
                curr_scores = list(
                    map(lambda s: 1 - s, result["dev_score_history"][-3:])
                )

            score = sum(curr_scores) / float(len(curr_scores))
            var = np.var(curr_scores)
            scores.append(score)
            vars.append(var)

        # take average over the scores from the different training runs
        final_score = sum(scores) / float(len(scores))
        final_var = sum(vars) / float(len(vars))

        test_score = result["test_score"]
        log_line(log)
        log.info(f"Done evaluating parameter combination:")
        for k, v in params.items():
            if isinstance(v, Tuple):
                v = ",".join([str(x) for x in v])
            log.info(f"\t{k}: {v}")
        log.info(f"{self.optimization_value.value}: {final_score}")
        log.info(f"variance: {final_var}")
        log.info(f"test_score: {test_score}\n")
        log_line(log)

        with open(self.param_selection_file, "a") as f:
            f.write(f"evaluation run {self.run}\n")
            for k, v in params.items():
                if isinstance(v, Tuple):
                    v = ",".join([str(x) for x in v])
                f.write(f"\t{k}: {str(v)}\n")
            f.write(f"{self.optimization_value.value}: {final_score}\n")
            f.write(f"variance: {final_var}\n")
            f.write(f"test_score: {test_score}\n")
            f.write("-" * 100 + "\n")

        self.run += 1

        return {"status": "ok", "loss": final_score, "loss_variance": final_var}

    def optimize(self, space: SearchSpace, max_evals=100):
        search_space = space.search_space
        best = fmin(
            self._objective, search_space, algo=tpe.suggest, max_evals=max_evals
        )

        log_line(log)
        log.info("Optimizing parameter configuration done.")
        log.info("Best parameter configuration found:")
        for k, v in best.items():
            log.info(f"\t{k}: {v}")
        log_line(log)

        with open(self.param_selection_file, "a") as f:
            f.write("best parameter combination\n")
            for k, v in best.items():
                if isinstance(v, Tuple):
                    v = ",".join([str(x) for x in v])
                f.write(f"\t{k}: {str(v)}\n")
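
# Subclassing sketch (in comments, illustrative only): a concrete ParamSelector
# only has to turn the sampled `params` dict into a flair.nn.Model; optimize()
# then minimizes _objective with hyperopt's TPE over the raw space.search_space
# dict. The tag type "ner" and the parameter filtering below are assumptions,
# not part of this module.
#
#     class MyTaggerParamSelector(ParamSelector):
#         def _set_up_model(self, params: dict) -> flair.nn.Model:
#             # keep only keys the SequenceTagger constructor understands
#             tagger_params = {
#                 key: params[key]
#                 for key in params
#                 if key in SEQUENCE_TAGGER_PARAMETERS
#             }
#             return SequenceTagger(
#                 tag_dictionary=self.corpus.make_label_dictionary("ner"),
#                 tag_type="ner",
#                 **tagger_params,
#             )
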
class SequenceTaggerParamSelector(ParamSelector):
    def __init__(
        self,
        corpus: Corpus,
        tag_type: str,
        base_path: Union[str, Path],
        max_epochs: int = 50,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
        training_runs: int = 1,
        optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS,
    ):
        """
        :param corpus: the corpus
        :param tag_type: tag type to use
        :param base_path: the path to the result folder (results will be written to that folder)
        :param max_epochs: number of epochs to perform on every evaluation run
        :param evaluation_metric: evaluation metric used during training
        :param training_runs: number of training runs per evaluation run
        :param optimization_value: value to optimize
        """
        super().__init__(
            corpus,
            base_path,
            max_epochs,
            evaluation_metric,
            training_runs,
            optimization_value,
        )

        self.tag_type = tag_type
        self.tag_dictionary = self.corpus.make_label_dictionary(self.tag_type)

    def _set_up_model(self, params: dict):
        sequence_tagger_params = {
            key: params[key] for key in params if key in SEQUENCE_TAGGER_PARAMETERS
        }

        tagger: SequenceTagger = SequenceTagger(
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            **sequence_tagger_params,
        )
        return tagger
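
# Usage sketch (in comments): hyperparameter selection for a sequence tagger.
# The UD_ENGLISH corpus, the "upos" tag type and the paths are illustrative;
# `search_space` is a SearchSpace built as in the sketch above.
#
#     from flair.datasets import UD_ENGLISH
#
#     corpus = UD_ENGLISH()
#     param_selector = SequenceTaggerParamSelector(
#         corpus,
#         "upos",
#         "resources/results/param_selection/upos",
#         max_epochs=10,
#         training_runs=2,
#         optimization_value=OptimizationValue.DEV_SCORE,
#     )
#     param_selector.optimize(search_space, max_evals=50)
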
class TextClassifierParamSelector(ParamSelector):
    def __init__(
        self,
        corpus: Corpus,
        multi_label: bool,
        base_path: Union[str, Path],
        document_embedding_type: str,
        max_epochs: int = 50,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
        training_runs: int = 1,
        optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS,
    ):
        """
        :param corpus: the corpus
        :param multi_label: true if the dataset is multi-label, false otherwise
        :param base_path: the path to the result folder (results will be written to that folder)
        :param document_embedding_type: either 'lstm', 'mean', 'min', or 'max'
        :param max_epochs: number of epochs to perform on every evaluation run
        :param evaluation_metric: evaluation metric used during training
        :param training_runs: number of training runs per evaluation run
        :param optimization_value: value to optimize
        """
        super().__init__(
            corpus,
            base_path,
            max_epochs,
            evaluation_metric,
            training_runs,
            optimization_value,
        )

        self.multi_label = multi_label
        self.document_embedding_type = document_embedding_type

        self.label_dictionary = self.corpus.make_label_dictionary()

    def _set_up_model(self, params: dict):
        embedding_params = {
            key: params[key] for key in params if key in DOCUMENT_EMBEDDING_PARAMETERS
        }

        if self.document_embedding_type == "lstm":
            document_embedding = DocumentRNNEmbeddings(**embedding_params)
        else:
            document_embedding = DocumentPoolEmbeddings(**embedding_params)

        text_classifier: TextClassifier = TextClassifier(
            label_dictionary=self.label_dictionary,
            multi_label=self.multi_label,
            document_embeddings=document_embedding,
        )

        return text_classifier
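
# Usage sketch (in comments): parameter selection for text classification.
# TREC_6 is single-label, hence multi_label=False; with
# document_embedding_type="lstm" the sampled DOCUMENT_EMBEDDING_PARAMETERS go
# to DocumentRNNEmbeddings, otherwise to DocumentPoolEmbeddings. The corpus and
# paths are illustrative; `search_space` is built as in the sketch further up.
#
#     from flair.datasets import TREC_6
#
#     corpus = TREC_6()
#     param_selector = TextClassifierParamSelector(
#         corpus,
#         multi_label=False,
#         base_path="resources/results/param_selection/trec",
#         document_embedding_type="lstm",
#         max_epochs=10,
#     )
#     param_selector.optimize(search_space, max_evals=100)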