Coverage for flair/flair/hyperparameter/param_selection.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

131 statements  

1import logging 

2from abc import abstractmethod 

3from enum import Enum 

4from pathlib import Path 

5from typing import Tuple, Union 

6import numpy as np 

7 

8from hyperopt import hp, fmin, tpe 

9 

10import flair.nn 

11from flair.data import Corpus 

12from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings 

13from flair.hyperparameter import Parameter 

14from flair.hyperparameter.parameter import ( 

15 SEQUENCE_TAGGER_PARAMETERS, 

16 TRAINING_PARAMETERS, 

17 DOCUMENT_EMBEDDING_PARAMETERS, 

18 MODEL_TRAINER_PARAMETERS, 

19) 

20from flair.models import SequenceTagger, TextClassifier 

21from flair.trainers import ModelTrainer 

22from flair.training_utils import ( 

23 EvaluationMetric, 

24 log_line, 

25 init_output_file, 

26 add_file_handler, 

27) 

28 

29log = logging.getLogger("flair") 

30 

31 

class OptimizationValue(Enum):
    """Quantity a parameter selector minimises during the search.

    The member value is the label used when the result is logged and
    written to the parameter selection file.
    """

    # Optimise the loss recorded on the dev split.
    DEV_LOSS = "loss"

    # Optimise the score recorded on the dev split (the selector turns it
    # into ``1 - score`` so that lower is better).
    DEV_SCORE = "score"

35 

36 

class SearchSpace(object):
    """Collects hyperopt search expressions, keyed by parameter name."""

    def __init__(self):
        # Maps ``Parameter.value`` (the parameter's name) to the hyperopt
        # expression describing the values to sample for it.
        self.search_space = {}

    def add(self, parameter: Parameter, func, **kwargs):
        """Register a search expression for ``parameter``.

        :param parameter: the parameter to search over
        :param func: callable invoked as ``func(label, **kwargs)``, where the
            label is ``parameter.value``; its result is stored as the search
            expression (e.g. one of the ``hp.*`` constructors)
        :param kwargs: forwarded unchanged to ``func``
        """
        label = parameter.value
        expression = func(label, **kwargs)
        self.search_space[label] = expression

    def get_search_space(self):
        """Return the collected space wrapped in a single-option ``hp.choice``."""
        options = [self.search_space]
        return hp.choice("parameters", options)

46 

47 

class ParamSelector(object):
    """Base class for hyperparameter selection with hyperopt.

    Subclasses implement :meth:`_set_up_model` to build the model for one
    parameter combination. :meth:`optimize` then runs hyperopt's ``fmin`` over
    a :class:`SearchSpace`, using :meth:`_objective` as the function to
    minimise. Every evaluated combination, and finally the best one, is logged
    and appended to ``param_selection.txt`` below ``base_path``.
    """

    def __init__(
        self,
        corpus: Corpus,
        base_path: Union[str, Path],
        max_epochs: int,
        evaluation_metric: EvaluationMetric,
        training_runs: int,
        optimization_value: OptimizationValue,
    ):
        """
        :param corpus: the corpus
        :param base_path: the path to the result folder (results will be written to that folder)
        :param max_epochs: number of epochs to perform on every evaluation run
        :param evaluation_metric: evaluation metric used during training
        :param training_runs: number of training runs per evaluation run (at least 1)
        :param optimization_value: value to optimize
        :raises ValueError: if ``training_runs`` is smaller than 1
        """
        # The objective averages over the training runs, so zero runs would
        # only surface later as a division by zero.
        if training_runs < 1:
            raise ValueError(
                f"training_runs must be at least 1, got {training_runs}"
            )

        if isinstance(base_path, str):
            base_path = Path(base_path)

        self.corpus = corpus
        self.max_epochs = max_epochs
        self.base_path = base_path
        self.evaluation_metric = evaluation_metric
        # 1-based counter of evaluated parameter combinations.
        self.run = 1
        self.training_runs = training_runs
        self.optimization_value = optimization_value

        self.param_selection_file = init_output_file(base_path, "param_selection.txt")

    # NOTE: this class does not use ABCMeta, so the decorator documents the
    # intent but does not prevent instantiation of the base class.
    @abstractmethod
    def _set_up_model(self, params: dict) -> flair.nn.Model:
        """Build the model to train for the sampled parameter combination."""
        pass

    @staticmethod
    def _format_value(value) -> str:
        """Render a parameter value for logs/files; tuples become comma-joined."""
        if isinstance(value, tuple):
            return ",".join(str(item) for item in value)
        return str(value)

    def _objective(self, params: dict):
        """Train with ``params`` and return the hyperopt result dictionary.

        The model is trained ``self.training_runs`` times. For each run the
        mean and variance of the last three dev values (loss, or ``1 - score``
        when optimising the dev score) are computed; the returned loss and
        loss variance are the averages of those over all runs.

        :param params: parameter combination sampled from the search space
        :return: dict with the keys ``status``, ``loss`` and ``loss_variance``
        :raises ValueError: if a training run reports an empty dev history
        """
        log_line(log)
        log.info(f"Evaluation run: {self.run}")
        log.info("Evaluating parameter combination:")
        for k, v in params.items():
            log.info(f"\t{k}: {self._format_value(v)}")
        log_line(log)

        for sent in self.corpus.get_all_sentences():
            sent.clear_embeddings()

        # These only depend on ``params``, so build them once for all runs.
        training_params = {
            key: params[key] for key in params if key in TRAINING_PARAMETERS
        }
        model_trainer_parameters = {
            key: params[key] for key in params if key in MODEL_TRAINER_PARAMETERS
        }

        scores = []
        variances = []
        test_score = None

        for i in range(self.training_runs):
            log_line(log)
            log.info(f"Training run: {i + 1}")

            model = self._set_up_model(params)

            trainer: ModelTrainer = ModelTrainer(
                model, self.corpus, **model_trainer_parameters
            )

            result = trainer.train(
                self.base_path,
                max_epochs=self.max_epochs,
                param_selection_mode=True,
                **training_params,
            )

            # take the average over the last three scores of training
            if self.optimization_value == OptimizationValue.DEV_LOSS:
                curr_scores = list(result["dev_loss_history"][-3:])
            else:
                # hyperopt minimises, so turn the score into a loss
                curr_scores = [1 - s for s in result["dev_score_history"][-3:]]

            if not curr_scores:
                raise ValueError(
                    "Training returned an empty dev history; cannot compute "
                    f"'{self.optimization_value.value}' for parameter selection."
                )

            scores.append(sum(curr_scores) / float(len(curr_scores)))
            variances.append(np.var(curr_scores))

            # Only the test score of the most recent run is reported.
            test_score = result["test_score"]

        # take average over the scores from the different training runs
        final_score = sum(scores) / float(len(scores))
        final_var = sum(variances) / float(len(variances))

        log_line(log)
        log.info("Done evaluating parameter combination:")
        for k, v in params.items():
            log.info(f"\t{k}: {self._format_value(v)}")
        log.info(f"{self.optimization_value.value}: {final_score}")
        log.info(f"variance: {final_var}")
        log.info(f"test_score: {test_score}\n")
        log_line(log)

        with open(self.param_selection_file, "a", encoding="utf-8") as f:
            f.write(f"evaluation run {self.run}\n")
            for k, v in params.items():
                f.write(f"\t{k}: {self._format_value(v)}\n")
            f.write(f"{self.optimization_value.value}: {final_score}\n")
            f.write(f"variance: {final_var}\n")
            f.write(f"test_score: {test_score}\n")
            f.write("-" * 100 + "\n")

        self.run += 1

        return {"status": "ok", "loss": final_score, "loss_variance": final_var}

    def optimize(self, space: SearchSpace, max_evals=100):
        """Search ``space`` with hyperopt's TPE algorithm and report the best parameters.

        :param space: the search space to sample parameter combinations from
        :param max_evals: maximum number of parameter combinations to evaluate
        """
        # Imported locally so this method does not depend on a change to the
        # module-level import block.
        from hyperopt import space_eval

        search_space = space.search_space
        best = fmin(
            self._objective, search_space, algo=tpe.suggest, max_evals=max_evals
        )

        # ``fmin`` reports ``hp.choice`` parameters as the *index* of the
        # chosen option; ``space_eval`` maps the result back to real values.
        best_params = space_eval(search_space, best)

        log_line(log)
        log.info("Optimizing parameter configuration done.")
        log.info("Best parameter configuration found:")
        for k, v in best_params.items():
            log.info(f"\t{k}: {self._format_value(v)}")
        log_line(log)

        with open(self.param_selection_file, "a", encoding="utf-8") as f:
            f.write("best parameter combination\n")
            for k, v in best_params.items():
                f.write(f"\t{k}: {self._format_value(v)}\n")

178 

179 

class SequenceTaggerParamSelector(ParamSelector):
    """Parameter selector that trains a ``SequenceTagger`` per parameter combination."""

    def __init__(
        self,
        corpus: Corpus,
        tag_type: str,
        base_path: Union[str, Path],
        max_epochs: int = 50,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
        training_runs: int = 1,
        optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS,
    ):
        """
        :param corpus: the corpus
        :param tag_type: tag type to use
        :param base_path: the path to the result folder (results will be written to that folder)
        :param max_epochs: number of epochs to perform on every evaluation run
        :param evaluation_metric: evaluation metric used during training
        :param training_runs: number of training runs per evaluation run
        :param optimization_value: value to optimize
        """
        super().__init__(
            corpus=corpus,
            base_path=base_path,
            max_epochs=max_epochs,
            evaluation_metric=evaluation_metric,
            training_runs=training_runs,
            optimization_value=optimization_value,
        )

        self.tag_type = tag_type
        # The tag dictionary is built once from the corpus and reused for
        # every model created by this selector.
        self.tag_dictionary = self.corpus.make_label_dictionary(self.tag_type)

    def _set_up_model(self, params: dict):
        """Create a ``SequenceTagger`` from the tagger-related entries of ``params``.

        :param params: parameter combination sampled from the search space
        :return: the newly constructed tagger
        """
        # Forward only the entries listed in SEQUENCE_TAGGER_PARAMETERS.
        tagger_kwargs = {
            name: value
            for name, value in params.items()
            if name in SEQUENCE_TAGGER_PARAMETERS
        }

        return SequenceTagger(
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            **tagger_kwargs,
        )

223 

224 

class TextClassifierParamSelector(ParamSelector):
    """Parameter selector that trains a ``TextClassifier`` per parameter combination."""

    def __init__(
        self,
        corpus: Corpus,
        multi_label: bool,
        base_path: Union[str, Path],
        document_embedding_type: str,
        max_epochs: int = 50,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
        training_runs: int = 1,
        optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS,
    ):
        """
        :param corpus: the corpus
        :param multi_label: true, if the dataset is multi label, false otherwise
        :param base_path: the path to the result folder (results will be written to that folder)
        :param document_embedding_type: either 'lstm', 'mean', 'min', or 'max'
        :param max_epochs: number of epochs to perform on every evaluation run
        :param evaluation_metric: evaluation metric used during training
        :param training_runs: number of training runs per evaluation run
        :param optimization_value: value to optimize
        """
        super().__init__(
            corpus=corpus,
            base_path=base_path,
            max_epochs=max_epochs,
            evaluation_metric=evaluation_metric,
            training_runs=training_runs,
            optimization_value=optimization_value,
        )

        self.multi_label = multi_label
        self.document_embedding_type = document_embedding_type

        # The label dictionary is built once from the corpus and reused for
        # every model created by this selector.
        self.label_dictionary = self.corpus.make_label_dictionary()

    def _set_up_model(self, params: dict):
        """Create a ``TextClassifier`` on top of freshly built document embeddings.

        :param params: parameter combination sampled from the search space
        :return: the newly constructed text classifier
        """
        # Forward only the entries listed in DOCUMENT_EMBEDDING_PARAMETERS.
        embedding_kwargs = {
            name: value
            for name, value in params.items()
            if name in DOCUMENT_EMBEDDING_PARAMETERS
        }

        # "lstm" selects the RNN embeddings; every other type selects the
        # pooled embeddings.
        # NOTE(review): the type string itself ('mean', 'min', 'max') is not
        # passed to DocumentPoolEmbeddings here, so the pooling operation
        # presumably has to arrive through ``params`` -- confirm.
        embedding_cls = (
            DocumentRNNEmbeddings
            if self.document_embedding_type == "lstm"
            else DocumentPoolEmbeddings
        )
        document_embedding = embedding_cls(**embedding_kwargs)

        return TextClassifier(
            label_dictionary=self.label_dictionary,
            multi_label=self.multi_label,
            document_embeddings=document_embedding,
        )