Coverage for flair/flair/embeddings/legacy.py: 18%

683 statements  

1from pathlib import Path 

2from deprecated import deprecated 

3from abc import abstractmethod 

4from typing import List, Union, Tuple, Dict 

5 

6import torch 

7import logging 

8import flair 

9 

10from flair.data import Sentence, Token 

11from flair.embeddings.base import ScalarMix 

12from flair.embeddings.document import DocumentEmbeddings 

13from flair.embeddings.token import TokenEmbeddings, StackedEmbeddings 

14from flair.file_utils import cached_path 

15 

16from transformers import ( 

17 AlbertTokenizer, 

18 AlbertModel, 

19 BertTokenizer, 

20 BertModel, 

21 CamembertTokenizer, 

22 CamembertModel, 

23 RobertaTokenizer, 

24 RobertaModel, 

25 TransfoXLTokenizer, 

26 TransfoXLModel, 

27 OpenAIGPTModel, 

28 OpenAIGPTTokenizer, 

29 GPT2Model, 

30 GPT2Tokenizer, 

31 XLNetTokenizer, 

32 XLMTokenizer, 

33 XLNetModel, 

34 XLMModel, 

35 XLMRobertaTokenizer, 

36 XLMRobertaModel, 

37 PreTrainedTokenizer, 

38 PreTrainedModel, 

39 AutoTokenizer, AutoConfig, AutoModel, T5Tokenizer) 

40 

41from flair.nn import LockedDropout, WordDropout 

42 

43log = logging.getLogger("flair") 

44 

45 

46class CharLMEmbeddings(TokenEmbeddings): 

47 """Contextual string embeddings of words, as proposed in Akbik et al., 2018. """ 

48 

49 @deprecated(version="0.4", reason="Use 'FlairEmbeddings' instead.") 

50 def __init__( 

51 self, 

52 model: str, 

53 detach: bool = True, 

54 use_cache: bool = False, 

55 cache_directory: Path = None, 

56 ): 

57 """ 

58 initializes contextual string embeddings using a character-level language model. 

59 :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', 

60 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' 

61 depending on which character language model is desired. 

62 :param detach: if set to False, the gradient will propagate into the language model. this dramatically slows down 

63 training and often leads to worse results, so not recommended. 

64 :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will 

65 not allow re-use of once computed embeddings that do not fit into memory 

66 :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache 

67 is written to the provided directory. 

68 """ 

69 super().__init__() 

70 

71 cache_dir = Path("embeddings") 

72 

73 # multilingual forward (English, German, French, Italian, Dutch, Polish) 

74 if model.lower() == "multi-forward": 

75 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-forward-v0.1.pt" 

76 model = cached_path(base_path, cache_dir=cache_dir) 

77 # multilingual backward (English, German, French, Italian, Dutch, Polish) 

78 elif model.lower() == "multi-backward": 

79 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-backward-v0.1.pt" 

80 model = cached_path(base_path, cache_dir=cache_dir) 

81 

82 # news-english-forward 

83 elif model.lower() == "news-forward": 

84 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt" 

85 model = cached_path(base_path, cache_dir=cache_dir) 

86 

87 # news-english-backward 

88 elif model.lower() == "news-backward": 

89 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt" 

90 model = cached_path(base_path, cache_dir=cache_dir) 

91 

92 # news-english-forward-fast 

93 elif model.lower() == "news-forward-fast": 

94 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt" 

95 model = cached_path(base_path, cache_dir=cache_dir) 

96 

97 # news-english-backward-fast 

98 elif model.lower() == "news-backward-fast": 

99 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt" 

100 model = cached_path(base_path, cache_dir=cache_dir) 

101 

102 # mix-english-forward 

103 elif model.lower() == "mix-forward": 

104 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt" 

105 model = cached_path(base_path, cache_dir=cache_dir) 

106 

107 # mix-english-backward 

108 elif model.lower() == "mix-backward": 

109 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt" 

110 model = cached_path(base_path, cache_dir=cache_dir) 

111 

112 # mix-german-forward 

113 elif model.lower() == "german-forward" or model.lower() == "de-forward": 

114 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt" 

115 model = cached_path(base_path, cache_dir=cache_dir) 

116 

117 # mix-german-backward 

118 elif model.lower() == "german-backward" or model.lower() == "de-backward": 

119 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt" 

120 model = cached_path(base_path, cache_dir=cache_dir) 

121 

122 # common crawl Polish forward 

123 elif model.lower() == "polish-forward" or model.lower() == "pl-forward": 

124 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt" 

125 model = cached_path(base_path, cache_dir=cache_dir) 

126 

127 # common crawl Polish backward 

128 elif model.lower() == "polish-backward" or model.lower() == "pl-backward": 

129 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt" 

130 model = cached_path(base_path, cache_dir=cache_dir) 

131 

132 # Slovenian forward 

133 elif model.lower() == "slovenian-forward" or model.lower() == "sl-forward": 

134 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-forward-v0.1.pt" 

135 model = cached_path(base_path, cache_dir=cache_dir) 

136 # Slovenian backward 

137 elif model.lower() == "slovenian-backward" or model.lower() == "sl-backward": 

138 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-backward-v0.1.pt" 

139 model = cached_path(base_path, cache_dir=cache_dir) 

140 

141 # Bulgarian forward 

142 elif model.lower() == "bulgarian-forward" or model.lower() == "bg-forward": 

143 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-forward-v0.1.pt" 

144 model = cached_path(base_path, cache_dir=cache_dir) 

145 # Bulgarian backward 

146 elif model.lower() == "bulgarian-backward" or model.lower() == "bg-backward": 

147 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-backward-v0.1.pt" 

148 model = cached_path(base_path, cache_dir=cache_dir) 

149 

150 # Dutch forward 

151 elif model.lower() == "dutch-forward" or model.lower() == "nl-forward": 

152 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-nl-large-forward-v0.1.pt" 

153 model = cached_path(base_path, cache_dir=cache_dir) 

154 # Dutch backward 

155 elif model.lower() == "dutch-backward" or model.lower() == "nl-backward": 

156 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-nl-large-backward-v0.1.pt" 

157 model = cached_path(base_path, cache_dir=cache_dir) 

158 

159 # Swedish forward 

160 elif model.lower() == "swedish-forward" or model.lower() == "sv-forward": 

161 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-sv-large-forward-v0.1.pt" 

162 model = cached_path(base_path, cache_dir=cache_dir) 

163 # Swedish backward 

164 elif model.lower() == "swedish-backward" or model.lower() == "sv-backward": 

165 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-sv-large-backward-v0.1.pt" 

166 model = cached_path(base_path, cache_dir=cache_dir) 

167 

168 # French forward 

169 elif model.lower() == "french-forward" or model.lower() == "fr-forward": 

170 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-fr-charlm-forward.pt" 

171 model = cached_path(base_path, cache_dir=cache_dir) 

172 # French backward 

173 elif model.lower() == "french-backward" or model.lower() == "fr-backward": 

174 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-fr-charlm-backward.pt" 

175 model = cached_path(base_path, cache_dir=cache_dir) 

176 

177 # Czech forward 

178 elif model.lower() == "czech-forward" or model.lower() == "cs-forward": 

179 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-forward-v0.1.pt" 

180 model = cached_path(base_path, cache_dir=cache_dir) 

181 # Czech backward 

182 elif model.lower() == "czech-backward" or model.lower() == "cs-backward": 

183 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-backward-v0.1.pt" 

184 model = cached_path(base_path, cache_dir=cache_dir) 

185 

186 # Portuguese forward 

187 elif model.lower() == "portuguese-forward" or model.lower() == "pt-forward": 

188 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-forward.pt" 

189 model = cached_path(base_path, cache_dir=cache_dir) 

190 # Portuguese backward 

191 elif model.lower() == "portuguese-backward" or model.lower() == "pt-backward": 

192 base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-backward.pt" 

193 model = cached_path(base_path, cache_dir=cache_dir) 

194 

195 elif not Path(model).exists(): 

196 raise ValueError( 

197 f'The given model "{model}" is not available or is not a valid path.' 

198 ) 

199 

200 self.name = str(model) 

201 self.static_embeddings = detach 

202 

203 from flair.models import LanguageModel 

204 

205 self.lm = LanguageModel.load_language_model(model) 

206 self.detach = detach 

207 

208 self.is_forward_lm: bool = self.lm.is_forward_lm 

209 

210 # initialize cache if use_cache set 

211 self.cache = None 

212 if use_cache: 

213 cache_path = ( 

214 Path(f"{self.name}-tmp-cache.sqllite") 

215 if not cache_directory 

216 else cache_directory / f"{self.name}-tmp-cache.sqllite" 

217 ) 

218 from sqlitedict import SqliteDict 

219 

220 self.cache = SqliteDict(str(cache_path), autocommit=True) 

221 

222 # embed a dummy sentence to determine embedding_length 

223 dummy_sentence: Sentence = Sentence() 

224 dummy_sentence.add_token(Token("hello")) 

225 embedded_dummy = self.embed(dummy_sentence) 

226 self.__embedding_length: int = len( 

227 embedded_dummy[0].get_token(1).get_embedding() 

228 ) 

229 

230 # set to eval mode 

231 self.eval() 

232 

233 def train(self, mode=True): 

234 pass 

235 

236 def __getstate__(self): 

237 # Copy the object's state from self.__dict__ which contains 

238 # all our instance attributes. Always use the dict.copy() 

239 # method to avoid modifying the original state. 

240 state = self.__dict__.copy() 

241 # Remove the unpicklable entries. 

242 state["cache"] = None 

243 return state 

244 

245 @property 

246 def embedding_length(self) -> int: 

247 return self.__embedding_length 

248 

249 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

250 

251 # if cache is used, try setting embeddings from cache first 

252 if "cache" in self.__dict__ and self.cache is not None: 

253 

254 # try populating embeddings from cache 

255 all_embeddings_retrieved_from_cache: bool = True 

256 for sentence in sentences: 

257 key = sentence.to_tokenized_string() 

258 embeddings = self.cache.get(key) 

259 

260 if not embeddings: 

261 all_embeddings_retrieved_from_cache = False 

262 break 

263 else: 

264 for token, embedding in zip(sentence, embeddings): 

265 token.set_embedding(self.name, torch.FloatTensor(embedding)) 

266 

267 if all_embeddings_retrieved_from_cache: 

268 return sentences 

269 

270 # if this is not possible, use LM to generate embedding. First, get text sentences 

271 text_sentences = [sentence.to_tokenized_string() for sentence in sentences] 

272 

273 start_marker = "\n" 

274 end_marker = " " 

275 

276 # get hidden states from language model 

277 all_hidden_states_in_lm = self.lm.get_representation( 

278 text_sentences, start_marker, end_marker, getattr(self, "chars_per_chunk", 512)  # chars_per_chunk is never set in this deprecated class; fall back to a safe default 

279 ) 

280 

281 # take first or last hidden states from language model as word representation 

282 for i, sentence in enumerate(sentences): 

283 sentence_text = sentence.to_tokenized_string() 

284 

285 offset_forward: int = len(start_marker) 

286 offset_backward: int = len(sentence_text) + len(start_marker) 

287 

288 for token in sentence.tokens: 

289 

290 offset_forward += len(token.text) 

291 

292 if self.is_forward_lm: 

293 offset = offset_forward 

294 else: 

295 offset = offset_backward 

296 

297 embedding = all_hidden_states_in_lm[offset, i, :] 

298 

299 # if self.tokenized_lm or token.whitespace_after: 

300 offset_forward += 1 

301 offset_backward -= 1 

302 

303 offset_backward -= len(token.text) 

304 

305 token.set_embedding(self.name, embedding) 

306 

307 if "cache" in self.__dict__ and self.cache is not None: 

308 for sentence in sentences: 

309 self.cache[sentence.to_tokenized_string()] = [ 

310 token._embeddings[self.name].tolist() for token in sentence 

311 ] 

312 

313 return sentences 

314 

315 def __str__(self): 

316 return self.name 

317 

318 
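# --- Illustrative usage sketch (not part of the original module) ---
# CharLMEmbeddings is deprecated in favour of 'FlairEmbeddings'; a minimal,
# hedged example of how this legacy class was typically used (the model name
# is just one of the identifiers listed in the constructor docstring):
#
#     >>> from flair.data import Sentence
#     >>> embedding = CharLMEmbeddings("news-forward")
#     >>> sentence = Sentence("hello world")
#     >>> embedding.embed([sentence])
#     >>> sentence.tokens[0].get_embedding().size()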

319class TransformerXLEmbeddings(TokenEmbeddings): 

320 

321 @deprecated( 

322 version="0.4.5", 

323 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

324 ) 

325 def __init__( 

326 self, 

327 pretrained_model_name_or_path: str = "transfo-xl-wt103", 

328 layers: str = "1,2,3", 

329 use_scalar_mix: bool = False, 

330 ): 

331 """Transformer-XL embeddings, as proposed in Dai et al., 2019. 

332 :param pretrained_model_name_or_path: name or path of Transformer-XL model 

333 :param layers: comma-separated list of layers 

334 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

335 """ 

336 super().__init__() 

337 

338 self.tokenizer = TransfoXLTokenizer.from_pretrained( 

339 pretrained_model_name_or_path 

340 ) 

341 self.model = TransfoXLModel.from_pretrained( 

342 pretrained_model_name_or_path=pretrained_model_name_or_path, 

343 output_hidden_states=True, 

344 ) 

345 self.name = pretrained_model_name_or_path 

346 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

347 self.use_scalar_mix = use_scalar_mix 

348 self.static_embeddings = True 

349 

350 dummy_sentence: Sentence = Sentence() 

351 dummy_sentence.add_token(Token("hello")) 

352 embedded_dummy = self.embed(dummy_sentence) 

353 self.__embedding_length: int = len( 

354 embedded_dummy[0].get_token(1).get_embedding() 

355 ) 

356 

357 @property 

358 def embedding_length(self) -> int: 

359 return self.__embedding_length 

360 

361 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

362 self.model.to(flair.device) 

363 self.model.eval() 

364 

365 sentences = _get_transformer_sentence_embeddings( 

366 sentences=sentences, 

367 tokenizer=self.tokenizer, 

368 model=self.model, 

369 name=self.name, 

370 layers=self.layers, 

371 pooling_operation="first", 

372 use_scalar_mix=self.use_scalar_mix, 

373 eos_token="<eos>", 

374 ) 

375 

376 return sentences 

377 

378 def extra_repr(self): 

379 return "model={}".format(self.name) 

380 

381 def __str__(self): 

382 return self.name 

383 

384 
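# --- Migration note (illustrative, not part of the original module) ---
# TransformerXLEmbeddings and the XLNet/XLM/GPT/GPT-2/RoBERTa/CamemBERT/XLM-R
# classes below are all deprecated in favour of 'TransformerWordEmbeddings'.
# A hedged sketch of the recommended replacement (the model identifier and
# layer selection are shown only as examples):
#
#     >>> from flair.data import Sentence
#     >>> from flair.embeddings import TransformerWordEmbeddings
#     >>> embedding = TransformerWordEmbeddings("xlnet-large-cased", layers="-1")
#     >>> sentence = Sentence("hello world")
#     >>> embedding.embed(sentence)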

385class XLNetEmbeddings(TokenEmbeddings): 

386 

387 @deprecated( 

388 version="0.4.5", 

389 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

390 ) 

391 def __init__( 

392 self, 

393 pretrained_model_name_or_path: str = "xlnet-large-cased", 

394 layers: str = "1", 

395 pooling_operation: str = "first_last", 

396 use_scalar_mix: bool = False, 

397 ): 

398 """XLNet embeddings, as proposed in Yang et al., 2019. 

399 :param pretrained_model_name_or_path: name or path of XLNet model 

400 :param layers: comma-separated list of layers 

401 :param pooling_operation: defines pooling operation for subwords 

402 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

403 """ 

404 super().__init__() 

405 

406 self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path) 

407 self.model = XLNetModel.from_pretrained( 

408 pretrained_model_name_or_path=pretrained_model_name_or_path, 

409 output_hidden_states=True, 

410 ) 

411 self.name = pretrained_model_name_or_path 

412 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

413 self.pooling_operation = pooling_operation 

414 self.use_scalar_mix = use_scalar_mix 

415 self.static_embeddings = True 

416 

417 dummy_sentence: Sentence = Sentence() 

418 dummy_sentence.add_token(Token("hello")) 

419 embedded_dummy = self.embed(dummy_sentence) 

420 self.__embedding_length: int = len( 

421 embedded_dummy[0].get_token(1).get_embedding() 

422 ) 

423 

424 @property 

425 def embedding_length(self) -> int: 

426 return self.__embedding_length 

427 

428 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

429 self.model.to(flair.device) 

430 self.model.eval() 

431 

432 sentences = _get_transformer_sentence_embeddings( 

433 sentences=sentences, 

434 tokenizer=self.tokenizer, 

435 model=self.model, 

436 name=self.name, 

437 layers=self.layers, 

438 pooling_operation=self.pooling_operation, 

439 use_scalar_mix=self.use_scalar_mix, 

440 bos_token="<s>", 

441 eos_token="</s>", 

442 ) 

443 

444 return sentences 

445 

446 def extra_repr(self): 

447 return "model={}".format(self.name) 

448 

449 def __str__(self): 

450 return self.name 

451 

452 

453class XLMEmbeddings(TokenEmbeddings): 

454 

455 @deprecated( 

456 version="0.4.5", 

457 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

458 ) 

459 def __init__( 

460 self, 

461 pretrained_model_name_or_path: str = "xlm-mlm-en-2048", 

462 layers: str = "1", 

463 pooling_operation: str = "first_last", 

464 use_scalar_mix: bool = False, 

465 ): 

466 """ 

467 XLM embeddings, as proposed in Lample and Conneau, 2019. 

468 :param pretrained_model_name_or_path: name or path of XLM model 

469 :param layers: comma-separated list of layers 

470 :param pooling_operation: defines pooling operation for subwords 

471 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

472 """ 

473 super().__init__() 

474 

475 self.tokenizer = XLMTokenizer.from_pretrained(pretrained_model_name_or_path) 

476 self.model = XLMModel.from_pretrained( 

477 pretrained_model_name_or_path=pretrained_model_name_or_path, 

478 output_hidden_states=True, 

479 ) 

480 self.name = pretrained_model_name_or_path 

481 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

482 self.pooling_operation = pooling_operation 

483 self.use_scalar_mix = use_scalar_mix 

484 self.static_embeddings = True 

485 

486 dummy_sentence: Sentence = Sentence() 

487 dummy_sentence.add_token(Token("hello")) 

488 embedded_dummy = self.embed(dummy_sentence) 

489 self.__embedding_length: int = len( 

490 embedded_dummy[0].get_token(1).get_embedding() 

491 ) 

492 

493 @property 

494 def embedding_length(self) -> int: 

495 return self.__embedding_length 

496 

497 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

498 self.model.to(flair.device) 

499 self.model.eval() 

500 

501 sentences = _get_transformer_sentence_embeddings( 

502 sentences=sentences, 

503 tokenizer=self.tokenizer, 

504 model=self.model, 

505 name=self.name, 

506 layers=self.layers, 

507 pooling_operation=self.pooling_operation, 

508 use_scalar_mix=self.use_scalar_mix, 

509 bos_token="<s>", 

510 eos_token="</s>", 

511 ) 

512 

513 return sentences 

514 

515 def extra_repr(self): 

516 return "model={}".format(self.name) 

517 

518 def __str__(self): 

519 return self.name 

520 

521 

522class OpenAIGPTEmbeddings(TokenEmbeddings): 

523 

524 @deprecated( 

525 version="0.4.5", 

526 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

527 ) 

528 def __init__( 

529 self, 

530 pretrained_model_name_or_path: str = "openai-gpt", 

531 layers: str = "1", 

532 pooling_operation: str = "first_last", 

533 use_scalar_mix: bool = False, 

534 ): 

535 """OpenAI GPT embeddings, as proposed in Radford et al. 2018. 

536 :param pretrained_model_name_or_path: name or path of OpenAI GPT model 

537 :param layers: comma-separated list of layers 

538 :param pooling_operation: defines pooling operation for subwords 

539 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

540 """ 

541 super().__init__() 

542 

543 self.tokenizer = OpenAIGPTTokenizer.from_pretrained( 

544 pretrained_model_name_or_path 

545 ) 

546 self.model = OpenAIGPTModel.from_pretrained( 

547 pretrained_model_name_or_path=pretrained_model_name_or_path, 

548 output_hidden_states=True, 

549 ) 

550 self.name = pretrained_model_name_or_path 

551 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

552 self.pooling_operation = pooling_operation 

553 self.use_scalar_mix = use_scalar_mix 

554 self.static_embeddings = True 

555 

556 dummy_sentence: Sentence = Sentence() 

557 dummy_sentence.add_token(Token("hello")) 

558 embedded_dummy = self.embed(dummy_sentence) 

559 self.__embedding_length: int = len( 

560 embedded_dummy[0].get_token(1).get_embedding() 

561 ) 

562 

563 @property 

564 def embedding_length(self) -> int: 

565 return self.__embedding_length 

566 

567 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

568 self.model.to(flair.device) 

569 self.model.eval() 

570 

571 sentences = _get_transformer_sentence_embeddings( 

572 sentences=sentences, 

573 tokenizer=self.tokenizer, 

574 model=self.model, 

575 name=self.name, 

576 layers=self.layers, 

577 pooling_operation=self.pooling_operation, 

578 use_scalar_mix=self.use_scalar_mix, 

579 ) 

580 

581 return sentences 

582 

583 def extra_repr(self): 

584 return "model={}".format(self.name) 

585 

586 def __str__(self): 

587 return self.name 

588 

589 

590class OpenAIGPT2Embeddings(TokenEmbeddings): 

591 

592 @deprecated( 

593 version="0.4.5", 

594 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

595 ) 

596 def __init__( 

597 self, 

598 pretrained_model_name_or_path: str = "gpt2-medium", 

599 layers: str = "1", 

600 pooling_operation: str = "first_last", 

601 use_scalar_mix: bool = False, 

602 ): 

603 """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019. 

604 :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model 

605 :param layers: comma-separated list of layers 

606 :param pooling_operation: defines pooling operation for subwords 

607 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

608 """ 

609 super().__init__() 

610 

611 self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path) 

612 self.model = GPT2Model.from_pretrained( 

613 pretrained_model_name_or_path=pretrained_model_name_or_path, 

614 output_hidden_states=True, 

615 ) 

616 self.name = pretrained_model_name_or_path 

617 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

618 self.pooling_operation = pooling_operation 

619 self.use_scalar_mix = use_scalar_mix 

620 self.static_embeddings = True 

621 

622 dummy_sentence: Sentence = Sentence() 

623 dummy_sentence.add_token(Token("hello")) 

624 embedded_dummy = self.embed(dummy_sentence) 

625 self.__embedding_length: int = len( 

626 embedded_dummy[0].get_token(1).get_embedding() 

627 ) 

628 

629 @property 

630 def embedding_length(self) -> int: 

631 return self.__embedding_length 

632 

633 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

634 self.model.to(flair.device) 

635 self.model.eval() 

636 

637 sentences = _get_transformer_sentence_embeddings( 

638 sentences=sentences, 

639 tokenizer=self.tokenizer, 

640 model=self.model, 

641 name=self.name, 

642 layers=self.layers, 

643 pooling_operation=self.pooling_operation, 

644 use_scalar_mix=self.use_scalar_mix, 

645 bos_token="<|endoftext|>", 

646 eos_token="<|endoftext|>", 

647 ) 

648 

649 return sentences 

650 

651 

652class RoBERTaEmbeddings(TokenEmbeddings): 

653 

654 @deprecated( 

655 version="0.4.5", 

656 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

657 ) 

658 def __init__( 

659 self, 

660 pretrained_model_name_or_path: str = "roberta-base", 

661 layers: str = "-1", 

662 pooling_operation: str = "first", 

663 use_scalar_mix: bool = False, 

664 ): 

665 """RoBERTa, as proposed by Liu et al. 2019. 

666 :param pretrained_model_name_or_path: name or path of RoBERTa model 

667 :param layers: comma-separated list of layers 

668 :param pooling_operation: defines pooling operation for subwords 

669 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

670 """ 

671 super().__init__() 

672 

673 self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path) 

674 self.model = RobertaModel.from_pretrained( 

675 pretrained_model_name_or_path=pretrained_model_name_or_path, 

676 output_hidden_states=True, 

677 ) 

678 self.name = pretrained_model_name_or_path 

679 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

680 self.pooling_operation = pooling_operation 

681 self.use_scalar_mix = use_scalar_mix 

682 self.static_embeddings = True 

683 

684 dummy_sentence: Sentence = Sentence() 

685 dummy_sentence.add_token(Token("hello")) 

686 embedded_dummy = self.embed(dummy_sentence) 

687 self.__embedding_length: int = len( 

688 embedded_dummy[0].get_token(1).get_embedding() 

689 ) 

690 

691 @property 

692 def embedding_length(self) -> int: 

693 return self.__embedding_length 

694 

695 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

696 self.model.to(flair.device) 

697 self.model.eval() 

698 

699 sentences = _get_transformer_sentence_embeddings( 

700 sentences=sentences, 

701 tokenizer=self.tokenizer, 

702 model=self.model, 

703 name=self.name, 

704 layers=self.layers, 

705 pooling_operation=self.pooling_operation, 

706 use_scalar_mix=self.use_scalar_mix, 

707 bos_token="<s>", 

708 eos_token="</s>", 

709 ) 

710 

711 return sentences 

712 

713 

714class CamembertEmbeddings(TokenEmbeddings): 

715 

716 @deprecated( 

717 version="0.4.5", 

718 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

719 ) 

720 def __init__( 

721 self, 

722 pretrained_model_name_or_path: str = "camembert-base", 

723 layers: str = "-1", 

724 pooling_operation: str = "first", 

725 use_scalar_mix: bool = False, 

726 ): 

727 """CamemBERT, a Tasty French Language Model, as proposed by Martin et al. 2019. 

728 :param pretrained_model_name_or_path: name or path of CamemBERT model 

729 :param layers: comma-separated list of layers 

730 :param pooling_operation: defines pooling operation for subwords 

731 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

732 """ 

733 super().__init__() 

734 

735 self.tokenizer = CamembertTokenizer.from_pretrained( 

736 pretrained_model_name_or_path 

737 ) 

738 self.model = CamembertModel.from_pretrained( 

739 pretrained_model_name_or_path=pretrained_model_name_or_path, 

740 output_hidden_states=True, 

741 ) 

742 self.name = pretrained_model_name_or_path 

743 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

744 self.pooling_operation = pooling_operation 

745 self.use_scalar_mix = use_scalar_mix 

746 self.static_embeddings = True 

747 

748 dummy_sentence: Sentence = Sentence() 

749 dummy_sentence.add_token(Token("hello")) 

750 embedded_dummy = self.embed(dummy_sentence) 

751 self.__embedding_length: int = len( 

752 embedded_dummy[0].get_token(1).get_embedding() 

753 ) 

754 

755 def __getstate__(self): 

756 state = self.__dict__.copy() 

757 state["tokenizer"] = None 

758 return state 

759 

760 def __setstate__(self, d): 

761 self.__dict__ = d 

762 

763 # 1-camembert-base -> camembert-base 

764 if any(char.isdigit() for char in self.name): 

765 self.tokenizer = CamembertTokenizer.from_pretrained( 

766 "-".join(self.name.split("-")[1:])) 

767 else: 

768 self.tokenizer = CamembertTokenizer.from_pretrained(self.name) 

769 

770 @property 

771 def embedding_length(self) -> int: 

772 return self.__embedding_length 

773 

774 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

775 self.model.to(flair.device) 

776 self.model.eval() 

777 

778 sentences = _get_transformer_sentence_embeddings( 

779 sentences=sentences, 

780 tokenizer=self.tokenizer, 

781 model=self.model, 

782 name=self.name, 

783 layers=self.layers, 

784 pooling_operation=self.pooling_operation, 

785 use_scalar_mix=self.use_scalar_mix, 

786 bos_token="<s>", 

787 eos_token="</s>", 

788 ) 

789 

790 return sentences 

791 

792 

793class XLMRobertaEmbeddings(TokenEmbeddings): 

794 

795 @deprecated( 

796 version="0.4.5", 

797 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

798 ) 

799 def __init__( 

800 self, 

801 pretrained_model_name_or_path: str = "xlm-roberta-large", 

802 layers: str = "-1", 

803 pooling_operation: str = "first", 

804 use_scalar_mix: bool = False, 

805 ): 

806 """XLM-RoBERTa as proposed by Conneau et al. 2019. 

807 :param pretrained_model_name_or_path: name or path of XLM-R model 

808 :param layers: comma-separated list of layers 

809 :param pooling_operation: defines pooling operation for subwords 

810 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

811 """ 

812 super().__init__() 

813 

814 self.tokenizer = XLMRobertaTokenizer.from_pretrained( 

815 pretrained_model_name_or_path 

816 ) 

817 self.model = XLMRobertaModel.from_pretrained( 

818 pretrained_model_name_or_path=pretrained_model_name_or_path, 

819 output_hidden_states=True, 

820 ) 

821 self.name = pretrained_model_name_or_path 

822 self.layers: List[int] = [int(layer) for layer in layers.split(",")] 

823 self.pooling_operation = pooling_operation 

824 self.use_scalar_mix = use_scalar_mix 

825 self.static_embeddings = True 

826 

827 dummy_sentence: Sentence = Sentence() 

828 dummy_sentence.add_token(Token("hello")) 

829 embedded_dummy = self.embed(dummy_sentence) 

830 self.__embedding_length: int = len( 

831 embedded_dummy[0].get_token(1).get_embedding() 

832 ) 

833 

834 def __getstate__(self): 

835 state = self.__dict__.copy() 

836 state["tokenizer"] = None 

837 return state 

838 

839 def __setstate__(self, d): 

840 self.__dict__ = d 

841 

842 # 1-xlm-roberta-large -> xlm-roberta-large (strip the prefix only when present, mirroring CamembertEmbeddings) 

843 self.tokenizer = XLMRobertaTokenizer.from_pretrained( 

844 "-".join(self.name.split("-")[1:]) if any(char.isdigit() for char in self.name) else self.name 

845 ) 

846 

847 @property 

848 def embedding_length(self) -> int: 

849 return self.__embedding_length 

850 

851 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

852 self.model.to(flair.device) 

853 self.model.eval() 

854 

855 sentences = _get_transformer_sentence_embeddings( 

856 sentences=sentences, 

857 tokenizer=self.tokenizer, 

858 model=self.model, 

859 name=self.name, 

860 layers=self.layers, 

861 pooling_operation=self.pooling_operation, 

862 use_scalar_mix=self.use_scalar_mix, 

863 bos_token="<s>", 

864 eos_token="</s>", 

865 ) 

866 

867 return sentences 

868 

869def _extract_embeddings( 

870 hidden_states: List[torch.FloatTensor], 

871 layers: List[int], 

872 pooling_operation: str, 

873 subword_start_idx: int, 

874 subword_end_idx: int, 

875 use_scalar_mix: bool = False, 

876) -> List[torch.FloatTensor]: 

877 """ 

878 Extracts subword embeddings from specified layers from hidden states. 

879 :param hidden_states: list of hidden states from model 

880 :param layers: list of layers 

881 :param pooling_operation: pooling operation for subword embeddings (supported: first, last, first_last and mean) 

882 :param subword_start_idx: defines start index for subword 

883 :param subword_end_idx: defines end index for subword 

884 :param use_scalar_mix: determines whether scalar mix should be used 

885 :return: list of extracted subword embeddings 

886 """ 

887 subtoken_embeddings: List[torch.FloatTensor] = [] 

888 

889 for layer in layers: 

890 current_embeddings = hidden_states[layer][0][subword_start_idx:subword_end_idx] 

891 

892 first_embedding: torch.FloatTensor = current_embeddings[0] 

893 if pooling_operation == "first_last": 

894 last_embedding: torch.FloatTensor = current_embeddings[-1] 

895 final_embedding: torch.FloatTensor = torch.cat( 

896 [first_embedding, last_embedding] 

897 ) 

898 elif pooling_operation == "last": 

899 final_embedding: torch.FloatTensor = current_embeddings[-1] 

900 elif pooling_operation == "mean": 

901 all_embeddings: List[torch.FloatTensor] = [ 

902 embedding.unsqueeze(0) for embedding in current_embeddings 

903 ] 

904 final_embedding: torch.FloatTensor = torch.mean( 

905 torch.cat(all_embeddings, dim=0), dim=0 

906 ) 

907 else: 

908 final_embedding: torch.FloatTensor = first_embedding 

909 

910 subtoken_embeddings.append(final_embedding) 

911 

912 if use_scalar_mix: 

913 sm = ScalarMix(mixture_size=len(subtoken_embeddings)) 

914 sm_embeddings = sm(subtoken_embeddings) 

915 

916 subtoken_embeddings = [sm_embeddings] 

917 

918 return subtoken_embeddings 

919 

920 
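# --- Worked example for _extract_embeddings (illustrative only) ---
# For a token whose subwords occupy hidden-state positions 4..6, a single
# selected layer and a toy hidden size of 4, the pooling options behave as
# follows (toy tensors, no scalar mix assumed):
#
#     >>> import torch
#     >>> hidden_states = [torch.arange(40.).reshape(1, 10, 4)]   # one layer, 10 positions
#     >>> first = _extract_embeddings(hidden_states, [-1], "first", 4, 7)[0]       # row at position 4
#     >>> last = _extract_embeddings(hidden_states, [-1], "last", 4, 7)[0]         # row at position 6
#     >>> mean = _extract_embeddings(hidden_states, [-1], "mean", 4, 7)[0]         # average of rows 4..6
#     >>> both = _extract_embeddings(hidden_states, [-1], "first_last", 4, 7)[0]   # concat -> size 8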

921def _build_token_subwords_mapping( 

922 sentence: Sentence, tokenizer: PreTrainedTokenizer 

923) -> Tuple[Dict[int, int], str]: 

924 """ Builds a dictionary that stores the following information: 

925 Token index (key) and number of corresponding subwords (value) for a sentence. 

926 

927 :param sentence: input sentence 

928 :param tokenizer: Transformers tokenization object 

929 :return: dictionary of token index to corresponding number of subwords, tokenized string 

930 """ 

931 token_subwords_mapping: Dict[int, int] = {} 

932 

933 tokens = [] 

934 

935 for token in sentence.tokens: 

936 token_text = token.text 

937 

938 subwords = tokenizer.tokenize(token_text) 

939 

940 tokens.append(token.text if subwords else tokenizer.unk_token) 

941 

942 token_subwords_mapping[token.idx] = len(subwords) if subwords else 1 

943 

944 return token_subwords_mapping, " ".join(tokens) 

945 

946 
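# --- Illustrative example for _build_token_subwords_mapping (hypothetical
# tokenizer behaviour, not a guaranteed output) ---
# If the first token of a three-token sentence is split into three subwords
# while the other two stay whole, the returned mapping and string look like:
#
#     >>> mapping, text = _build_token_subwords_mapping(sentence, tokenizer)
#     >>> mapping      # {token.idx: number of subwords}; token indices start at 1
#     {1: 3, 2: 1, 3: 1}
#     >>> text         # the token texts joined with single spaces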

947def _build_token_subwords_mapping_gpt2( 

948 sentence: Sentence, tokenizer: PreTrainedTokenizer 

949) -> Tuple[Dict[int, int], str]: 

950 """ Builds a dictionary that stores the following information: 

951 Token index (key) and number of corresponding subwords (value) for a sentence. 

952 

953 :param sentence: input sentence 

954 :param tokenizer: Transformers tokenization object 

955 :return: dictionary of token index to corresponding number of subwords, tokenized string 

956 """ 

957 token_subwords_mapping: Dict[int, int] = {} 

958 

959 tokens = [] 

960 

961 for token in sentence.tokens: 

962 # Dummy token is needed to get the actual token tokenized correctly with the special ``Ġ`` symbol 

963 

964 if token.idx == 1: 

965 token_text = token.text 

966 subwords = tokenizer.tokenize(token_text) 

967 else: 

968 token_text = "X " + token.text 

969 subwords = tokenizer.tokenize(token_text)[1:] 

970 

971 tokens.append(token.text if subwords else tokenizer.unk_token) 

972 

973 token_subwords_mapping[token.idx] = len(subwords) if subwords else 1 

974 

975 return token_subwords_mapping, " ".join(tokens) 

976 

977 

978def _get_transformer_sentence_embeddings( 

979 sentences: List[Sentence], 

980 tokenizer: PreTrainedTokenizer, 

981 model: PreTrainedModel, 

982 name: str, 

983 layers: List[int], 

984 pooling_operation: str, 

985 use_scalar_mix: bool, 

986 bos_token: str = None, 

987 eos_token: str = None, 

988) -> List[Sentence]: 

989 """ 

990 Builds sentence embeddings for Transformer-based architectures. 

991 :param sentences: input sentences 

992 :param tokenizer: tokenization object 

993 :param model: model object 

994 :param name: name of the Transformer-based model 

995 :param layers: list of layers 

996 :param pooling_operation: defines pooling operation for subword extraction 

997 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) 

998 :param bos_token: defines begin of sentence token (used for left padding) 

999 :param eos_token: defines end of sentence token (used for right padding) 

1000 :return: list of sentences (each token of a sentence is now embedded) 

1001 """ 

1002 with torch.no_grad(): 

1003 for sentence in sentences: 

1004 token_subwords_mapping: Dict[int, int] = {} 

1005 

1006 if ("gpt2" in name or "roberta" in name) and "xlm" not in name: 

1007 ( 

1008 token_subwords_mapping, 

1009 tokenized_string, 

1010 ) = _build_token_subwords_mapping_gpt2( 

1011 sentence=sentence, tokenizer=tokenizer 

1012 ) 

1013 else: 

1014 ( 

1015 token_subwords_mapping, 

1016 tokenized_string, 

1017 ) = _build_token_subwords_mapping( 

1018 sentence=sentence, tokenizer=tokenizer 

1019 ) 

1020 

1021 subwords = tokenizer.tokenize(tokenized_string) 

1022 

1023 offset = 0 

1024 

1025 if bos_token: 

1026 subwords = [bos_token] + subwords 

1027 offset = 1 

1028 

1029 if eos_token: 

1030 subwords = subwords + [eos_token] 

1031 

1032 indexed_tokens = tokenizer.convert_tokens_to_ids(subwords) 

1033 tokens_tensor = torch.tensor([indexed_tokens]) 

1034 tokens_tensor = tokens_tensor.to(flair.device) 

1035 

1036 hidden_states = model(tokens_tensor)[-1] 

1037 

1038 for token in sentence.tokens: 

1039 len_subwords = token_subwords_mapping[token.idx] 

1040 

1041 subtoken_embeddings = _extract_embeddings( 

1042 hidden_states=hidden_states, 

1043 layers=layers, 

1044 pooling_operation=pooling_operation, 

1045 subword_start_idx=offset, 

1046 subword_end_idx=offset + len_subwords, 

1047 use_scalar_mix=use_scalar_mix, 

1048 ) 

1049 

1050 offset += len_subwords 

1051 

1052 final_subtoken_embedding = torch.cat(subtoken_embeddings) 

1053 token.set_embedding(name, final_subtoken_embedding) 

1054 

1055 return sentences 

1056 

1057 
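# --- Note on the offset bookkeeping above (illustrative) ---
# When a bos_token is supplied it is prepended to the subword list and the
# running offset starts at 1, so the first token's subwords are read from
# hidden-state positions [1, 1 + len_subwords); without a bos_token the offset
# starts at 0. The eos_token is only appended and never advances the offset,
# so it is ignored when mapping subwords back to tokens. For example, with
# subwords ['<s>', 'he', 'llo', '</s>'] a single token 'hello' with two
# subwords is read from positions 1..2.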

1058class BertEmbeddings(TokenEmbeddings): 

1059 

1060 @deprecated( 

1061 version="0.4.5", 

1062 reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", 

1063 ) 

1064 def __init__( 

1065 self, 

1066 bert_model_or_path: str = "bert-base-uncased", 

1067 layers: str = "-1,-2,-3,-4", 

1068 pooling_operation: str = "first", 

1069 use_scalar_mix: bool = False, 

1070 ): 

1071 """ 

1072 Bidirectional transformer embeddings of words, as proposed in Devlin et al., 2018. 

1073 :param bert_model_or_path: name of BERT model (e.g. 'bert-base-uncased') or directory path containing a custom model, configuration file 

1074 and vocab file (the three files should be named config.json, pytorch_model.bin/model.chkpt and vocab.txt) 

1075 :param layers: string indicating which layers to take for embedding 

1076 :param pooling_operation: how to get from token piece embeddings to token embedding. Either pool them and take 

1077 the average ('mean') or use the first word piece embedding as token embedding ('first') 

1078 """ 

1079 super().__init__() 

1080 

1081 if "distilbert" in bert_model_or_path: 

1082 try: 

1083 from transformers import DistilBertTokenizer, DistilBertModel 

1084 except ImportError: 

1085 log.warning("-" * 100) 

1086 log.warning( 

1087 "ATTENTION! To use DistilBert, please first install a recent version of transformers!" 

1088 ) 

1089 log.warning("-" * 100) 

1090 pass 

1091 

1092 self.tokenizer = DistilBertTokenizer.from_pretrained(bert_model_or_path) 

1093 self.model = DistilBertModel.from_pretrained( 

1094 pretrained_model_name_or_path=bert_model_or_path, 

1095 output_hidden_states=True, 

1096 ) 

1097 elif "albert" in bert_model_or_path: 

1098 self.tokenizer = AlbertTokenizer.from_pretrained(bert_model_or_path) 

1099 self.model = AlbertModel.from_pretrained( 

1100 pretrained_model_name_or_path=bert_model_or_path, 

1101 output_hidden_states=True, 

1102 ) 

1103 else: 

1104 self.tokenizer = BertTokenizer.from_pretrained(bert_model_or_path) 

1105 self.model = BertModel.from_pretrained( 

1106 pretrained_model_name_or_path=bert_model_or_path, 

1107 output_hidden_states=True, 

1108 ) 

1109 self.layer_indexes = [int(x) for x in layers.split(",")] 

1110 self.pooling_operation = pooling_operation 

1111 self.use_scalar_mix = use_scalar_mix 

1112 self.name = str(bert_model_or_path) 

1113 self.static_embeddings = True 

1114 

1115 class BertInputFeatures(object): 

1116 """Private helper class for holding BERT-formatted features""" 

1117 

1118 def __init__( 

1119 self, 

1120 unique_id, 

1121 tokens, 

1122 input_ids, 

1123 input_mask, 

1124 input_type_ids, 

1125 token_subtoken_count, 

1126 ): 

1127 self.unique_id = unique_id 

1128 self.tokens = tokens 

1129 self.input_ids = input_ids 

1130 self.input_mask = input_mask 

1131 self.input_type_ids = input_type_ids 

1132 self.token_subtoken_count = token_subtoken_count 

1133 

1134 def _convert_sentences_to_features( 

1135 self, sentences, max_sequence_length: int 

1136 ) -> List["BertEmbeddings.BertInputFeatures"]: 

1137 

1138 max_sequence_length = max_sequence_length + 2 

1139 

1140 features: List[BertEmbeddings.BertInputFeatures] = [] 

1141 for (sentence_index, sentence) in enumerate(sentences): 

1142 

1143 bert_tokenization: List[str] = [] 

1144 token_subtoken_count: Dict[int, int] = {} 

1145 

1146 for token in sentence: 

1147 subtokens = self.tokenizer.tokenize(token.text) 

1148 bert_tokenization.extend(subtokens) 

1149 token_subtoken_count[token.idx] = len(subtokens) 

1150 

1151 if len(bert_tokenization) > max_sequence_length - 2: 

1152 bert_tokenization = bert_tokenization[0 : (max_sequence_length - 2)] 

1153 

1154 tokens = [] 

1155 input_type_ids = [] 

1156 tokens.append("[CLS]") 

1157 input_type_ids.append(0) 

1158 for token in bert_tokenization: 

1159 tokens.append(token) 

1160 input_type_ids.append(0) 

1161 tokens.append("[SEP]") 

1162 input_type_ids.append(0) 

1163 

1164 input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 

1165 # The mask has 1 for real tokens and 0 for padding tokens. Only real 

1166 # tokens are attended to. 

1167 input_mask = [1] * len(input_ids) 

1168 

1169 # Zero-pad up to the sequence length. 

1170 while len(input_ids) < max_sequence_length: 

1171 input_ids.append(0) 

1172 input_mask.append(0) 

1173 input_type_ids.append(0) 

1174 

1175 features.append( 

1176 BertEmbeddings.BertInputFeatures( 

1177 unique_id=sentence_index, 

1178 tokens=tokens, 

1179 input_ids=input_ids, 

1180 input_mask=input_mask, 

1181 input_type_ids=input_type_ids, 

1182 token_subtoken_count=token_subtoken_count, 

1183 ) 

1184 ) 

1185 

1186 return features 

1187 

1188 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

1189 """Add embeddings to all words in a list of sentences. If embeddings are already added, 

1190 updates only if embeddings are non-static.""" 

1191 

1192 # first, find longest sentence in batch 

1193 longest_sentence_in_batch: int = len( 

1194 max( 

1195 [ 

1196 self.tokenizer.tokenize(sentence.to_tokenized_string()) 

1197 for sentence in sentences 

1198 ], 

1199 key=len, 

1200 ) 

1201 ) 

1202 

1203 # prepare id maps for BERT model 

1204 features = self._convert_sentences_to_features( 

1205 sentences, longest_sentence_in_batch 

1206 ) 

1207 all_input_ids = torch.LongTensor([f.input_ids for f in features]).to( 

1208 flair.device 

1209 ) 

1210 all_input_masks = torch.LongTensor([f.input_mask for f in features]).to( 

1211 flair.device 

1212 ) 

1213 

1214 # put encoded batch through BERT model to get all hidden states of all encoder layers 

1215 self.model.to(flair.device) 

1216 self.model.eval() 

1217 all_encoder_layers = self.model(all_input_ids, attention_mask=all_input_masks)[ 

1218 -1 

1219 ] 

1220 

1221 with torch.no_grad(): 

1222 

1223 for sentence_index, sentence in enumerate(sentences): 

1224 

1225 feature = features[sentence_index] 

1226 

1227 # get aggregated embeddings for each BERT-subtoken in sentence 

1228 subtoken_embeddings = [] 

1229 for token_index, _ in enumerate(feature.tokens): 

1230 all_layers = [] 

1231 for layer_index in self.layer_indexes: 

1232 layer_output = all_encoder_layers[int(layer_index)][ 

1233 sentence_index 

1234 ] 

1235 all_layers.append(layer_output[token_index]) 

1236 

1237 if self.use_scalar_mix: 

1238 sm = ScalarMix(mixture_size=len(all_layers)) 

1239 sm_embeddings = sm(all_layers) 

1240 all_layers = [sm_embeddings] 

1241 

1242 subtoken_embeddings.append(torch.cat(all_layers)) 

1243 

1244 # get the current sentence object 

1245 token_idx = 0 

1246 for token in sentence: 

1247 # add concatenated embedding to sentence 

1248 token_idx += 1 

1249 

1250 if self.pooling_operation == "first": 

1251 # use first subword embedding if pooling operation is 'first' 

1252 token.set_embedding(self.name, subtoken_embeddings[token_idx]) 

1253 else: 

1254 # otherwise, do a mean over all subwords in token 

1255 embeddings = subtoken_embeddings[ 

1256 token_idx : token_idx 

1257 + feature.token_subtoken_count[token.idx] 

1258 ] 

1259 embeddings = [ 

1260 embedding.unsqueeze(0) for embedding in embeddings 

1261 ] 

1262 mean = torch.mean(torch.cat(embeddings, dim=0), dim=0) 

1263 token.set_embedding(self.name, mean) 

1264 

1265 token_idx += feature.token_subtoken_count[token.idx] - 1 

1266 

1267 return sentences 

1268 

1269 @property 

1270 @abstractmethod 

1271 def embedding_length(self) -> int: 

1272 """Returns the length of the embedding vector.""" 

1273 return ( 

1274 len(self.layer_indexes) * self.model.config.hidden_size 

1275 if not self.use_scalar_mix 

1276 else self.model.config.hidden_size 

1277 ) 

1278 

1279 
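# --- Illustrative usage sketch (not part of the original module) ---
# BertEmbeddings is deprecated in favour of 'TransformerWordEmbeddings'; a
# hedged example of the legacy API (the model name and layer string are just
# the constructor defaults):
#
#     >>> from flair.data import Sentence
#     >>> embedding = BertEmbeddings("bert-base-uncased", layers="-1,-2,-3,-4")
#     >>> sentence = Sentence("hello world")
#     >>> embedding.embed(sentence)
#     >>> # with four layers concatenated, each token embedding has
#     >>> # 4 * hidden_size dimensions (see embedding_length above)
#     >>> sentence.tokens[0].get_embedding().size()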

1280class DocumentMeanEmbeddings(DocumentEmbeddings): 

1281 @deprecated( 

1282 version="0.3.1", 

1283 reason="The functionality of this class is moved to 'DocumentPoolEmbeddings'", 

1284 ) 

1285 def __init__(self, token_embeddings: List[TokenEmbeddings]): 

1286 """The constructor takes a list of embeddings to be combined.""" 

1287 super().__init__() 

1288 

1289 self.embeddings: StackedEmbeddings = StackedEmbeddings( 

1290 embeddings=token_embeddings 

1291 ) 

1292 self.name: str = "document_mean" 

1293 

1294 self.__embedding_length: int = self.embeddings.embedding_length 

1295 

1296 self.to(flair.device) 

1297 

1298 @property 

1299 def embedding_length(self) -> int: 

1300 return self.__embedding_length 

1301 

1302 def embed(self, sentences: Union[List[Sentence], Sentence]): 

1303 """Add embeddings to every sentence in the given list of sentences. If embeddings are already added, updates 

1304 only if embeddings are non-static.""" 

1305 

1306 everything_embedded: bool = True 

1307 

1308 # if only one sentence is passed, convert to list of sentence 

1309 if type(sentences) is Sentence: 

1310 sentences = [sentences] 

1311 

1312 for sentence in sentences: 

1313 if self.name not in sentence._embeddings.keys(): 

1314 everything_embedded = False 

1315 

1316 if not everything_embedded: 

1317 

1318 self.embeddings.embed(sentences) 

1319 

1320 for sentence in sentences: 

1321 word_embeddings = [] 

1322 for token in sentence.tokens: 

1323 word_embeddings.append(token.get_embedding().unsqueeze(0)) 

1324 

1325 word_embeddings = torch.cat(word_embeddings, dim=0).to(flair.device) 

1326 

1327 mean_embedding = torch.mean(word_embeddings, 0) 

1328 

1329 sentence.set_embedding(self.name, mean_embedding) 

1330 

1331 def _add_embeddings_internal(self, sentences: List[Sentence]): 

1332 pass 

1333 

1334 

1335class DocumentLSTMEmbeddings(DocumentEmbeddings): 

1336 @deprecated( 

1337 version="0.4", 

1338 reason="The functionality of this class is moved to 'DocumentRNNEmbeddings'", 

1339 ) 

1340 def __init__( 

1341 self, 

1342 embeddings: List[TokenEmbeddings], 

1343 hidden_size=128, 

1344 rnn_layers=1, 

1345 reproject_words: bool = True, 

1346 reproject_words_dimension: int = None, 

1347 bidirectional: bool = False, 

1348 dropout: float = 0.5, 

1349 word_dropout: float = 0.0, 

1350 locked_dropout: float = 0.0, 

1351 ): 

1352 """The constructor takes a list of embeddings to be combined. 

1353 :param embeddings: a list of token embeddings 

1354 :param hidden_size: the number of hidden states in the lstm 

1355 :param rnn_layers: the number of layers for the lstm 

1356 :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear 

1357 layer before putting them into the lstm or not 

1358 :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output 

1359 dimension as before will be taken. 

1360 :param bidirectional: boolean value, indicating whether to use a bidirectional lstm or not 

1361 :param dropout: the dropout value to be used 

1362 :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used 

1363 :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used 

1364 """ 

1365 super().__init__() 

1366 

1367 self.embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings) 

1368 

1369 self.reproject_words = reproject_words 

1370 self.bidirectional = bidirectional 

1371 

1372 self.length_of_all_token_embeddings: int = self.embeddings.embedding_length 

1373 

1374 self.name = "document_lstm" 

1375 self.static_embeddings = False 

1376 

1377 self.__embedding_length: int = hidden_size 

1378 if self.bidirectional: 

1379 self.__embedding_length *= 4 

1380 

1381 self.embeddings_dimension: int = self.length_of_all_token_embeddings 

1382 if self.reproject_words and reproject_words_dimension is not None: 

1383 self.embeddings_dimension = reproject_words_dimension 

1384 

1385 # word re-projection layer and (optionally bidirectional) GRU on top of the embedding layer 

1386 self.word_reprojection_map = torch.nn.Linear( 

1387 self.length_of_all_token_embeddings, self.embeddings_dimension 

1388 ) 

1389 self.rnn = torch.nn.GRU( 

1390 self.embeddings_dimension, 

1391 hidden_size, 

1392 num_layers=rnn_layers, 

1393 bidirectional=self.bidirectional, 

1394 ) 

1395 

1396 # dropouts 

1397 if locked_dropout > 0.0: 

1398 self.dropout: torch.nn.Module = LockedDropout(locked_dropout) 

1399 else: 

1400 self.dropout = torch.nn.Dropout(dropout) 

1401 

1402 self.use_word_dropout: bool = word_dropout > 0.0 

1403 if self.use_word_dropout: 

1404 self.word_dropout = WordDropout(word_dropout) 

1405 

1406 torch.nn.init.xavier_uniform_(self.word_reprojection_map.weight) 

1407 

1408 self.to(flair.device) 

1409 

1410 @property 

1411 def embedding_length(self) -> int: 

1412 return self.__embedding_length 

1413 

1414 def embed(self, sentences: Union[List[Sentence], Sentence]): 

1415 """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update 

1416 only if embeddings are non-static.""" 

1417 

1418 if type(sentences) is Sentence: 

1419 sentences = [sentences] 

1420 

1421 self.rnn.zero_grad() 

1422 

1423 sentences.sort(key=lambda x: len(x), reverse=True) 

1424 

1425 self.embeddings.embed(sentences) 

1426 

1427 # first, sort sentences by number of tokens 

1428 longest_token_sequence_in_batch: int = len(sentences[0]) 

1429 

1430 all_sentence_tensors = [] 

1431 lengths: List[int] = [] 

1432 

1433 # go through each sentence in batch 

1434 for i, sentence in enumerate(sentences): 

1435 

1436 lengths.append(len(sentence.tokens)) 

1437 

1438 word_embeddings = [] 

1439 

1440 for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))): 

1441 word_embeddings.append(token.get_embedding().unsqueeze(0)) 

1442 

1443 # PADDING: pad shorter sentences out 

1444 for add in range(longest_token_sequence_in_batch - len(sentence.tokens)): 

1445 word_embeddings.append( 

1446 torch.zeros( 

1447 self.length_of_all_token_embeddings, dtype=torch.float 

1448 ).unsqueeze(0).to(flair.device) 

1449 ) 

1450 

1451 word_embeddings_tensor = torch.cat(word_embeddings, 0).to(flair.device) 

1452 

1453 sentence_states = word_embeddings_tensor 

1454 

1455 # ADD TO SENTENCE LIST: add the representation 

1456 all_sentence_tensors.append(sentence_states.unsqueeze(1)) 

1457 

1458 # -------------------------------------------------------------------- 

1459 # GET REPRESENTATION FOR ENTIRE BATCH 

1460 # -------------------------------------------------------------------- 

1461 sentence_tensor = torch.cat(all_sentence_tensors, 1) 

1462 

1463 # -------------------------------------------------------------------- 

1464 # FF PART 

1465 # -------------------------------------------------------------------- 

1466 # use word dropout if set 

1467 if self.use_word_dropout: 

1468 sentence_tensor = self.word_dropout(sentence_tensor) 

1469 

1470 if self.reproject_words: 

1471 sentence_tensor = self.word_reprojection_map(sentence_tensor) 

1472 

1473 sentence_tensor = self.dropout(sentence_tensor) 

1474 

1475 packed = torch.nn.utils.rnn.pack_padded_sequence(sentence_tensor, lengths) 

1476 

1477 self.rnn.flatten_parameters() 

1478 

1479 lstm_out, hidden = self.rnn(packed) 

1480 

1481 outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out) 

1482 

1483 outputs = self.dropout(outputs) 

1484 

1485 # -------------------------------------------------------------------- 

1486 # EXTRACT EMBEDDINGS FROM LSTM 

1487 # -------------------------------------------------------------------- 

1488 for sentence_no, length in enumerate(lengths): 

1489 last_rep = outputs[length - 1, sentence_no] 

1490 

1491 embedding = last_rep 

1492 if self.bidirectional: 

1493 first_rep = outputs[0, sentence_no] 

1494 embedding = torch.cat([first_rep, last_rep], 0) 

1495 

1496 sentence = sentences[sentence_no] 

1497 sentence.set_embedding(self.name, embedding) 

1498 

1499 def _add_embeddings_internal(self, sentences: List[Sentence]): 

1500 pass 

1501 

1502 
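# --- Illustrative usage sketch (not part of the original module) ---
# DocumentLSTMEmbeddings is deprecated in favour of 'DocumentRNNEmbeddings';
# a hedged example of the legacy API (WordEmbeddings('glove') is just one
# possible token embedding to stack underneath):
#
#     >>> from flair.data import Sentence
#     >>> from flair.embeddings import WordEmbeddings
#     >>> document_embedding = DocumentLSTMEmbeddings(
#     ...     [WordEmbeddings("glove")], hidden_size=128, bidirectional=True
#     ... )
#     >>> sentence = Sentence("hello world")
#     >>> document_embedding.embed(sentence)
#     >>> sentence.get_embedding().size()   # 4 * hidden_size when bidirectional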

1503class ELMoTransformerEmbeddings(TokenEmbeddings): 

1504 """Contextual word embeddings using word-level Transformer-based LM, as proposed in Peters et al., 2018.""" 

1505 

1506 @deprecated( 

1507 version="0.4.2", 

1508 reason="Not possible to load or save ELMo Transformer models. @stefan-it is working on it.", 

1509 ) 

1510 def __init__(self, model_file: str): 

1511 super().__init__() 

1512 

1513 try: 

1514 from allennlp.modules.token_embedders.bidirectional_language_model_token_embedder import ( 

1515 BidirectionalLanguageModelTokenEmbedder, 

1516 ) 

1517 from allennlp.data.token_indexers.elmo_indexer import ( 

1518 ELMoTokenCharactersIndexer, 

1519 ) 

1520 except ModuleNotFoundError: 

1521 log.warning("-" * 100) 

1522 log.warning('ATTENTION! The library "allennlp" is not installed!') 

1523 log.warning( 

1524 "To use ELMoTransformerEmbeddings, please first install a recent version from https://github.com/allenai/allennlp" 

1525 ) 

1526 log.warning("-" * 100) 

1527 pass 

1528 

1529 self.name = "elmo-transformer" 

1530 self.static_embeddings = True 

1531 self.lm_embedder = BidirectionalLanguageModelTokenEmbedder( 

1532 archive_file=model_file, 

1533 dropout=0.2, 

1534 bos_eos_tokens=("<S>", "</S>"), 

1535 remove_bos_eos=True, 

1536 requires_grad=False, 

1537 ) 

1538 self.lm_embedder = self.lm_embedder.to(device=flair.device) 

1539 self.vocab = self.lm_embedder._lm.vocab 

1540 self.indexer = ELMoTokenCharactersIndexer() 

1541 

1542 # embed a dummy sentence to determine embedding_length 

1543 dummy_sentence: Sentence = Sentence() 

1544 dummy_sentence.add_token(Token("hello")) 

1545 embedded_dummy = self.embed(dummy_sentence) 

1546 self.__embedding_length: int = len( 

1547 embedded_dummy[0].get_token(1).get_embedding() 

1548 ) 

1549 

1550 @property 

1551 def embedding_length(self) -> int: 

1552 return self.__embedding_length 

1553 

1554 def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: 

1555 # Avoid conflicts with flair's Token class 

1556 import allennlp.data.tokenizers.token as allen_nlp_token 

1557 

1558 indexer = self.indexer 

1559 vocab = self.vocab 

1560 

1561 for sentence in sentences: 

1562 character_indices = indexer.tokens_to_indices( 

1563 [allen_nlp_token.Token(token.text) for token in sentence], vocab, "elmo" 

1564 )["elmo"] 

1565 

1566 indices_tensor = torch.LongTensor([character_indices]) 

1567 indices_tensor = indices_tensor.to(device=flair.device) 

1568 embeddings = self.lm_embedder(indices_tensor)[0].detach().cpu().numpy() 

1569 

1570 for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))): 

1571 embedding = embeddings[token_idx] 

1572 word_embedding = torch.FloatTensor(embedding) 

1573 token.set_embedding(self.name, word_embedding) 

1574 

1575 return sentences 

1576 

1577 def extra_repr(self): 

1578 return "model={}".format(self.name) 

1579 

1580 def __str__(self): 

1581 return self.name
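
# --- Illustrative usage sketch (not part of the original module) ---
# ELMoTransformerEmbeddings requires a recent allennlp installation and a
# local bidirectional LM archive; the path below is a placeholder, not a
# real file:
#
#     >>> from flair.data import Sentence
#     >>> embedding = ELMoTransformerEmbeddings("/path/to/transformer_elmo_model.tar.gz")
#     >>> sentence = Sentence("hello world")
#     >>> embedding.embed(sentence)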