Coverage for flair/flair/embeddings/legacy.py: 18%
from pathlib import Path
from deprecated import deprecated
from abc import abstractmethod
from typing import List, Union, Tuple, Dict

import torch
import logging
import flair

from flair.data import Sentence, Token
from flair.embeddings.base import ScalarMix
from flair.embeddings.document import DocumentEmbeddings
from flair.embeddings.token import TokenEmbeddings, StackedEmbeddings
from flair.file_utils import cached_path

from transformers import (
    AlbertTokenizer,
    AlbertModel,
    BertTokenizer,
    BertModel,
    CamembertTokenizer,
    CamembertModel,
    RobertaTokenizer,
    RobertaModel,
    TransfoXLTokenizer,
    TransfoXLModel,
    OpenAIGPTModel,
    OpenAIGPTTokenizer,
    GPT2Model,
    GPT2Tokenizer,
    XLNetTokenizer,
    XLMTokenizer,
    XLNetModel,
    XLMModel,
    XLMRobertaTokenizer,
    XLMRobertaModel,
    PreTrainedTokenizer,
    PreTrainedModel,
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    T5Tokenizer,
)

from flair.nn import LockedDropout, WordDropout

log = logging.getLogger("flair")


class CharLMEmbeddings(TokenEmbeddings):
    """Contextual string embeddings of words, as proposed in Akbik et al., 2018."""

    @deprecated(version="0.4", reason="Use 'FlairEmbeddings' instead.")
    def __init__(
        self,
        model: str,
        detach: bool = True,
        use_cache: bool = False,
        cache_directory: Path = None,
    ):
        """
        Initializes contextual string embeddings using a character-level language model.
        :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast',
            'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward',
            depending on which character language model is desired.
        :param detach: if set to False, the gradient will propagate into the language model. This dramatically slows
            down training and often leads to worse results, so it is not recommended.
        :param use_cache: if set to False, embeddings are not written to file for later retrieval. This saves disk
            space but prevents re-use of embeddings that have already been computed and do not fit into memory.
        :param cache_directory: if cache_directory is not set, the cache is written to ~/.flair/embeddings; otherwise
            it is written to the provided directory.
        """
        super().__init__()

        cache_dir = Path("embeddings")

        # multilingual forward (English, German, French, Italian, Dutch, Polish)
        if model.lower() == "multi-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # multilingual backward (English, German, French, Italian, Dutch, Polish)
        elif model.lower() == "multi-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-multi-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # news-english-forward
        elif model.lower() == "news-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # news-english-backward
        elif model.lower() == "news-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # news-english-forward
        elif model.lower() == "news-forward-fast":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # news-english-backward
        elif model.lower() == "news-backward-fast":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # mix-english-forward
        elif model.lower() == "mix-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # mix-english-backward
        elif model.lower() == "mix-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # mix-german-forward
        elif model.lower() == "german-forward" or model.lower() == "de-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # mix-german-backward
        elif model.lower() == "german-backward" or model.lower() == "de-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # common crawl Polish forward
        elif model.lower() == "polish-forward" or model.lower() == "pl-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # common crawl Polish backward
        elif model.lower() == "polish-backward" or model.lower() == "pl-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Slovenian forward
        elif model.lower() == "slovenian-forward" or model.lower() == "sl-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Slovenian backward
        elif model.lower() == "slovenian-backward" or model.lower() == "sl-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-sl-large-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Bulgarian forward
        elif model.lower() == "bulgarian-forward" or model.lower() == "bg-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Bulgarian backward
        elif model.lower() == "bulgarian-backward" or model.lower() == "bg-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/lm-bg-small-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Dutch forward
        elif model.lower() == "dutch-forward" or model.lower() == "nl-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-nl-large-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Dutch backward
        elif model.lower() == "dutch-backward" or model.lower() == "nl-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-nl-large-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Swedish forward
        elif model.lower() == "swedish-forward" or model.lower() == "sv-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-sv-large-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Swedish backward
        elif model.lower() == "swedish-backward" or model.lower() == "sv-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-sv-large-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # French forward
        elif model.lower() == "french-forward" or model.lower() == "fr-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-fr-charlm-forward.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # French backward
        elif model.lower() == "french-backward" or model.lower() == "fr-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-fr-charlm-backward.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Czech forward
        elif model.lower() == "czech-forward" or model.lower() == "cs-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-forward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Czech backward
        elif model.lower() == "czech-backward" or model.lower() == "cs-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-cs-large-backward-v0.1.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        # Portuguese forward
        elif model.lower() == "portuguese-forward" or model.lower() == "pt-forward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-forward.pt"
            model = cached_path(base_path, cache_dir=cache_dir)
        # Portuguese backward
        elif model.lower() == "portuguese-backward" or model.lower() == "pt-backward":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-pt-backward.pt"
            model = cached_path(base_path, cache_dir=cache_dir)

        elif not Path(model).exists():
            raise ValueError(
                f'The given model "{model}" is not available or is not a valid path.'
            )

        self.name = str(model)
        self.static_embeddings = detach

        from flair.models import LanguageModel

        self.lm = LanguageModel.load_language_model(model)
        self.detach = detach

        self.is_forward_lm: bool = self.lm.is_forward_lm

        # number of characters passed to the language model per chunk; this attribute is read in
        # _add_embeddings_internal but was never assigned here, so set a default (512 mirrors the
        # default used by FlairEmbeddings)
        self.chars_per_chunk: int = 512

        # initialize cache if use_cache set
        self.cache = None
        if use_cache:
            cache_path = (
                Path(f"{self.name}-tmp-cache.sqllite")
                if not cache_directory
                else cache_directory / f"{self.name}-tmp-cache.sqllite"
            )
            from sqlitedict import SqliteDict

            self.cache = SqliteDict(str(cache_path), autocommit=True)

        # embed a dummy sentence to determine embedding_length
        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

        # set to eval mode
        self.eval()

    def train(self, mode=True):
        pass

    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        state["cache"] = None
        return state

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        # if cache is used, try setting embeddings from cache first
        if "cache" in self.__dict__ and self.cache is not None:

            # try populating embeddings from cache
            all_embeddings_retrieved_from_cache: bool = True
            for sentence in sentences:
                key = sentence.to_tokenized_string()
                embeddings = self.cache.get(key)

                if not embeddings:
                    all_embeddings_retrieved_from_cache = False
                    break
                else:
                    for token, embedding in zip(sentence, embeddings):
                        token.set_embedding(self.name, torch.FloatTensor(embedding))

            if all_embeddings_retrieved_from_cache:
                return sentences

        # if this is not possible, use LM to generate embedding. First, get text sentences
        text_sentences = [sentence.to_tokenized_string() for sentence in sentences]

        start_marker = "\n"
        end_marker = " "

        # get hidden states from language model
        all_hidden_states_in_lm = self.lm.get_representation(
            text_sentences, start_marker, end_marker, self.chars_per_chunk
        )

        # take first or last hidden states from language model as word representation
        for i, sentence in enumerate(sentences):
            sentence_text = sentence.to_tokenized_string()

            offset_forward: int = len(start_marker)
            offset_backward: int = len(sentence_text) + len(start_marker)

            for token in sentence.tokens:

                offset_forward += len(token.text)

                if self.is_forward_lm:
                    offset = offset_forward
                else:
                    offset = offset_backward

                embedding = all_hidden_states_in_lm[offset, i, :]

                # if self.tokenized_lm or token.whitespace_after:
                offset_forward += 1
                offset_backward -= 1

                offset_backward -= len(token.text)

                token.set_embedding(self.name, embedding)

        if "cache" in self.__dict__ and self.cache is not None:
            for sentence in sentences:
                self.cache[sentence.to_tokenized_string()] = [
                    token._embeddings[self.name].tolist() for token in sentence
                ]

        return sentences

    def __str__(self):
        return self.name
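

# --- Usage sketch (not part of the original module) ---------------------------
# Minimal example of how CharLMEmbeddings was typically used before its
# deprecation; new code should use FlairEmbeddings instead, as the deprecation
# notice above states. "news-forward" is one of the model strings listed in the
# constructor docstring.
def _charlm_embeddings_example() -> Sentence:
    embeddings = CharLMEmbeddings("news-forward")
    sentence = Sentence("The grass is green .")
    embeddings.embed(sentence)
    # every token now carries an embedding stored under embeddings.name
    return sentence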


class TransformerXLEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "transfo-xl-wt103",
        layers: str = "1,2,3",
        use_scalar_mix: bool = False,
    ):
        """Transformer-XL embeddings, as proposed in Dai et al., 2019.
        :param pretrained_model_name_or_path: name or path of Transformer-XL model
        :param layers: comma-separated list of layers
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = TransfoXLTokenizer.from_pretrained(
            pretrained_model_name_or_path
        )
        self.model = TransfoXLModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation="first",
            use_scalar_mix=self.use_scalar_mix,
            eos_token="<eos>",
        )

        return sentences

    def extra_repr(self):
        return "model={}".format(self.name)

    def __str__(self):
        return self.name
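

# --- Usage sketch (not part of the original module) ---------------------------
# The deprecated transformer token embeddings in this file (Transformer-XL,
# XLNet, XLM, GPT, GPT-2, RoBERTa, CamemBERT, XLM-R, BERT) all follow the same
# pattern: pick a pretrained checkpoint, select the hidden layers to use (and,
# for most classes, a subword pooling operation), then call embed(). Shown here
# for TransformerXLEmbeddings with its default checkpoint and layers.
def _transformer_xl_embeddings_example() -> Sentence:
    embeddings = TransformerXLEmbeddings("transfo-xl-wt103", layers="1,2,3")
    sentence = Sentence("Berlin is a city .")
    embeddings.embed(sentence)
    return sentence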


class XLNetEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "xlnet-large-cased",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """XLNet embeddings, as proposed in Yang et al., 2019.
        :param pretrained_model_name_or_path: name or path of XLNet model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = XLNetModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<s>",
            eos_token="</s>",
        )

        return sentences

    def extra_repr(self):
        return "model={}".format(self.name)

    def __str__(self):
        return self.name


class XLMEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "xlm-mlm-en-2048",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
466 """
467 XLM embeddings, as proposed in Guillaume et al., 2019.
468 :param pretrained_model_name_or_path: name or path of XLM model
469 :param layers: comma-separated list of layers
470 :param pooling_operation: defines pooling operation for subwords
471 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
472 """
        super().__init__()

        self.tokenizer = XLMTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = XLMModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<s>",
            eos_token="</s>",
        )

        return sentences

    def extra_repr(self):
        return "model={}".format(self.name)

    def __str__(self):
        return self.name


class OpenAIGPTEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "openai-gpt",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """OpenAI GPT embeddings, as proposed in Radford et al. 2018.
        :param pretrained_model_name_or_path: name or path of OpenAI GPT model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
            pretrained_model_name_or_path
        )
        self.model = OpenAIGPTModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
        )

        return sentences

    def extra_repr(self):
        return "model={}".format(self.name)

    def __str__(self):
        return self.name


class OpenAIGPT2Embeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "gpt2-medium",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.
        :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = GPT2Model.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<|endoftext|>",
            eos_token="<|endoftext|>",
        )

        return sentences


class RoBERTaEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "roberta-base",
        layers: str = "-1",
        pooling_operation: str = "first",
        use_scalar_mix: bool = False,
    ):
        """RoBERTa, as proposed by Liu et al. 2019.
        :param pretrained_model_name_or_path: name or path of RoBERTa model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = RobertaModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<s>",
            eos_token="</s>",
        )

        return sentences


class CamembertEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "camembert-base",
        layers: str = "-1",
        pooling_operation: str = "first",
        use_scalar_mix: bool = False,
    ):
727 """CamemBERT, a Tasty French Language Model, as proposed by Martin et al. 2019.
728 :param pretrained_model_name_or_path: name or path of RoBERTa model
729 :param layers: comma-separated list of layers
730 :param pooling_operation: defines pooling operation for subwords
731 :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
732 """
        super().__init__()

        self.tokenizer = CamembertTokenizer.from_pretrained(
            pretrained_model_name_or_path
        )
        self.model = CamembertModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        state["tokenizer"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # 1-camembert-base -> camembert-base
        if any(char.isdigit() for char in self.name):
            self.tokenizer = CamembertTokenizer.from_pretrained(
                "-".join(self.name.split("-")[1:])
            )
        else:
            self.tokenizer = CamembertTokenizer.from_pretrained(self.name)

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<s>",
            eos_token="</s>",
        )

        return sentences


class XLMRobertaEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        pretrained_model_name_or_path: str = "xlm-roberta-large",
        layers: str = "-1",
        pooling_operation: str = "first",
        use_scalar_mix: bool = False,
    ):
        """XLM-RoBERTa as proposed by Conneau et al. 2019.
        :param pretrained_model_name_or_path: name or path of XLM-R model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            pretrained_model_name_or_path
        )
        self.model = XLMRobertaModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        state["tokenizer"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # 1-xlm-roberta-large -> xlm-roberta-large
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            "-".join(self.name.split("-")[1:])
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        self.model.to(flair.device)
        self.model.eval()

        sentences = _get_transformer_sentence_embeddings(
            sentences=sentences,
            tokenizer=self.tokenizer,
            model=self.model,
            name=self.name,
            layers=self.layers,
            pooling_operation=self.pooling_operation,
            use_scalar_mix=self.use_scalar_mix,
            bos_token="<s>",
            eos_token="</s>",
        )

        return sentences


def _extract_embeddings(
    hidden_states: List[torch.FloatTensor],
    layers: List[int],
    pooling_operation: str,
    subword_start_idx: int,
    subword_end_idx: int,
    use_scalar_mix: bool = False,
) -> List[torch.FloatTensor]:
    """
    Extracts subword embeddings from the specified layers of the hidden states.
    :param hidden_states: list of hidden states from model
    :param layers: list of layers
    :param pooling_operation: pooling operation for subword embeddings (supported: first, last, first_last and mean)
    :param subword_start_idx: defines start index for subword
    :param subword_end_idx: defines end index for subword
    :param use_scalar_mix: determines if scalar mix should be used
    :return: list of extracted subword embeddings
    """
    subtoken_embeddings: List[torch.FloatTensor] = []

    for layer in layers:
        current_embeddings = hidden_states[layer][0][subword_start_idx:subword_end_idx]

        first_embedding: torch.FloatTensor = current_embeddings[0]
        if pooling_operation == "first_last":
            last_embedding: torch.FloatTensor = current_embeddings[-1]
            final_embedding: torch.FloatTensor = torch.cat(
                [first_embedding, last_embedding]
            )
        elif pooling_operation == "last":
            final_embedding: torch.FloatTensor = current_embeddings[-1]
        elif pooling_operation == "mean":
            all_embeddings: List[torch.FloatTensor] = [
                embedding.unsqueeze(0) for embedding in current_embeddings
            ]
            final_embedding: torch.FloatTensor = torch.mean(
                torch.cat(all_embeddings, dim=0), dim=0
            )
        else:
            final_embedding: torch.FloatTensor = first_embedding

        subtoken_embeddings.append(final_embedding)

    if use_scalar_mix:
        sm = ScalarMix(mixture_size=len(subtoken_embeddings))
        sm_embeddings = sm(subtoken_embeddings)

        subtoken_embeddings = [sm_embeddings]

    return subtoken_embeddings
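
# Pooling behaviour of _extract_embeddings, sketched for a token split into the
# subwords s1, s2, s3 with hidden size d (a descriptive note, not original code):
#   "first"      -> embedding of s1                 (size d)
#   "last"       -> embedding of s3                 (size d)
#   "first_last" -> concatenation of s1 and s3      (size 2d)
#   "mean"       -> element-wise mean of s1, s2, s3 (size d)
# One such vector is produced per requested layer; the caller concatenates them,
# unless use_scalar_mix=True, in which case ScalarMix combines them into a single
# vector and the returned list has exactly one element.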


def _build_token_subwords_mapping(
    sentence: Sentence, tokenizer: PreTrainedTokenizer
) -> Tuple[Dict[int, int], str]:
    """Builds a dictionary that stores the following information:
    Token index (key) and number of corresponding subwords (value) for a sentence.

    :param sentence: input sentence
    :param tokenizer: Transformers tokenization object
    :return: dictionary of token index to corresponding number of subwords, tokenized string
    """
    token_subwords_mapping: Dict[int, int] = {}

    tokens = []

    for token in sentence.tokens:
        token_text = token.text

        subwords = tokenizer.tokenize(token_text)

        tokens.append(token.text if subwords else tokenizer.unk_token)

        token_subwords_mapping[token.idx] = len(subwords) if subwords else 1

    return token_subwords_mapping, " ".join(tokens)


def _build_token_subwords_mapping_gpt2(
    sentence: Sentence, tokenizer: PreTrainedTokenizer
) -> Tuple[Dict[int, int], str]:
    """Builds a dictionary that stores the following information:
    Token index (key) and number of corresponding subwords (value) for a sentence.

    :param sentence: input sentence
    :param tokenizer: Transformers tokenization object
    :return: dictionary of token index to corresponding number of subwords, tokenized string
    """
    token_subwords_mapping: Dict[int, int] = {}

    tokens = []

    for token in sentence.tokens:
        # A dummy token is needed to get the actual token tokenized correctly with the special ``Ġ`` symbol

        if token.idx == 1:
            token_text = token.text
            subwords = tokenizer.tokenize(token_text)
        else:
            token_text = "X " + token.text
            subwords = tokenizer.tokenize(token_text)[1:]

        tokens.append(token.text if subwords else tokenizer.unk_token)

        token_subwords_mapping[token.idx] = len(subwords) if subwords else 1

    return token_subwords_mapping, " ".join(tokens)


def _get_transformer_sentence_embeddings(
    sentences: List[Sentence],
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    name: str,
    layers: List[int],
    pooling_operation: str,
    use_scalar_mix: bool,
    bos_token: str = None,
    eos_token: str = None,
) -> List[Sentence]:
    """
    Builds sentence embeddings for Transformer-based architectures.
    :param sentences: input sentences
    :param tokenizer: tokenization object
    :param model: model object
    :param name: name of the Transformer-based model
    :param layers: list of layers
    :param pooling_operation: defines pooling operation for subword extraction
    :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
    :param bos_token: defines begin of sentence token (used for left padding)
    :param eos_token: defines end of sentence token (used for right padding)
    :return: list of sentences (each token of a sentence is now embedded)
    """
    with torch.no_grad():
        for sentence in sentences:
            token_subwords_mapping: Dict[int, int] = {}

            if ("gpt2" in name or "roberta" in name) and "xlm" not in name:
                (
                    token_subwords_mapping,
                    tokenized_string,
                ) = _build_token_subwords_mapping_gpt2(
                    sentence=sentence, tokenizer=tokenizer
                )
            else:
                (
                    token_subwords_mapping,
                    tokenized_string,
                ) = _build_token_subwords_mapping(
                    sentence=sentence, tokenizer=tokenizer
                )

            subwords = tokenizer.tokenize(tokenized_string)

            offset = 0

            if bos_token:
                subwords = [bos_token] + subwords
                offset = 1

            if eos_token:
                subwords = subwords + [eos_token]

            indexed_tokens = tokenizer.convert_tokens_to_ids(subwords)
            tokens_tensor = torch.tensor([indexed_tokens])
            tokens_tensor = tokens_tensor.to(flair.device)

            hidden_states = model(tokens_tensor)[-1]

            for token in sentence.tokens:
                len_subwords = token_subwords_mapping[token.idx]

                subtoken_embeddings = _extract_embeddings(
                    hidden_states=hidden_states,
                    layers=layers,
                    pooling_operation=pooling_operation,
                    subword_start_idx=offset,
                    subword_end_idx=offset + len_subwords,
                    use_scalar_mix=use_scalar_mix,
                )

                offset += len_subwords

                final_subtoken_embedding = torch.cat(subtoken_embeddings)
                token.set_embedding(name, final_subtoken_embedding)

    return sentences
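
# Offset bookkeeping in _get_transformer_sentence_embeddings, sketched for a
# sentence whose three tokens map to [2, 1, 3] subwords and a model with a BOS
# token (a descriptive note, not original code):
#   subwords = [<bos>, t1a, t1b, t2, t3a, t3b, t3c]
#   offset starts at 1 (skipping <bos>); token 1 reads subwords[1:3],
#   token 2 reads subwords[3:4], token 3 reads subwords[4:7].
# An EOS token, if any, is appended after the last subword and never read.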


class BertEmbeddings(TokenEmbeddings):

    @deprecated(
        version="0.4.5",
        reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings",
    )
    def __init__(
        self,
        bert_model_or_path: str = "bert-base-uncased",
        layers: str = "-1,-2,-3,-4",
        pooling_operation: str = "first",
        use_scalar_mix: bool = False,
    ):
1071 """
1072 Bidirectional transformer embeddings of words, as proposed in Devlin et al., 2018.
1073 :param bert_model_or_path: name of BERT model ('') or directory path containing custom model, configuration file
1074 and vocab file (names of three files should be - config.json, pytorch_model.bin/model.chkpt, vocab.txt)
1075 :param layers: string indicating which layers to take for embedding
1076 :param pooling_operation: how to get from token piece embeddings to token embedding. Either pool them and take
1077 the average ('mean') or use first word piece embedding as token embedding ('first)
1078 """
        super().__init__()

        if "distilbert" in bert_model_or_path:
            try:
                from transformers import DistilBertTokenizer, DistilBertModel
            except ImportError:
                log.warning("-" * 100)
                log.warning(
                    "ATTENTION! To use DistilBert, please first install a recent version of transformers!"
                )
                log.warning("-" * 100)
                pass

            self.tokenizer = DistilBertTokenizer.from_pretrained(bert_model_or_path)
            self.model = DistilBertModel.from_pretrained(
                pretrained_model_name_or_path=bert_model_or_path,
                output_hidden_states=True,
            )
        elif "albert" in bert_model_or_path:
            self.tokenizer = AlbertTokenizer.from_pretrained(bert_model_or_path)
            self.model = AlbertModel.from_pretrained(
                pretrained_model_name_or_path=bert_model_or_path,
                output_hidden_states=True,
            )
        else:
            self.tokenizer = BertTokenizer.from_pretrained(bert_model_or_path)
            self.model = BertModel.from_pretrained(
                pretrained_model_name_or_path=bert_model_or_path,
                output_hidden_states=True,
            )
        self.layer_indexes = [int(x) for x in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.name = str(bert_model_or_path)
        self.static_embeddings = True

    class BertInputFeatures(object):
        """Private helper class for holding BERT-formatted features"""

        def __init__(
            self,
            unique_id,
            tokens,
            input_ids,
            input_mask,
            input_type_ids,
            token_subtoken_count,
        ):
            self.unique_id = unique_id
            self.tokens = tokens
            self.input_ids = input_ids
            self.input_mask = input_mask
            self.input_type_ids = input_type_ids
            self.token_subtoken_count = token_subtoken_count

    def _convert_sentences_to_features(
        self, sentences, max_sequence_length: int
    ) -> [BertInputFeatures]:

        max_sequence_length = max_sequence_length + 2

        features: List[BertEmbeddings.BertInputFeatures] = []
        for (sentence_index, sentence) in enumerate(sentences):

            bert_tokenization: List[str] = []
            token_subtoken_count: Dict[int, int] = {}

            for token in sentence:
                subtokens = self.tokenizer.tokenize(token.text)
                bert_tokenization.extend(subtokens)
                token_subtoken_count[token.idx] = len(subtokens)

            if len(bert_tokenization) > max_sequence_length - 2:
                bert_tokenization = bert_tokenization[0 : (max_sequence_length - 2)]

            tokens = []
            input_type_ids = []
            tokens.append("[CLS]")
            input_type_ids.append(0)
            for token in bert_tokenization:
                tokens.append(token)
                input_type_ids.append(0)
            tokens.append("[SEP]")
            input_type_ids.append(0)

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_sequence_length:
                input_ids.append(0)
                input_mask.append(0)
                input_type_ids.append(0)

            features.append(
                BertEmbeddings.BertInputFeatures(
                    unique_id=sentence_index,
                    tokens=tokens,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    input_type_ids=input_type_ids,
                    token_subtoken_count=token_subtoken_count,
                )
            )

        return features

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        """Add embeddings to all words in a list of sentences. If embeddings are already added,
        updates only if embeddings are non-static."""

        # first, find longest sentence in batch
        longest_sentence_in_batch: int = len(
            max(
                [
                    self.tokenizer.tokenize(sentence.to_tokenized_string())
                    for sentence in sentences
                ],
                key=len,
            )
        )

        # prepare id maps for BERT model
        features = self._convert_sentences_to_features(
            sentences, longest_sentence_in_batch
        )
        all_input_ids = torch.LongTensor([f.input_ids for f in features]).to(
            flair.device
        )
        all_input_masks = torch.LongTensor([f.input_mask for f in features]).to(
            flair.device
        )

        # put encoded batch through BERT model to get all hidden states of all encoder layers
        self.model.to(flair.device)
        self.model.eval()
        all_encoder_layers = self.model(all_input_ids, attention_mask=all_input_masks)[
            -1
        ]

        with torch.no_grad():

            for sentence_index, sentence in enumerate(sentences):

                feature = features[sentence_index]

                # get aggregated embeddings for each BERT-subtoken in sentence
                subtoken_embeddings = []
                for token_index, _ in enumerate(feature.tokens):
                    all_layers = []
                    for layer_index in self.layer_indexes:
                        layer_output = all_encoder_layers[int(layer_index)][
                            sentence_index
                        ]
                        all_layers.append(layer_output[token_index])

                    if self.use_scalar_mix:
                        sm = ScalarMix(mixture_size=len(all_layers))
                        sm_embeddings = sm(all_layers)
                        all_layers = [sm_embeddings]

                    subtoken_embeddings.append(torch.cat(all_layers))

                # get the current sentence object
                token_idx = 0
                for token in sentence:
                    # add concatenated embedding to sentence
                    token_idx += 1

                    if self.pooling_operation == "first":
                        # use first subword embedding if pooling operation is 'first'
                        token.set_embedding(self.name, subtoken_embeddings[token_idx])
                    else:
                        # otherwise, do a mean over all subwords in token
                        embeddings = subtoken_embeddings[
                            token_idx : token_idx
                            + feature.token_subtoken_count[token.idx]
                        ]
                        embeddings = [
                            embedding.unsqueeze(0) for embedding in embeddings
                        ]
                        mean = torch.mean(torch.cat(embeddings, dim=0), dim=0)
                        token.set_embedding(self.name, mean)

                    token_idx += feature.token_subtoken_count[token.idx] - 1

        return sentences

    @property
    @abstractmethod
    def embedding_length(self) -> int:
        """Returns the length of the embedding vector."""
        return (
            len(self.layer_indexes) * self.model.config.hidden_size
            if not self.use_scalar_mix
            else self.model.config.hidden_size
        )
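

# --- Usage sketch (not part of the original module) ---------------------------
# Minimal example of the deprecated BertEmbeddings class; with its defaults it
# concatenates the last four hidden layers and, since pooling_operation="first",
# assigns each token the embedding of its first WordPiece.
def _bert_embeddings_example() -> Sentence:
    embeddings = BertEmbeddings("bert-base-uncased", layers="-1,-2,-3,-4")
    sentence = Sentence("I love Berlin .")
    embeddings.embed(sentence)
    return sentence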


class DocumentMeanEmbeddings(DocumentEmbeddings):

    @deprecated(
        version="0.3.1",
        reason="The functionality of this class is moved to 'DocumentPoolEmbeddings'",
    )
    def __init__(self, token_embeddings: List[TokenEmbeddings]):
        """The constructor takes a list of embeddings to be combined."""
        super().__init__()

        self.embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=token_embeddings
        )
        self.name: str = "document_mean"

        self.__embedding_length: int = self.embeddings.embedding_length

        self.to(flair.device)

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def embed(self, sentences: Union[List[Sentence], Sentence]):
        """Add embeddings to every sentence in the given list of sentences. If embeddings are already added, updates
        only if embeddings are non-static."""

        everything_embedded: bool = True

        # if only one sentence is passed, convert to list of sentences
        if type(sentences) is Sentence:
            sentences = [sentences]

        for sentence in sentences:
            if self.name not in sentence._embeddings.keys():
                everything_embedded = False

        if not everything_embedded:

            self.embeddings.embed(sentences)

            for sentence in sentences:
                word_embeddings = []
                for token in sentence.tokens:
                    word_embeddings.append(token.get_embedding().unsqueeze(0))

                word_embeddings = torch.cat(word_embeddings, dim=0).to(flair.device)

                mean_embedding = torch.mean(word_embeddings, 0)

                sentence.set_embedding(self.name, mean_embedding)

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        pass


class DocumentLSTMEmbeddings(DocumentEmbeddings):

    @deprecated(
        version="0.4",
        reason="The functionality of this class is moved to 'DocumentRNNEmbeddings'",
    )
    def __init__(
        self,
        embeddings: List[TokenEmbeddings],
        hidden_size=128,
        rnn_layers=1,
        reproject_words: bool = True,
        reproject_words_dimension: int = None,
        bidirectional: bool = False,
        dropout: float = 0.5,
        word_dropout: float = 0.0,
        locked_dropout: float = 0.0,
    ):
        """The constructor takes a list of embeddings to be combined.
        :param embeddings: a list of token embeddings
        :param hidden_size: the number of hidden states in the lstm
        :param rnn_layers: the number of layers for the lstm
        :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate
            linear layer before putting them into the lstm or not
        :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output
            dimension as before will be taken.
        :param bidirectional: boolean value, indicating whether to use a bidirectional lstm or not
        :param dropout: the dropout value to be used
        :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used
        :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used
        """
        super().__init__()

        self.embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings)

        self.reproject_words = reproject_words
        self.bidirectional = bidirectional

        self.length_of_all_token_embeddings: int = self.embeddings.embedding_length

        self.name = "document_lstm"
        self.static_embeddings = False

        self.__embedding_length: int = hidden_size
        if self.bidirectional:
            self.__embedding_length *= 4

        self.embeddings_dimension: int = self.length_of_all_token_embeddings
        if self.reproject_words and reproject_words_dimension is not None:
            self.embeddings_dimension = reproject_words_dimension

        # bidirectional LSTM on top of embedding layer
        self.word_reprojection_map = torch.nn.Linear(
            self.length_of_all_token_embeddings, self.embeddings_dimension
        )
        self.rnn = torch.nn.GRU(
            self.embeddings_dimension,
            hidden_size,
            num_layers=rnn_layers,
            bidirectional=self.bidirectional,
        )

        # dropouts
        if locked_dropout > 0.0:
            self.dropout: torch.nn.Module = LockedDropout(locked_dropout)
        else:
            self.dropout = torch.nn.Dropout(dropout)

        self.use_word_dropout: bool = word_dropout > 0.0
        if self.use_word_dropout:
            self.word_dropout = WordDropout(word_dropout)

        torch.nn.init.xavier_uniform_(self.word_reprojection_map.weight)

        self.to(flair.device)

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def embed(self, sentences: Union[List[Sentence], Sentence]):
        """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update
        only if embeddings are non-static."""

        if type(sentences) is Sentence:
            sentences = [sentences]

        self.rnn.zero_grad()

        sentences.sort(key=lambda x: len(x), reverse=True)

        self.embeddings.embed(sentences)

        # first, sort sentences by number of tokens
        longest_token_sequence_in_batch: int = len(sentences[0])

        all_sentence_tensors = []
        lengths: List[int] = []

        # go through each sentence in batch
        for i, sentence in enumerate(sentences):

            lengths.append(len(sentence.tokens))

            word_embeddings = []

            for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))):
                word_embeddings.append(token.get_embedding().unsqueeze(0))

            # PADDING: pad shorter sentences out
            for add in range(longest_token_sequence_in_batch - len(sentence.tokens)):
                word_embeddings.append(
                    torch.zeros(
                        self.length_of_all_token_embeddings, dtype=torch.float
                    ).unsqueeze(0).to(flair.device)
                )

            word_embeddings_tensor = torch.cat(word_embeddings, 0).to(flair.device)

            sentence_states = word_embeddings_tensor

            # ADD TO SENTENCE LIST: add the representation
            all_sentence_tensors.append(sentence_states.unsqueeze(1))

        # --------------------------------------------------------------------
        # GET REPRESENTATION FOR ENTIRE BATCH
        # --------------------------------------------------------------------
        sentence_tensor = torch.cat(all_sentence_tensors, 1)

        # --------------------------------------------------------------------
        # FF PART
        # --------------------------------------------------------------------
        # use word dropout if set
        if self.use_word_dropout:
            sentence_tensor = self.word_dropout(sentence_tensor)

        if self.reproject_words:
            sentence_tensor = self.word_reprojection_map(sentence_tensor)

        sentence_tensor = self.dropout(sentence_tensor)

        packed = torch.nn.utils.rnn.pack_padded_sequence(sentence_tensor, lengths)

        self.rnn.flatten_parameters()

        lstm_out, hidden = self.rnn(packed)

        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)

        outputs = self.dropout(outputs)

        # --------------------------------------------------------------------
        # EXTRACT EMBEDDINGS FROM LSTM
        # --------------------------------------------------------------------
        for sentence_no, length in enumerate(lengths):
            last_rep = outputs[length - 1, sentence_no]

            embedding = last_rep
            if self.bidirectional:
                first_rep = outputs[0, sentence_no]
                embedding = torch.cat([first_rep, last_rep], 0)

            sentence = sentences[sentence_no]
            sentence.set_embedding(self.name, embedding)

    def _add_embeddings_internal(self, sentences: List[Sentence]):
        pass
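

# --- Usage sketch (not part of the original module) ---------------------------
# Minimal example of the deprecated DocumentLSTMEmbeddings class, which runs a
# GRU over stacked token embeddings and stores the final hidden state as the
# sentence embedding. WordEmbeddings("glove") is an assumed stand-in here; any
# TokenEmbeddings instance works.
def _document_lstm_embeddings_example() -> Sentence:
    from flair.embeddings.token import WordEmbeddings

    document_embeddings = DocumentLSTMEmbeddings(
        [WordEmbeddings("glove")], hidden_size=128, bidirectional=False
    )
    sentence = Sentence("The grass is green .")
    document_embeddings.embed(sentence)
    return sentence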


class ELMoTransformerEmbeddings(TokenEmbeddings):
    """Contextual word embeddings using word-level Transformer-based LM, as proposed in Peters et al., 2018."""

    @deprecated(
        version="0.4.2",
        reason="Not possible to load or save ELMo Transformer models. @stefan-it is working on it.",
    )
    def __init__(self, model_file: str):
        super().__init__()

        try:
            from allennlp.modules.token_embedders.bidirectional_language_model_token_embedder import (
                BidirectionalLanguageModelTokenEmbedder,
            )
            from allennlp.data.token_indexers.elmo_indexer import (
                ELMoTokenCharactersIndexer,
            )
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "allennlp" is not installed!')
            log.warning(
                "To use ELMoTransformerEmbeddings, please first install a recent version from https://github.com/allenai/allennlp"
            )
            log.warning("-" * 100)
            pass

        self.name = "elmo-transformer"
        self.static_embeddings = True
        self.lm_embedder = BidirectionalLanguageModelTokenEmbedder(
            archive_file=model_file,
            dropout=0.2,
            bos_eos_tokens=("<S>", "</S>"),
            remove_bos_eos=True,
            requires_grad=False,
        )
        self.lm_embedder = self.lm_embedder.to(device=flair.device)
        self.vocab = self.lm_embedder._lm.vocab
        self.indexer = ELMoTokenCharactersIndexer()

        # embed a dummy sentence to determine embedding_length
        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        # Avoid conflicts with flair's Token class
        import allennlp.data.tokenizers.token as allen_nlp_token

        indexer = self.indexer
        vocab = self.vocab

        for sentence in sentences:
            character_indices = indexer.tokens_to_indices(
                [allen_nlp_token.Token(token.text) for token in sentence], vocab, "elmo"
            )["elmo"]

            indices_tensor = torch.LongTensor([character_indices])
            indices_tensor = indices_tensor.to(device=flair.device)
            embeddings = self.lm_embedder(indices_tensor)[0].detach().cpu().numpy()

            for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))):
                embedding = embeddings[token_idx]
                word_embedding = torch.FloatTensor(embedding)
                token.set_embedding(self.name, word_embedding)

        return sentences

    def extra_repr(self):
        return "model={}".format(self.name)

    def __str__(self):
        return self.name