Coverage for flair/flair/tokenization.py: 39%
import logging
from abc import ABC, abstractmethod
from typing import List, Callable, Optional

from more_itertools import stagger
from segtok.segmenter import split_single, split_multi
from segtok.tokenizer import split_contractions, word_tokenizer

from flair.data import Sentence, Token, Tokenizer

log = logging.getLogger("flair")


class SpacyTokenizer(Tokenizer):
    """
    Implementation of :class:`Tokenizer`, using models from Spacy.

    :param model: a Spacy V2 model or the name of the model to load.
    """

    def __init__(self, model):
        super(SpacyTokenizer, self).__init__()

        try:
            import spacy
            from spacy.language import Language
        except ImportError:
            raise ImportError(
                "Please install Spacy v2.0 or higher before using the Spacy tokenizer, "
                "otherwise you can use SegtokTokenizer as an advanced tokenizer."
            )

        if isinstance(model, Language):
            self.model: Language = model
        elif isinstance(model, str):
            self.model: Language = spacy.load(model)
        else:
            raise AssertionError(
                "Unexpected type of parameter model. Please provide a loaded "
                "spacy model or the name of the model to load."
            )

    def tokenize(self, text: str) -> List[Token]:
        from spacy.tokens.doc import Doc
        from spacy.tokens.token import Token as SpacyToken

        doc: Doc = self.model.make_doc(text)
        previous_token = None
        tokens: List[Token] = []
        for word in doc:
            word: SpacyToken = word
            if len(word.text.strip()) == 0:
                continue

            token = Token(
                text=word.text, start_position=word.idx, whitespace_after=True
            )
            tokens.append(token)

            # if this token starts exactly where the previous one ended, there was
            # no whitespace between them in the original text
            if (previous_token is not None) and (
                token.start_pos == previous_token.start_pos + len(previous_token.text)
            ):
                previous_token.whitespace_after = False

            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
        )
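

# Usage sketch (illustrative, not part of this module's API): tokenizing one
# sentence with a spaCy pipeline. The model name "en_core_web_sm" is an
# assumption -- any installed spaCy model, or an already loaded Language
# object, can be passed instead.
def _example_spacy_tokenizer() -> None:
    tokenizer = SpacyTokenizer("en_core_web_sm")  # assumes this model is installed
    for token in tokenizer.tokenize("Berlin is the capital of Germany."):
        # each Token carries its text, character offset and whitespace_after flag
        print(token.text, token.start_pos, token.whitespace_after)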


class SegtokTokenizer(Tokenizer):
    """
    Tokenizer using segtok, a third-party library for rule-based tokenization
    of Indo-European languages.

    For further details see: https://github.com/fnl/segtok
    """

    def __init__(self):
        super(SegtokTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SegtokTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                # the tokenizer altered the surface form, so fall back to an
                # estimated offset based on the previous word
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            if word:
                token = Token(
                    text=word, start_position=start_position, whitespace_after=True
                )
                tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens
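

# Usage sketch (illustrative): segtok needs no model download, so this runs as
# soon as the module's dependencies are installed. Contractions such as "it's"
# are split into separate tokens by split_contractions.
def _example_segtok_tokenizer() -> None:
    tokens = SegtokTokenizer().tokenize("It's a rule-based tokenizer, isn't it?")
    print([token.text for token in tokens])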


class SpaceTokenizer(Tokenizer):
    """
    Tokenizer based on space character only.
    """

    def __init__(self):
        super(SpaceTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SpaceTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        word = ""
        index = -1
        for index, char in enumerate(text):
            if char == " ":
                if len(word) > 0:
                    start_position = index - len(word)
                    tokens.append(
                        Token(
                            text=word, start_position=start_position, whitespace_after=True
                        )
                    )

                word = ""
            else:
                word += char
        # increment for last token in sentence if not followed by whitespace
        index += 1
        if len(word) > 0:
            start_position = index - len(word)
            tokens.append(
                Token(text=word, start_position=start_position, whitespace_after=False)
            )

        return tokens
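

# Usage sketch (illustrative): splitting on single spaces only, so punctuation
# stays attached to the neighbouring word.
def _example_space_tokenizer() -> None:
    tokens = SpaceTokenizer().tokenize("Hello world, how are you?")
    print([(token.text, token.start_pos) for token in tokens])
    # -> [('Hello', 0), ('world,', 6), ('how', 13), ('are', 17), ('you?', 21)]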


class JapaneseTokenizer(Tokenizer):
    """
    Tokenizer using konoha, a third-party library which supports
    multiple Japanese tokenizers such as MeCab, Janome and SudachiPy.

    For further details see:
        https://github.com/himkt/konoha
    """

    def __init__(self, tokenizer: str, sudachi_mode: str = "A"):
        super(JapaneseTokenizer, self).__init__()

        available_tokenizers = ["mecab", "janome", "sudachi"]

        if tokenizer.lower() not in available_tokenizers:
            raise NotImplementedError(
                f"The tokenizer '{tokenizer}' is not supported. Supported tokenizers: {available_tokenizers}."
            )

        try:
            import konoha
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "konoha" is not installed!')
            log.warning(
                '- If you want to use MeCab, install mecab with "sudo apt install mecab libmecab-dev mecab-ipadic".'
            )
            log.warning('- Install konoha with "pip install konoha[{tokenizer_name}]"')
            log.warning('  - You can choose a tokenizer from ["mecab", "janome", "sudachi"].')
            log.warning("-" * 100)
            exit()

        self.tokenizer = tokenizer
        self.sentence_tokenizer = konoha.SentenceTokenizer()
        self.word_tokenizer = konoha.WordTokenizer(tokenizer, mode=sudachi_mode)

    def tokenize(self, text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = self.sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            konoha_tokens = self.word_tokenizer.tokenize(sentence)
            words.extend(list(map(str, konoha_tokens)))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            token = Token(
                text=word, start_position=start_position, whitespace_after=True
            )
            tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.tokenizer
        )
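

# Usage sketch (illustrative): requires konoha plus the chosen backend, e.g.
# "pip install konoha[janome]". The choice of the "janome" backend here is an
# assumption; "mecab" and "sudachi" work the same way.
def _example_japanese_tokenizer() -> None:
    tokenizer = JapaneseTokenizer("janome")
    print([token.text for token in tokenizer.tokenize("私はベルリンが好きです。")])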


class TokenizerWrapper(Tokenizer):
    """
    Helper class to wrap tokenizer functions into the class-based tokenizer interface.
    """

    def __init__(self, tokenizer_func: Callable[[str], List[Token]]):
        super(TokenizerWrapper, self).__init__()
        self.tokenizer_func = tokenizer_func

    def tokenize(self, text: str) -> List[Token]:
        return self.tokenizer_func(text)

    @property
    def name(self) -> str:
        return self.__class__.__name__ + "_" + self.tokenizer_func.__name__
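

# Usage sketch (illustrative): wrapping a plain function so it satisfies the
# Tokenizer interface. The helper function below is hypothetical and only
# reuses the static SpaceTokenizer implementation defined above.
def _whitespace_tokenize(text: str) -> List[Token]:
    return SpaceTokenizer.run_tokenize(text)


def _example_tokenizer_wrapper() -> None:
    wrapper = TokenizerWrapper(_whitespace_tokenize)
    print(wrapper.name)  # -> "TokenizerWrapper__whitespace_tokenize"
    print([token.text for token in wrapper.tokenize("wrap any callable")])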


class SciSpacyTokenizer(Tokenizer):
    """
    Implementation of :class:`Tokenizer` which uses the en_core_sci_sm Spacy model
    extended by special heuristics to consider characters such as "(", ")" and "-" as
    additional token separators. The latter distinguishes this implementation from
    :class:`SpacyTokenizer`.

    Note: if you want to use the "normal" SciSpacy tokenization, just use
    :class:`SpacyTokenizer`.
    """

    def __init__(self):
        super(SciSpacyTokenizer, self).__init__()

        try:
            import spacy
            from spacy.lang import char_classes
        except ImportError:
            raise ImportError(
                " Please install scispacy version 0.2.5 (recommended) or higher before using the SciSpacy tokenizer, "
                "otherwise you can use SegtokTokenizer as alternative implementation.\n"
                " You can install scispacy (version 0.2.5) by running:\n\n"
                "     pip install scispacy==0.2.5\n\n"
                " By default HunFlair uses the `en_core_sci_sm` model. You can install the model by running:\n\n"
                "     pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz\n\n"
                " Note that the scispacy version and the version of the model must match to work properly!"
            )

        def combined_rule_prefixes() -> List[str]:
            """Helper function that returns the prefix pattern for the tokenizer.
            It is a helper function to accommodate spacy tests that only test
            prefixes.
            """
            prefix_punct = char_classes.PUNCT.replace("|", " ")

            prefixes = (
                ["§", "%", "=", r"\+"]
                + char_classes.split_chars(prefix_punct)
                + char_classes.LIST_ELLIPSES
                + char_classes.LIST_QUOTES
                + char_classes.LIST_CURRENCY
                + char_classes.LIST_ICONS
            )
            return prefixes

        infixes = (
            char_classes.LIST_ELLIPSES
            + char_classes.LIST_ICONS
            + [
                r"×",  # added this special x character to tokenize it separately
                r"[\(\)\[\]\{\}]",  # want to split at every bracket
                r"/",  # want to split at every slash
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}])\.(?=[{au}])".format(
                    al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER
                ),
                r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA),
                r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                    a=char_classes.ALPHA, h=char_classes.HYPHENS
                ),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=char_classes.ALPHA),
            ]
        )

        prefix_re = spacy.util.compile_prefix_regex(combined_rule_prefixes())
        infix_re = spacy.util.compile_infix_regex(infixes)

        self.model = spacy.load(
            "en_core_sci_sm", disable=["tagger", "ner", "parser", "textcat", "lemmatizer"]
        )
        self.model.tokenizer.prefix_search = prefix_re.search
        self.model.tokenizer.infix_finditer = infix_re.finditer

    def tokenize(self, text: str) -> List[Token]:
        from spacy.tokens.token import Token as SpacyToken

        sentence = self.model(text)

        previous_token = None
        tokens: List[Token] = []
        for word in sentence:
            word: SpacyToken = word
            token = Token(
                text=word.text, start_position=word.idx, whitespace_after=True
            )
            tokens.append(token)

            if (previous_token is not None) and (
                token.start_pos == previous_token.start_pos + len(previous_token.text)
            ) and (not word.text[0].isspace()):
                previous_token.whitespace_after = False

            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
        )
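

# Usage sketch (illustrative): requires scispacy and the en_core_sci_sm model,
# see the install hints in SciSpacyTokenizer.__init__ above. The extra infix
# rules split brackets off as separate tokens.
def _example_scispacy_tokenizer() -> None:
    tokenizer = SciSpacyTokenizer()
    tokens = tokenizer.tokenize("Induction of cytokine expression (IL-6) in mice")
    print([token.text for token in tokens])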


class SentenceSplitter(ABC):
    r"""An abstract class representing a :class:`SentenceSplitter`.

    Sentence splitters are used to represent algorithms and models to split plain text into
    sentences and individual tokens / words. All subclasses should overwrite :meth:`split`,
    which splits the given plain text into a sequence of sentences (:class:`Sentence`). The
    individual sentences are in turn subdivided into tokens / words. In most cases, this can
    be controlled by passing a custom implementation of :class:`Tokenizer`.

    Moreover, subclasses may overwrite :meth:`name`, returning a unique identifier representing
    the sentence splitter's configuration.
    """

    @abstractmethod
    def split(self, text: str) -> List[Sentence]:
        raise NotImplementedError()

    @property
    def name(self) -> str:
        return self.__class__.__name__

    @property
    def tokenizer(self) -> Tokenizer:
        raise NotImplementedError()

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        raise NotImplementedError()


class SegtokSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` using the SegTok library.

    For further details see: https://github.com/fnl/segtok
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(SegtokSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        plain_sentences: List[str] = list(split_multi(text))

        try:
            sentence_offset: Optional[int] = text.index(plain_sentences[0])
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
                f"from the text's starting position"
            ) from error

        sentences: List[Sentence] = []
        for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):
            sentences.append(
                Sentence(
                    text=sentence,
                    use_tokenizer=self._tokenizer,
                    start_position=sentence_offset
                )
            )

            offset: int = sentence_offset + len(sentence)
            try:
                sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
            except ValueError as error:
                raise AssertionError(
                    f"Can't find the sentence offset for sentence {repr(sentence)} "
                    f"starting from position {repr(offset)}"
                ) from error

        return sentences

    @property
    def name(self) -> str:
        return self.__class__.__name__

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value
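

# Usage sketch (illustrative): splitting raw text into Sentence objects that
# keep track of their start position, using the default SegtokTokenizer.
def _example_segtok_sentence_splitter() -> None:
    splitter = SegtokSentenceSplitter()
    for sentence in splitter.split("This is a sentence. This is another one."):
        print(sentence)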


class SpacySentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter`, using models from Spacy.

    :param model: a Spacy V2 model or the name of the model to load.
    :param tokenizer: a custom tokenizer to use (default :class:`SpacyTokenizer`)
    """

    def __init__(self, model: str, tokenizer: Optional[Tokenizer] = None):
        super(SpacySentenceSplitter, self).__init__()

        try:
            import spacy
            from spacy.language import Language
        except ImportError:
            raise ImportError(
                "Please install spacy v2.3.2 or higher before using the SpacySentenceSplitter, "
                "otherwise you can use SegtokSentenceSplitter as alternative implementation."
            )

        if isinstance(model, Language):
            self.model: Language = model
        elif isinstance(model, str):
            self.model: Language = spacy.load(model)

        if tokenizer is None:
            self._tokenizer = SpacyTokenizer("en_core_sci_sm")
        else:
            self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        document = self.model(text)

        sentences = [
            Sentence(
                text=str(spacy_sent),
                use_tokenizer=self._tokenizer,
                start_position=spacy_sent.start_char
            )
            for spacy_sent in document.sents
            if len(str(spacy_sent)) > 0
        ]

        return sentences

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
            + "_"
            + self._tokenizer.name
        )
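

# Usage sketch (illustrative): the model name "en_core_web_sm" and the use of
# SegtokTokenizer for within-sentence tokenization are assumptions, not
# defaults of this class.
def _example_spacy_sentence_splitter() -> None:
    splitter = SpacySentenceSplitter("en_core_web_sm", tokenizer=SegtokTokenizer())
    sentences = splitter.split("Sentence one. Sentence two.")
    print(len(sentences))  # typically 2 sentences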


class SciSpacySentenceSplitter(SpacySentenceSplitter):
    """
    Convenience class to instantiate :class:`SpacySentenceSplitter` with the Spacy model `en_core_sci_sm`
    for sentence splitting and :class:`SciSpacyTokenizer` as tokenizer.
    """

    def __init__(self):
        super(SciSpacySentenceSplitter, self).__init__("en_core_sci_sm", SciSpacyTokenizer())


class TagSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` which assumes that there is a special tag within
    the text that is used to mark sentence boundaries.
    """

    def __init__(self, tag: str, tokenizer: Tokenizer = SegtokTokenizer()):
        super(TagSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer
        self.tag = tag

    def split(self, text: str) -> List[Sentence]:
        plain_sentences = text.split(self.tag)

        sentences = []
        last_offset = 0

        for sentence in plain_sentences:
            if len(sentence.strip()) == 0:
                continue

            sentences += [
                Sentence(
                    text=sentence,
                    use_tokenizer=self._tokenizer,
                    start_position=last_offset
                )
            ]

            last_offset += len(sentence) + len(self.tag)

        return sentences

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.tag
            + "_"
            + self._tokenizer.name
        )
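

# Usage sketch (illustrative): for documents whose sentences are already
# segmented and joined by a marker tag. The marker "[SEP]" is a hypothetical
# example, any string can serve as the tag.
def _example_tag_sentence_splitter() -> None:
    splitter = TagSentenceSplitter(tag="[SEP]")
    sentences = splitter.split("First sentence.[SEP]Second sentence.")
    print(len(sentences))  # -> 2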


class NewlineSentenceSplitter(TagSentenceSplitter):
    r"""
    Convenience class to instantiate :class:`TagSentenceSplitter` with newline ("\n") as
    sentence boundary marker.
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(NewlineSentenceSplitter, self).__init__(tag="\n", tokenizer=tokenizer)

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self._tokenizer.name
        )


class NoSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` which treats the complete text as one sentence.
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(NoSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        return [
            Sentence(
                text=text,
                use_tokenizer=self._tokenizer,
                start_position=0
            )
        ]

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self._tokenizer.name
        )
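

# Usage sketch (illustrative): keeping the whole input as a single Sentence,
# useful when the input is already one sentence per document upstream.
def _example_no_sentence_splitter() -> None:
    splitter = NoSentenceSplitter()
    sentences = splitter.split("Everything stays in one sentence, even this. And this.")
    print(len(sentences))  # -> 1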