Coverage for flair/flair/tokenization.py: 63%


304 statements  

import logging
from abc import ABC, abstractmethod
from typing import List, Callable, Optional

from more_itertools import stagger
from segtok.segmenter import split_single, split_multi
from segtok.tokenizer import split_contractions, word_tokenizer

from flair.data import Sentence, Token, Tokenizer

log = logging.getLogger("flair")

class SpacyTokenizer(Tokenizer):
    """
    Implementation of :class:`Tokenizer`, using models from Spacy.

    :param model: a Spacy V2 model or the name of the model to load.
    """

    def __init__(self, model):
        super(SpacyTokenizer, self).__init__()

        try:
            import spacy
            from spacy.language import Language
        except ImportError:
            raise ImportError(
                "Please install Spacy v2.0 or higher before using the Spacy tokenizer, "
                "otherwise you can use SegtokTokenizer as an alternative tokenizer."
            )

        if isinstance(model, Language):
            self.model: Language = model
        elif isinstance(model, str):
            self.model: Language = spacy.load(model)
        else:
            raise AssertionError(
                "Unexpected type of parameter model. Please provide a loaded "
                "spacy model or the name of the model to load."
            )

    def tokenize(self, text: str) -> List[Token]:
        from spacy.tokens.doc import Doc
        from spacy.tokens.token import Token as SpacyToken

        doc: Doc = self.model.make_doc(text)
        previous_token = None
        tokens: List[Token] = []
        for word in doc:
            word: SpacyToken = word
            if len(word.text.strip()) == 0:
                continue

            token = Token(
                text=word.text, start_position=word.idx, whitespace_after=True
            )
            tokens.append(token)

            # the previous token has no trailing whitespace if this token starts
            # directly at its end position
            if (previous_token is not None) and (
                token.start_pos == previous_token.start_pos + len(previous_token.text)
            ):
                previous_token.whitespace_after = False

            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
        )
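# Usage sketch (illustrative, not part of the module): tokenizing a string with
# SpacyTokenizer. The model name "en_core_web_sm" and the example sentence are
# assumptions; any installed Spacy pipeline or an already loaded Language object works.
#
#   tokenizer = SpacyTokenizer("en_core_web_sm")
#   tokens = tokenizer.tokenize("Flair wraps Spacy's tokenization.")
#   print([(t.text, t.start_pos, t.whitespace_after) for t in tokens])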

class SegtokTokenizer(Tokenizer):
    """
    Tokenizer using segtok, a third-party library for rule-based tokenization of
    Indo-European languages.

    For further details see: https://github.com/fnl/segtok
    """

    def __init__(self):
        super(SegtokTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SegtokTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                # the tokenizer may normalize characters, so the word is not always
                # found verbatim; fall back to the previous offset
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            if word:
                token = Token(
                    text=word, start_position=start_position, whitespace_after=True
                )
                tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens
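# Usage sketch (illustrative, not part of the module): SegtokTokenizer needs no model
# download, which makes it a good default when Spacy is not available. The example
# sentence is an assumption.
#
#   tokenizer = SegtokTokenizer()
#   tokens = tokenizer.tokenize("Mr. Smith isn't here today.")
#   print([t.text for t in tokens])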

class SpaceTokenizer(Tokenizer):
    """
    Tokenizer based on space character only.
    """

    def __init__(self):
        super(SpaceTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SpaceTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        word = ""
        index = -1
        for index, char in enumerate(text):
            if char == " ":
                if len(word) > 0:
                    start_position = index - len(word)
                    tokens.append(
                        Token(
                            text=word, start_position=start_position, whitespace_after=True
                        )
                    )

                word = ""
            else:
                word += char
        # increment for last token in sentence if not followed by whitespace
        index += 1
        if len(word) > 0:
            start_position = index - len(word)
            tokens.append(
                Token(text=word, start_position=start_position, whitespace_after=False)
            )

        return tokens
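# Usage sketch (illustrative, not part of the module): SpaceTokenizer splits on the
# space character only, so punctuation stays attached to the neighbouring word.
#
#   tokens = SpaceTokenizer().tokenize("Hello world, how are you?")
#   print([t.text for t in tokens])  # ['Hello', 'world,', 'how', 'are', 'you?']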

class JapaneseTokenizer(Tokenizer):
    """
    Tokenizer using konoha, a third-party library which supports
    multiple Japanese tokenizers such as MeCab, Janome and SudachiPy.

    For further details see:
        https://github.com/himkt/konoha
    """

    def __init__(self, tokenizer: str, sudachi_mode: str = "A"):
        super(JapaneseTokenizer, self).__init__()

        available_tokenizers = ["mecab", "janome", "sudachi"]

        if tokenizer.lower() not in available_tokenizers:
            raise NotImplementedError(
                f"Currently, {tokenizer} is not supported. Supported tokenizers: {available_tokenizers}."
            )

        try:
            import konoha
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "konoha" is not installed!')
            log.warning(
                '- If you want to use MeCab, install mecab with "sudo apt install mecab libmecab-dev mecab-ipadic".'
            )
            log.warning(f'- Install konoha with "pip install konoha[{tokenizer}]"')
            log.warning('  - You can choose a tokenizer from ["mecab", "janome", "sudachi"].')
            log.warning("-" * 100)
            exit()

        self.tokenizer = tokenizer
        self.sentence_tokenizer = konoha.SentenceTokenizer()
        self.word_tokenizer = konoha.WordTokenizer(tokenizer, mode=sudachi_mode)

    def tokenize(self, text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = self.sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            konoha_tokens = self.word_tokenizer.tokenize(sentence)
            words.extend(list(map(str, konoha_tokens)))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            token = Token(
                text=word, start_position=start_position, whitespace_after=True
            )
            tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.tokenizer
        )
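# Usage sketch (illustrative, not part of the module): requires a konoha backend,
# e.g. "pip install konoha[janome]". The example sentence is an assumption.
#
#   tokenizer = JapaneseTokenizer("janome")
#   tokens = tokenizer.tokenize("私はベルリンが好きです。")
#   print([t.text for t in tokens])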

class TokenizerWrapper(Tokenizer):
    """
    Helper class to wrap tokenizer functions to the class-based tokenizer interface.
    """

    def __init__(self, tokenizer_func: Callable[[str], List[Token]]):
        super(TokenizerWrapper, self).__init__()
        self.tokenizer_func = tokenizer_func

    def tokenize(self, text: str) -> List[Token]:
        return self.tokenizer_func(text)

    @property
    def name(self) -> str:
        return self.__class__.__name__ + "_" + self.tokenizer_func.__name__
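# Usage sketch (illustrative, not part of the module): wrapping a plain function in
# the Tokenizer interface. The helper below is hypothetical and only splits on
# whitespace, without computing start positions.
#
#   def whitespace_tokenizer(text: str) -> List[Token]:
#       return [Token(text=word) for word in text.split()]
#
#   tokenizer = TokenizerWrapper(whitespace_tokenizer)
#   print(tokenizer.name)  # "TokenizerWrapper_whitespace_tokenizer"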

class SciSpacyTokenizer(Tokenizer):
    """
    Implementation of :class:`Tokenizer` which uses the en_core_sci_sm Spacy model
    extended by special heuristics to consider characters such as "(", ")" and "-" as
    additional token separators. The latter distinguishes this implementation from
    :class:`SpacyTokenizer`.

    Note: if you want to use the "normal" SciSpacy tokenization, just use
    :class:`SpacyTokenizer`.
    """

    def __init__(self):
        super(SciSpacyTokenizer, self).__init__()

        try:
            import spacy
            from spacy.lang import char_classes
        except ImportError:
            raise ImportError(
                "  Please install scispacy version 0.2.5 (recommended) or higher before using the SciSpacy tokenizer, "
                "otherwise you can use SegtokTokenizer as an alternative implementation.\n"
                "  You can install scispacy (version 0.2.5) by running:\n\n"
                "     pip install scispacy==0.2.5\n\n"
                "  By default HunFlair uses the `en_core_sci_sm` model. You can install the model by running:\n\n"
                "     pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz\n\n"
                "  Note that the scispacy version and the version of the model must match to work properly!"
            )

        def combined_rule_prefixes() -> List[str]:
            """Helper function that returns the prefix pattern for the tokenizer.
            It is a helper function to accommodate spacy tests that only test
            prefixes.
            """
            prefix_punct = char_classes.PUNCT.replace("|", " ")

            prefixes = (
                ["§", "%", "=", r"\+"]
                + char_classes.split_chars(prefix_punct)
                + char_classes.LIST_ELLIPSES
                + char_classes.LIST_QUOTES
                + char_classes.LIST_CURRENCY
                + char_classes.LIST_ICONS
            )
            return prefixes

        infixes = (
            char_classes.LIST_ELLIPSES
            + char_classes.LIST_ICONS
            + [
                r"×",  # added this special x character to tokenize it separately
                r"[\(\)\[\]\{\}]",  # want to split at every bracket
                r"/",  # want to split at every slash
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}])\.(?=[{au}])".format(
                    al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER
                ),
                r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA),
                r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                    a=char_classes.ALPHA, h=char_classes.HYPHENS
                ),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=char_classes.ALPHA),
            ]
        )

        prefix_re = spacy.util.compile_prefix_regex(combined_rule_prefixes())
        infix_re = spacy.util.compile_infix_regex(infixes)

        self.model = spacy.load(
            "en_core_sci_sm", disable=["tagger", "ner", "parser", "textcat", "lemmatizer"]
        )
        self.model.tokenizer.prefix_search = prefix_re.search
        self.model.tokenizer.infix_finditer = infix_re.finditer

    def tokenize(self, text: str) -> List[Token]:
        from spacy.tokens.token import Token as SpacyToken

        sentence = self.model(text)

        previous_token = None
        tokens: List[Token] = []
        for word in sentence:
            word: SpacyToken = word
            token = Token(
                text=word.text, start_position=word.idx, whitespace_after=True
            )
            tokens.append(token)

            if (previous_token is not None) and (
                token.start_pos == previous_token.start_pos + len(previous_token.text)
            ) and (not word.text[0].isspace()):
                previous_token.whitespace_after = False

            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
        )
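# Usage sketch (illustrative, not part of the module): requires scispacy and the
# en_core_sci_sm model (see the ImportError message above for install commands).
# The example text is an assumption.
#
#   tokenizer = SciSpacyTokenizer()
#   tokens = tokenizer.tokenize("Aspirin (acetylsalicylic acid) inhibits COX-1/COX-2.")
#   print([t.text for t in tokens])  # brackets, slashes and hyphens become separate tokens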

class SentenceSplitter(ABC):
    r"""An abstract class representing a :class:`SentenceSplitter`.

    Sentence splitters are used to represent algorithms and models that split plain text into
    sentences and individual tokens / words. All subclasses should override :meth:`split`,
    which splits the given plain text into a sequence of sentences (:class:`Sentence`). The
    individual sentences are in turn subdivided into tokens / words. In most cases, this can
    be controlled by passing a custom implementation of :class:`Tokenizer`.

    Moreover, subclasses may override :meth:`name`, returning a unique identifier representing
    the sentence splitter's configuration.
    """

    @abstractmethod
    def split(self, text: str) -> List[Sentence]:
        raise NotImplementedError()

    @property
    def name(self) -> str:
        return self.__class__.__name__

    @property
    def tokenizer(self) -> Tokenizer:
        raise NotImplementedError()

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        raise NotImplementedError()
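# Subclassing sketch (illustrative, not part of the module): a minimal custom splitter
# only has to implement split(); name and the tokenizer property are optional overrides.
# The class below is hypothetical and ignores character offsets for brevity.
#
#   class PipeSentenceSplitter(SentenceSplitter):
#       def split(self, text: str) -> List[Sentence]:
#           return [Sentence(text=part) for part in text.split("|") if part.strip()]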

class SegtokSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` using the segtok library.

    For further details see: https://github.com/fnl/segtok
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(SegtokSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        plain_sentences: List[str] = list(split_multi(text))

        try:
            sentence_offset: Optional[int] = text.index(plain_sentences[0])
        except ValueError as error:
            raise AssertionError(
                f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
                f"from the text's starting position"
            ) from error

        sentences: List[Sentence] = []
        for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):

            sentences.append(
                Sentence(
                    text=sentence,
                    use_tokenizer=self._tokenizer,
                    start_position=sentence_offset
                )
            )

            offset: int = sentence_offset + len(sentence)
            try:
                sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
            except ValueError as error:
                raise AssertionError(
                    f"Can't find the sentence offset for sentence {repr(next_sentence)} "
                    f"starting from position {repr(offset)}"
                ) from error

        return sentences

    @property
    def name(self) -> str:
        return self.__class__.__name__

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value
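# Usage sketch (illustrative, not part of the module): splitting a small document into
# Sentence objects; each Sentence is assumed to expose the start_position passed above
# as start_pos. The input string is an assumption.
#
#   splitter = SegtokSentenceSplitter()
#   sentences = splitter.split("This is a sentence. This is another one.")
#   print([(s.start_pos, s.to_original_text()) for s in sentences])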

class SpacySentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter`, using models from Spacy.

    :param model: a Spacy V2 model or the name of the model to load.
    :param tokenizer: a custom tokenizer to use (default: :class:`SpacyTokenizer`)
    """

    def __init__(self, model: str, tokenizer: Tokenizer = None):
        super(SpacySentenceSplitter, self).__init__()

        try:
            import spacy
            from spacy.language import Language
        except ImportError:
            raise ImportError(
                "Please install spacy v2.3.2 or higher before using the SpacySentenceSplitter, "
                "otherwise you can use SegtokSentenceSplitter as an alternative implementation."
            )

        if isinstance(model, Language):
            self.model: Language = model
        elif isinstance(model, str):
            self.model: Language = spacy.load(model)

        if tokenizer is None:
            self._tokenizer = SpacyTokenizer("en_core_sci_sm")
        else:
            self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        document = self.model(text)

        sentences = [
            Sentence(
                text=str(spacy_sent),
                use_tokenizer=self._tokenizer,
                start_position=spacy_sent.start_char
            )
            for spacy_sent in document.sents
            if len(str(spacy_sent)) > 0
        ]

        return sentences

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
            + "_"
            + self._tokenizer.name
        )
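# Usage sketch (illustrative, not part of the module): the model name "en_core_web_sm"
# and the input text are assumptions. Passing an explicit tokenizer avoids the
# en_core_sci_sm default, which requires scispacy.
#
#   splitter = SpacySentenceSplitter("en_core_web_sm", tokenizer=SegtokTokenizer())
#   sentences = splitter.split("Sentence one. Sentence two.")
#   print(len(sentences))  # 2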

class SciSpacySentenceSplitter(SpacySentenceSplitter):
    """
    Convenience class to instantiate :class:`SpacySentenceSplitter` with Spacy model `en_core_sci_sm`
    for sentence splitting and :class:`SciSpacyTokenizer` as tokenizer.
    """

    def __init__(self):
        super(SciSpacySentenceSplitter, self).__init__("en_core_sci_sm", SciSpacyTokenizer())

class TagSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` which assumes that there is a special tag within
    the text that is used to mark sentence boundaries.
    """

    def __init__(self, tag: str, tokenizer: Tokenizer = SegtokTokenizer()):
        super(TagSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer
        self.tag = tag

    def split(self, text: str) -> List[Sentence]:
        plain_sentences = text.split(self.tag)

        sentences = []
        last_offset = 0

        for sentence in plain_sentences:
            if len(sentence.strip()) > 0:
                sentences += [
                    Sentence(
                        text=sentence,
                        use_tokenizer=self._tokenizer,
                        start_position=last_offset
                    )
                ]

            # advance the offset even for skipped (empty) segments so that
            # subsequent sentences keep their correct position in the text
            last_offset += len(sentence) + len(self.tag)

        return sentences

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.tag
            + "_"
            + self._tokenizer.name
        )
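# Usage sketch (illustrative, not part of the module): the tag string "[SEP]" is an
# assumption; any marker that does not otherwise occur in the text works.
#
#   splitter = TagSentenceSplitter(tag="[SEP]")
#   sentences = splitter.split("First sentence.[SEP]Second sentence.")
#   print([s.start_pos for s in sentences])  # [0, 20] - offsets into the original string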

class NewlineSentenceSplitter(TagSentenceSplitter):
    r"""
    Convenience class to instantiate :class:`TagSentenceSplitter` with the newline character ("\n")
    as sentence boundary marker.
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(NewlineSentenceSplitter, self).__init__(tag="\n", tokenizer=tokenizer)

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self._tokenizer.name
        )

class NoSentenceSplitter(SentenceSplitter):
    """
    Implementation of :class:`SentenceSplitter` which treats the complete text as one sentence.
    """

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
        super(NoSentenceSplitter, self).__init__()
        self._tokenizer = tokenizer

    def split(self, text: str) -> List[Sentence]:
        return [
            Sentence(
                text=text,
                use_tokenizer=self._tokenizer,
                start_position=0
            )
        ]

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, value: Tokenizer):
        self._tokenizer = value

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self._tokenizer.name
        )
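# Usage sketch (illustrative, not part of the module): NewlineSentenceSplitter and
# NoSentenceSplitter cover the two pre-segmented cases - one sentence per line versus
# the whole text as a single sentence. The input strings are assumptions.
#
#   per_line = NewlineSentenceSplitter().split("First line.\nSecond line.")
#   whole = NoSentenceSplitter().split("Everything stays together. Even this.")
#   print(len(per_line), len(whole))  # 2 1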