Coverage for /home/ubuntu/Documents/Research/mut_p1/flair/flair/datasets/text_text.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

354 statements  

1import logging 

2import os 

3from pathlib import Path 

4from typing import List, Union 

5 

6import flair 

7from flair.data import ( 

8 Sentence, 

9 Corpus, 

10 FlairDataset, 

11 DataPair, 

12) 

13from flair.datasets.base import find_train_dev_test_files 

14from flair.file_utils import cached_path, unpack_file, unzip_file 

15 

16log = logging.getLogger("flair") 

17 

18 

class ParallelTextCorpus(Corpus):
    def __init__(
        self,
        source_file: Union[str, Path],
        target_file: Union[str, Path],
        name: str = None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from a pair of line-aligned parallel text files
        (line i of source_file corresponds to line i of target_file).

        Fixed docstring: the previous one was copied from a CSV column corpus
        and described parameters this constructor does not have.

        :param source_file: path to the file with source-language sentences, one per line
        :param target_file: path to the file with target-language sentences, one per line
        :param name: name of the corpus
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: if set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: if set, shortens sentences to this maximum number of characters
        :param in_memory: if True, keeps the dataset fully in memory
        :return: a Corpus with annotated train, dev and test data
        """
        # the parallel files form the train split; dev/test may be sampled
        # from it by the Corpus superclass (via **corpusargs)
        train: FlairDataset = ParallelTextDataset(
            source_file,
            target_file,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
        )

        self.in_memory = in_memory

        super(ParallelTextCorpus, self).__init__(train, name=name, **corpusargs)

    def is_in_memory(self) -> bool:
        # True when DataPairs are held in RAM rather than rebuilt on access
        return self.in_memory

55 

56 

class OpusParallelCorpus(ParallelTextCorpus):
    def __init__(
        self,
        dataset: str,
        l1: str,
        l2: str,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Instantiates a Parallel Corpus from OPUS (http://opus.nlpl.eu/)
        :param dataset: Name of the dataset (one of "tatoeba", "subtitles")
        :param l1: Language code of first language in pair ("en", "de", etc.)
        :param l2: Language code of second language in pair ("en", "de", etc.)
        :param use_tokenizer: Whether or not to use in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, keeps dataset fully in memory
        :raises ValueError: if `dataset` is not one of the supported datasets
        """
        # OPUS publishes each language pair under alphabetically sorted codes
        if l1 > l2:
            l1, l2 = l2, l1

        # check if dataset is supported
        supported_datasets = ["tatoeba", "subtitles"]
        if dataset not in supported_datasets:
            # BUGFIX: this used to only log the error and fall through,
            # crashing below with a NameError on the undefined `link`
            raise ValueError(f"Dataset must be one of: {supported_datasets}")

        # set download link and names of the extracted per-language files
        if dataset == "tatoeba":
            link = f"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/{l1}-{l2}.txt.zip"

            l1_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l1}")
            l2_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l2}")

        if dataset == "subtitles":
            link = f"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/{l1}-{l2}.txt.zip"

            l1_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l1}")
            l2_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l2}")

        # download and unzip in file structure if necessary
        if not l1_file.exists():
            path = cached_path(link, Path("datasets") / dataset / f"{l1}-{l2}")
            unzip_file(path, flair.cache_root / Path("datasets") / dataset / f"{l1}-{l2}")

        # instantiate corpus
        super(OpusParallelCorpus, self).__init__(
            l1_file,
            l2_file,
            name=f"{dataset}-{l1_file}-{l2_file}",
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            **corpusargs,
        )

118 

119 

class ParallelTextDataset(FlairDataset):
    def __init__(
        self,
        path_to_source: Union[str, Path],
        path_to_target: Union[str, Path],
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
    ):
        """
        Dataset of sentence pairs read from two line-aligned text files
        (line i of the source file corresponds to line i of the target file).

        :param path_to_source: file with one source sentence per line
        :param path_to_target: file with one target sentence per line
        :param max_tokens_per_doc: if > 0, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: if > 0, shortens lines to this maximum number of characters
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param in_memory: if True, keeps DataPair objects in memory; otherwise
            only raw strings are kept and pairs are rebuilt on access
        """
        if type(path_to_source) == str:
            path_to_source: Path = Path(path_to_source)
        if type(path_to_target) == str:
            path_to_target: Path = Path(path_to_target)

        assert path_to_source.exists()
        assert path_to_target.exists()

        self.in_memory = in_memory

        self.use_tokenizer = use_tokenizer
        self.max_tokens_per_doc = max_tokens_per_doc

        self.total_sentence_count: int = 0

        if self.in_memory:
            self.bi_sentences: List[DataPair] = []
        else:
            self.source_lines: List[str] = []
            self.target_lines: List[str] = []

        with open(str(path_to_source), encoding="utf-8") as source_file, open(
            str(path_to_target), encoding="utf-8"
        ) as target_file:

            source_line = source_file.readline()
            target_line = target_file.readline()

            while source_line and target_line:

                # BUGFIX: process the pair that was just read *before*
                # advancing the files. Previously the loop read the next
                # lines at the top of the body, so the first sentence pair
                # of the files was silently discarded.

                # skip pairs in which either side is blank
                if source_line.strip() != "" and target_line.strip() != "":

                    if max_chars_per_doc > 0:
                        source_line = source_line[:max_chars_per_doc]
                        target_line = target_line[:max_chars_per_doc]

                    if self.in_memory:
                        bi_sentence = self._make_bi_sentence(source_line, target_line)
                        self.bi_sentences.append(bi_sentence)
                    else:
                        self.source_lines.append(source_line)
                        self.target_lines.append(target_line)

                    self.total_sentence_count += 1

                source_line = source_file.readline()
                target_line = target_file.readline()

    def _make_bi_sentence(self, source_line: str, target_line: str):
        """Tokenize both sides, truncate to max_tokens_per_doc, and pair them."""
        source_sentence = Sentence(source_line, use_tokenizer=self.use_tokenizer)
        target_sentence = Sentence(target_line, use_tokenizer=self.use_tokenizer)

        if self.max_tokens_per_doc > 0:
            source_sentence.tokens = source_sentence.tokens[: self.max_tokens_per_doc]
            target_sentence.tokens = target_sentence.tokens[: self.max_tokens_per_doc]

        return DataPair(source_sentence, target_sentence)

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> DataPair:
        # in-memory mode returns the pre-built pair; otherwise the pair is
        # (re-)tokenized from the stored raw strings on every access
        if self.in_memory:
            return self.bi_sentences[index]
        else:
            return self._make_bi_sentence(
                self.source_lines[index], self.target_lines[index]
            )

    def is_in_memory(self) -> bool:
        return self.in_memory

205 

206 

class DataPairCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        columns: List[int] = [0, 1, 2],
        train_file=None,
        test_file=None,
        dev_file=None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        label_type: str = None,
        autofind_splits=True,
        sample_missing_splits: bool = True,
        skip_first_line: bool = False,
        separator: str = '\t',
        encoding: str = 'utf-8'
    ):
        """
        Corpus for tasks involving pairs of sentences or paragraphs. Each line of a
        data file holds one column for the first sentence/paragraph, one for the
        second, and one for the label, separated by `separator` (default: '\t').

        :param data_folder: base folder with the task data
        :param columns: indices of the columns holding the first sentence, the
            second sentence and the label, in that order (default [0, 1, 2])
        :param train_file: the name of the train file
        :param test_file: the name of the test file; if None, it is sampled from train (when sample_missing_splits is True)
        :param dev_file: the name of the dev file; if None, it is sampled from train (when sample_missing_splits is True)
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: if set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: if set, shortens sentences to this maximum number of characters
        :param in_memory: if True, data is held as flair.data.DataPair objects,
            otherwise as plain strings (needs less space)
        :param label_type: name of the label of the data pairs
        :param autofind_splits: if True, train/test/dev files are automatically identified in data_folder
        :param sample_missing_splits: if True, a missing train/test/dev split is sampled from the available data
        :param skip_first_line: if True, the first line of each data file is ignored
        :param separator: column separator in the data files
        :param encoding: encoding of the data files
        :return: a Corpus with annotated train, dev and test data
        """
        # locate the split files (unless explicitly given)
        dev_file, test_file, train_file = find_train_dev_test_files(
            data_folder, dev_file, test_file, train_file, autofind_splits=autofind_splits
        )

        def build_split(split_file) -> FlairDataset:
            # all three splits share the exact same reader configuration
            return DataPairDataset(
                split_file,
                columns=columns,
                use_tokenizer=use_tokenizer,
                max_tokens_per_doc=max_tokens_per_doc,
                max_chars_per_doc=max_chars_per_doc,
                in_memory=in_memory,
                label_type=label_type,
                skip_first_line=skip_first_line,
                separator=separator,
                encoding=encoding
            )

        # a split is None when its file is missing (it may then be sampled
        # from train by the Corpus superclass)
        train = build_split(train_file) if train_file is not None else None
        test = build_split(test_file) if test_file is not None else None
        dev = build_split(dev_file) if dev_file is not None else None

        super(DataPairCorpus, self).__init__(
            train, dev, test,
            sample_missing_splits=sample_missing_splits,
            name=str(data_folder)
        )

298 

299 

class DataPairDataset(FlairDataset):
    def __init__(
        self,
        path_to_data: Union[str, Path],
        columns: List[int] = [0, 1, 2],
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        label_type: str = None,
        skip_first_line: bool = False,
        separator: str = '\t',
        encoding: str = 'utf-8',
        label: bool = True
    ):
        """
        Dataset for pairs of sentences/paragraphs read from a column-formatted file,
        where each line holds the first sentence/paragraph, the second one and the
        label, separated by e.g. '\t' (as in the GLUE RTE dataset,
        https://gluebenchmark.com/tasks). Each line becomes one flair.data.DataPair.

        :param path_to_data: path to the data file
        :param columns: indices of the columns holding the first sentence, the
            second sentence and the label, in that order (default [0, 1, 2])
        :param max_tokens_per_doc: if set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: if set, shortens sentences to this maximum number of characters
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param in_memory: if True, data is held as flair.data.DataPair objects,
            otherwise as plain strings (needs less space)
        :param label_type: name of the label of the data pairs
        :param skip_first_line: if True, the first line of the data file is ignored
        :param separator: column separator in the data file
        :param encoding: encoding of the data file
        :param label: if False, the dataset expects unlabeled data
        """
        if type(path_to_data) == str:
            path_to_data: Path = Path(path_to_data)

        # stop if file does not exist
        assert path_to_data.exists()

        self.in_memory = in_memory
        self.use_tokenizer = use_tokenizer
        self.max_tokens_per_doc = max_tokens_per_doc
        self.label = label
        self.label_type = label_type
        self.total_data_count: int = 0

        if self.in_memory:
            self.data_pairs: List[DataPair] = []
        else:
            self.first_elements: List[str] = []
            self.second_elements: List[str] = []
            self.labels: List[str] = []

        with open(str(path_to_data), encoding=encoding) as data_file:

            # drop the header row if requested
            if skip_first_line:
                data_file.readline()

            for row in data_file:
                fields = row.strip().split(separator)

                first = fields[columns[0]]
                second = fields[columns[1]]
                pair_label = fields[columns[2]] if self.label else None

                if max_chars_per_doc > 0:
                    first = first[:max_chars_per_doc]
                    second = second[:max_chars_per_doc]

                if self.in_memory:
                    self.data_pairs.append(
                        self._make_data_pair(first, second, pair_label)
                    )
                else:
                    self.first_elements.append(first)
                    self.second_elements.append(second)
                    if self.label:
                        self.labels.append(pair_label)

                self.total_data_count += 1

    def _make_data_pair(self, first_element: str, second_element: str, label: str = None):
        """Tokenize both elements, truncate them, and wrap them in a DataPair."""
        sentences = []
        for text in (first_element, second_element):
            sentence = Sentence(text, use_tokenizer=self.use_tokenizer)
            if self.max_tokens_per_doc > 0:
                sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
            sentences.append(sentence)

        data_pair = DataPair(sentences[0], sentences[1])

        # a None/empty label means "unlabeled"
        if label:
            data_pair.add_label(typename=self.label_type, value=label)

        return data_pair

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_data_count

    def __getitem__(self, index: int = 0) -> DataPair:
        # in-memory mode returns the pre-built DataPair; otherwise one is
        # rebuilt from the stored raw strings on every access
        if self.in_memory:
            return self.data_pairs[index]
        pair_label = self.labels[index] if self.label else None
        return self._make_data_pair(
            self.first_elements[index], self.second_elements[index], pair_label
        )

433 

434 

class GLUE_RTE(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the GLUE Recognizing Textual Entailment (RTE)
        data (https://gluebenchmark.com/tasks). In addition to the Corpus splits,
        an eval_dataset attribute holds the unlabeled GLUE test file, which can be
        used to produce predictions for submission to the GLUE benchmark.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # presence of the train file marks the data as already downloaded
        data_file = data_folder / "RTE/train.tsv"

        # if data is not downloaded yet, download and unzip it
        if not data_file.is_file():
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "RTE/test.tsv"), str(data_folder / "RTE/eval_dataset.tsv"))

        # columns: sentence1 is col 1, sentence2 is col 2, label is col 3
        super(GLUE_RTE, self).__init__(
            data_folder / "RTE",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled GLUE test split (label=False), kept outside the Corpus splits
        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file of the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')).
        The resulting file is called RTE.tsv and is in the format required for
        submission to the GLUE benchmark.
        """
        # FIX: the docstring above used to be a stray class-level string literal
        # that was never attached to this method.
        if type(folder_path) == str:
            folder_path = Path(folder_path)
        folder_path = folder_path / 'RTE.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                tsv_file.write(str(index) + '\t' + datapoint.get_labels('textual_entailment')[0].value + '\n')

521 

522 

class GLUE_MNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        evaluate_on_matched: bool = True,
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Multi-Genre Natural Language Inference Corpus (MNLI)
        from the GLUE benchmark (https://gluebenchmark.com/tasks). Entailment annotations are:
        entailment, contradiction, neutral. This corpus includes two dev sets (matched/mismatched)
        and two unlabeled test sets: eval_dataset_matched, eval_dataset_mismatched.

        :param label_type: name given to the entailment label of each pair
        :param evaluate_on_matched: if True, the "matched" dev/eval files are used, otherwise the "mismatched" ones
        :param base_path: optional data root; defaults to flair.cache_root / "datasets"
        :param max_tokens_per_doc: if set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: if set, shortens sentences to this maximum number of characters
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param in_memory: if True, keeps DataPair objects fully in memory
        :param sample_missing_splits: if True, missing splits are sampled from the available data
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # presence of the train file marks the data as already downloaded
        data_file = data_folder / "MNLI/train.tsv"

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # reorder dev datasets to have same columns as in train set: 8, 9, and 11
            # dev sets include 5 different annotations but we will only keep the gold label
            for dev_filename in ["dev_matched.tsv", "dev_mismatched.tsv"]:

                # move the original file aside, then rewrite it under its old
                # name with the reduced/reordered columns
                temp_file = str("temp_" + dev_filename)
                os.rename(str(data_folder / "MNLI" / dev_filename),
                          str(data_folder / "MNLI" / temp_file))

                # NOTE(review): out_file is opened in append mode; this only works
                # because the rename above guarantees the target does not exist yet
                with open(data_folder / "MNLI" / dev_filename, "a") as out_file, open(
                        data_folder / "MNLI" / temp_file) as in_file:
                    for line in in_file:
                        fields = line.split('\t')
                        # keep columns 0..10 and append the gold label (column 15);
                        # fields[15] carries the trailing newline when it is last
                        reordered_columns = '\t'.join(fields[column_id] for column_id in range(11))
                        reordered_columns += '\t' + fields[15]
                        out_file.write(reordered_columns)
                os.remove(str(data_folder / "MNLI" / temp_file))

            # rename test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "MNLI/test_matched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_matched.tsv"))
            os.rename(str(data_folder / "MNLI/test_mismatched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_mismatched.tsv"))

        # pick the matched or mismatched variant for dev and eval files
        matched_suffix = "matched" if evaluate_on_matched else "mismatched"

        dev_dataset = "dev_" + matched_suffix + ".tsv"
        eval_dataset = "eval_dataset_" + matched_suffix + ".tsv"

        self.evaluate_on_matched = evaluate_on_matched

        super(GLUE_MNLI, self).__init__(
            data_folder / "MNLI",
            train_file=data_file,
            dev_file=dev_dataset,
            label_type=label_type,
            columns=[8, 9, 11],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled GLUE test split (label=False), used for submission files
        self.eval_dataset = DataPairDataset(
            data_folder / "MNLI" / eval_dataset,
            columns=[8, 9, 11],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    # NOTE(review): the string below is a bare class-level literal; it is never
    # attached as the docstring of tsv_from_eval_dataset
    """
    This function creates a tsv file of the predictions of the eval_dataset (after calling 
    classifier.predict(corpus.eval_dataset, label_name='textual_entailment')). The resulting file 
    is called MNLI-m.tsv or MNLI-mm.tsv and is in the format required for the Glue Benchmark.
    """

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        # writes one prediction per line in the GLUE submission format,
        # named after the matched/mismatched variant this corpus was built on
        if type(folder_path) == str:
            folder_path = Path(folder_path)
        glue_eval_tsv = "MNLI-m.tsv" if self.evaluate_on_matched else "MNLI-mm.tsv"
        folder_path = folder_path / glue_eval_tsv

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('textual_entailment')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')

642 

643 

class GLUE_MRPC(DataPairCorpus):
    def __init__(
        self,
        label_type="paraphrase",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Microsoft Research Paraphrase Corpus (MRPC)
        from the GLUE benchmark (https://gluebenchmark.com/tasks). MRPC includes
        annotated train and test sets; the dev set is sampled each time the corpus
        is created.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # presence of the train file marks the data as already downloaded
        data_file = data_folder / "MRPC/train.tsv"

        # MRPC is hosted as two plain text files (train/test), not as a zip
        mrpc_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/"

        original_filenames = ["msr_paraphrase_train.txt", "msr_paraphrase_test.txt"]

        # if data is not downloaded yet, download it
        if not data_file.is_file():
            for original_filename in original_filenames:
                cached_path(f"{mrpc_path}{original_filename}",
                            Path("datasets") / dataset_name / "MRPC")

            # normalize file names to the train.tsv/test.tsv convention
            os.rename(str(data_folder / "MRPC/msr_paraphrase_train.txt"),
                      str(data_folder / "MRPC/train.tsv"))
            os.rename(str(data_folder / "MRPC/msr_paraphrase_test.txt"),
                      str(data_folder / "MRPC/test.tsv"))

        # columns: sentence1 is col 3, sentence2 is col 4, label is col 0
        super(GLUE_MRPC, self).__init__(
            data_folder / "MRPC",
            label_type=label_type,
            columns=[3, 4, 0],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file of the predictions of the test set (after calling
        classifier.predict(corpus.test, label_name='paraphrase')). MRPC has no
        separate unlabeled eval set, so the labeled test split is used. The
        resulting file is called MRPC.tsv and is in the format required for
        submission to the GLUE benchmark.
        """
        # FIX: the docstring above used to be a stray class-level string literal
        # that was never attached to this method.
        if type(folder_path) == str:
            folder_path = Path(folder_path)
        folder_path = folder_path / 'MRPC.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.test):
                label = datapoint.get_labels('paraphrase')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')

719 

720 

class GLUE_QNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Question-answering Natural Language
        Inference dataset (QNLI) from the GLUE benchmark
        (https://gluebenchmark.com/tasks). In addition to the Corpus splits, an
        eval_dataset attribute holds the unlabeled GLUE test file, which can be
        used to produce predictions for submission to the GLUE benchmark.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # presence of the train file marks the data as already downloaded
        data_file = data_folder / "QNLI/train.tsv"

        # if data is not downloaded yet, download and unzip it
        if not data_file.is_file():
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "QNLI/test.tsv"),
                      str(data_folder / "QNLI/eval_dataset.tsv"))

        # columns: question is col 1, sentence is col 2, label is col 3
        super(GLUE_QNLI, self).__init__(
            data_folder / "QNLI",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled GLUE test split (label=False), kept outside the Corpus splits
        self.eval_dataset = DataPairDataset(
            data_folder / "QNLI/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file of the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')).
        The resulting file is called QNLI.tsv and is in the format required for
        submission to the GLUE benchmark.
        """
        # FIX: the docstring above used to be a stray class-level string literal
        # that was never attached to this method.
        if type(folder_path) == str:
            folder_path = Path(folder_path)
        folder_path = folder_path / 'QNLI.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('textual_entailment')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')

810 

811 

class GLUE_QQP(DataPairCorpus):
    def __init__(
        self,
        label_type="paraphrase",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a Quora Question Pairs (QQP) Corpus from the GLUE benchmark
        (https://gluebenchmark.com/tasks). The task is to determine whether a
        pair of questions are semantically equivalent. In addition to the Corpus
        splits, an eval_dataset attribute holds the unlabeled GLUE test file,
        which can be used to produce predictions for submission to the GLUE
        benchmark.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # presence of the train file marks the data as already downloaded
        data_file = data_folder / "QQP/train.tsv"

        # if data is not downloaded yet, download and unzip it
        if not data_file.is_file():
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "QQP/test.tsv"),
                      str(data_folder / "QQP/eval_dataset.tsv"))

        # train/dev columns: question1 is col 3, question2 is col 4, label is col 5
        super(GLUE_QQP, self).__init__(
            data_folder / "QQP",
            label_type=label_type,
            columns=[3, 4, 5],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # the unlabeled test file has a different layout than train/dev,
        # hence the different column indices here
        self.eval_dataset = DataPairDataset(
            data_folder / "QQP/eval_dataset.tsv",
            columns=[1, 2, 0],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file of the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='paraphrase')).
        The resulting file is called QQP.tsv and is in the format required for
        submission to the GLUE benchmark.
        """
        # FIX: the docstring above used to be a stray class-level string literal
        # that was never attached to this method.
        if type(folder_path) == str:
            folder_path = Path(folder_path)
        folder_path = folder_path / 'QQP.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('paraphrase')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')

901 

902 

class GLUE_WNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a Winograd Schema Challenge Corpus formatted as a Natural Language Inference task (WNLI).

        The task is to predict if the sentence with the pronoun substituted is entailed
        by the original sentence. In addition to the Corpus, an eval_dataset is built
        from the (unlabeled) test file of the GLUE data, so models can be evaluated on
        the GLUE WNLI task.

        :param label_type: name of the label to predict (default 'entailment')
        :param base_path: folder in which the dataset is (or will be) stored; defaults to the flair cache root
        :param max_tokens_per_doc: truncate documents to this many tokens (-1 keeps all)
        :param max_chars_per_doc: truncate documents to this many characters (-1 keeps all)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: if True, keep the dataset in memory
        :param sample_missing_splits: if True, sample missing dev/test splits from train
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "WNLI/train.tsv"

        # if data is not downloaded yet, download and unpack it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "WNLI/test.tsv"),
                      str(data_folder / "WNLI/eval_dataset.tsv"))

        super(GLUE_WNLI, self).__init__(
            data_folder / "WNLI",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled GLUE test split, kept separate from the Corpus splits
        self.eval_dataset = DataPairDataset(
            data_folder / "WNLI/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Create a tsv file of the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='entailment')). The
        resulting file is called WNLI.tsv and is in the format required for
        submission to the GLUE Benchmark.

        :param folder_path: directory in which the WNLI.tsv file will be created
        """
        # accept both str and Path for convenience
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'WNLI.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                tsv_file.write(str(index) + '\t' + datapoint.get_labels('entailment')[0].value + '\n')

991 

992 

class SUPERGLUE_RTE(DataPairCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the SuperGLUE Recognizing Textual Entailment (RTE)
        data (https://super.gluebenchmark.com/tasks).

        In addition to the Corpus, an eval_dataset is built from the (unlabeled) test
        file of the SuperGLUE data, so models can be evaluated on the SuperGLUE RTE task.

        :param base_path: folder in which the dataset is (or will be) stored; defaults to the flair cache root
        :param max_tokens_per_doc: truncate documents to this many tokens (-1 keeps all)
        :param max_chars_per_doc: truncate documents to this many characters (-1 keeps all)
        :param use_tokenizer: whether to tokenize the sentences
        :param in_memory: if True, keep the dataset in memory
        :param sample_missing_splits: if True, sample missing dev/test splits from train
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "superglue"

        # if no base_path provided take cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "RTE/train.tsv"

        # if data not downloaded yet, download and unpack it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip',
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # the downloaded files have jsonl format, we transform them to tsv
            rte_jsonl_to_tsv(data_folder / "RTE/train.jsonl", remove=True)
            rte_jsonl_to_tsv(data_folder / "RTE/test.jsonl", remove=True, label=False)
            rte_jsonl_to_tsv(data_folder / "RTE/val.jsonl", remove=True)

            # align split names with the corpus convention; the test split has no
            # labels, so it is kept aside as eval_dataset
            os.rename(str(data_folder / "RTE/val.tsv"), str(data_folder / "RTE/dev.tsv"))
            os.rename(str(data_folder / "RTE/test.tsv"), str(data_folder / "RTE/eval_dataset.tsv"))

        super(SUPERGLUE_RTE, self).__init__(
            data_folder / "RTE",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type='textual_entailment',
            sample_missing_splits=sample_missing_splits
        )

        # unlabeled SuperGLUE test split, kept separate from the Corpus splits
        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=False,
            label=False
        )

    def jsonl_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Create a jsonl file of the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')).
        The resulting file is called RTE.jsonl and is in the form required for
        submission to the SuperGLUE Benchmark.

        :param folder_path: directory in which the RTE.jsonl file will be created
        """
        import json

        # accept both str and Path for convenience
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'RTE.jsonl'

        with open(folder_path, mode='w') as jsonl_file:
            for index, datapoint in enumerate(self.eval_dataset):
                entry = {"idx": index, "label": datapoint.get_labels('textual_entailment')[0].value}
                # json.dumps (not str()) so each line is valid JSON with double
                # quotes, as required by the SuperGLUE submission format
                jsonl_file.write(json.dumps(entry) + '\n')

1082 

1083 

# Function to transform JSONL files of the Recognizing Textual Entailment data to tsv
def rte_jsonl_to_tsv(file_path: Union[str, Path], label: bool = True, remove: bool = False, encoding='utf-8'):
    """
    Convert an RTE jsonl file into a tab-separated file next to it.

    Each input line is a JSON object with "premise", "hypothesis" and (if label
    is True) "label" keys; the corresponding output line is
    premise<TAB>hypothesis[<TAB>label].

    :param file_path: path of the .jsonl file to convert
    :param label: if True, the "label" field is written as a third column
    :param remove: if True, the original jsonl file is deleted after conversion
    :param encoding: encoding used for reading and writing
    """
    import json

    # write the converted data next to the source, swapping the extension for .tsv
    tsv_path = os.path.splitext(file_path)[0] + '.tsv'

    with open(file_path, 'r', encoding=encoding) as jsonl_file:
        with open(tsv_path, 'w', encoding=encoding) as tsv_file:
            for line in jsonl_file:
                obj = json.loads(line)
                new_line = obj["premise"] + '\t' + obj["hypothesis"]
                if label:
                    new_line += '\t' + obj["label"]
                new_line += '\n'

                tsv_file.write(new_line)

    # remove json file
    if remove:
        os.remove(file_path)