import json
import logging
import os
from pathlib import Path
from typing import List, Optional, Union

import flair
from flair.data import (
    Sentence,
    Corpus,
    FlairDataset,
    DataPair,
)
from flair.datasets.base import find_train_dev_test_files
from flair.file_utils import cached_path, unpack_file, unzip_file

log = logging.getLogger("flair")


class ParallelTextCorpus(Corpus):
    def __init__(
        self,
        source_file: Union[str, Path],
        target_file: Union[str, Path],
        name: Optional[str] = None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Instantiates a Corpus from two line-aligned parallel text files, one sentence per line.

        :param source_file: path to the file with the source-language sentences
        :param target_file: path to the file with the target-language sentences
        :param name: name of the corpus
        :param use_tokenizer: Whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, keeps the dataset fully in memory
        :return: a Corpus built from the aligned sentence pairs
        """
        train: FlairDataset = ParallelTextDataset(
            source_file,
            target_file,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
        )

        self.in_memory = in_memory

        super(ParallelTextCorpus, self).__init__(train, name=name, **corpusargs)

    def is_in_memory(self) -> bool:
        return self.in_memory
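
# Example usage of ParallelTextCorpus (a minimal sketch; the file names are
# illustrative placeholders, not files shipped with flair):
#
#     corpus = ParallelTextCorpus("train.en", "train.de", name="my-en-de")
#     print(corpus.train[0])  # a DataPair of source and target Sentence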


class OpusParallelCorpus(ParallelTextCorpus):
    def __init__(
        self,
        dataset: str,
        l1: str,
        l2: str,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Instantiates a Parallel Corpus from OPUS (http://opus.nlpl.eu/)

        :param dataset: Name of the dataset (one of "tatoeba", "subtitles")
        :param l1: Language code of first language in pair ("en", "de", etc.)
        :param l2: Language code of second language in pair ("en", "de", etc.)
        :param use_tokenizer: Whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, keeps dataset fully in memory
        """
        # OPUS expects the language codes in alphabetical order
        if l1 > l2:
            l1, l2 = l2, l1

        # check if the dataset is supported
        supported_datasets = ["tatoeba", "subtitles"]
        if dataset not in supported_datasets:
            raise ValueError(f"Dataset must be one of: {supported_datasets}")

        # set download link and file names
        if dataset == "tatoeba":
            link = f"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/{l1}-{l2}.txt.zip"

            l1_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l1}")
            l2_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l2}")

        if dataset == "subtitles":
            link = f"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/{l1}-{l2}.txt.zip"

            l1_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l1}")
            l2_file = (flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l2}")

        # download and unzip into the file structure if necessary
        if not l1_file.exists():
            path = cached_path(link, Path("datasets") / dataset / f"{l1}-{l2}")
            unzip_file(path, flair.cache_root / Path("datasets") / dataset / f"{l1}-{l2}")

        # instantiate corpus
        super(OpusParallelCorpus, self).__init__(
            l1_file,
            l2_file,
            name=f"{dataset}-{l1_file}-{l2_file}",
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            **corpusargs,
        )
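
# Example usage of OpusParallelCorpus (a sketch; on first use this downloads
# the Tatoeba English-German pairs from OPUS into the flair cache):
#
#     corpus = OpusParallelCorpus("tatoeba", "en", "de")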


class ParallelTextDataset(FlairDataset):
    def __init__(
        self,
        path_to_source: Union[str, Path],
        path_to_target: Union[str, Path],
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
    ):
        if isinstance(path_to_source, str):
            path_to_source = Path(path_to_source)
        if isinstance(path_to_target, str):
            path_to_target = Path(path_to_target)

        assert path_to_source.exists()
        assert path_to_target.exists()

        self.in_memory = in_memory

        self.use_tokenizer = use_tokenizer
        self.max_tokens_per_doc = max_tokens_per_doc

        self.total_sentence_count: int = 0

        if self.in_memory:
            self.bi_sentences: List[DataPair] = []
        else:
            self.source_lines: List[str] = []
            self.target_lines: List[str] = []

        with open(str(path_to_source), encoding="utf-8") as source_file, open(
            str(path_to_target), encoding="utf-8"
        ) as target_file:

            # read both files in lockstep; iteration stops at the end of the shorter file
            for source_line, target_line in zip(source_file, target_file):

                # skip pairs in which either side is empty
                if source_line.strip() == "" or target_line.strip() == "":
                    continue

                if max_chars_per_doc > 0:
                    source_line = source_line[:max_chars_per_doc]
                    target_line = target_line[:max_chars_per_doc]

                if self.in_memory:
                    bi_sentence = self._make_bi_sentence(source_line, target_line)
                    self.bi_sentences.append(bi_sentence)
                else:
                    self.source_lines.append(source_line)
                    self.target_lines.append(target_line)

                self.total_sentence_count += 1

    def _make_bi_sentence(self, source_line: str, target_line: str):

        source_sentence = Sentence(source_line, use_tokenizer=self.use_tokenizer)
        target_sentence = Sentence(target_line, use_tokenizer=self.use_tokenizer)

        if self.max_tokens_per_doc > 0:
            source_sentence.tokens = source_sentence.tokens[: self.max_tokens_per_doc]
            target_sentence.tokens = target_sentence.tokens[: self.max_tokens_per_doc]

        return DataPair(source_sentence, target_sentence)

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> DataPair:
        if self.in_memory:
            return self.bi_sentences[index]
        else:
            return self._make_bi_sentence(
                self.source_lines[index], self.target_lines[index]
            )

    def is_in_memory(self) -> bool:
        return self.in_memory
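
# Example usage of ParallelTextDataset (a sketch; paths are illustrative).
# With in_memory=False, raw lines are stored and each DataPair is built
# lazily on access in __getitem__:
#
#     dataset = ParallelTextDataset("train.en", "train.de", in_memory=False)
#     bi_sentence = dataset[0]  # DataPair tokenized on access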


class DataPairCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        columns: List[int] = [0, 1, 2],
        train_file=None,
        test_file=None,
        dev_file=None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
        label_type: Optional[str] = None,
        autofind_splits=True,
        sample_missing_splits: bool = True,
        skip_first_line: bool = False,
        separator: str = '\t',
        encoding: str = 'utf-8'
    ):
        """
        Corpus for tasks involving pairs of sentences or paragraphs. The data files are expected to be in column
        format, where each line has a column for the first sentence/paragraph, the second sentence/paragraph and
        the label, respectively. The columns must be separated by a given separator (default: '\t').

        :param data_folder: base folder with the task data
        :param columns: list that indicates the columns for the first sentence (first entry in the list),
            the second sentence (second entry) and the label (last entry); default = [0, 1, 2]
        :param train_file: the name of the train file
        :param test_file: the name of the test file, if None, test data is sampled from train (if sample_missing_splits is true)
        :param dev_file: the name of the dev file, if None, dev data is sampled from train (if sample_missing_splits is true)
        :param use_tokenizer: Whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, data will be saved as a list of flair.data.DataPair objects, otherwise as lists of plain strings, which need less space
        :param label_type: Name of the label of the data pairs
        :param autofind_splits: If True, train/test/dev files will be automatically identified in the given data_folder
        :param sample_missing_splits: If True, a missing train/test/dev file will be sampled from the available data
        :param skip_first_line: If True, the first line of the data files will be ignored
        :param separator: Separator between columns in the data files
        :param encoding: Encoding of the data files
        :return: a Corpus with annotated train, dev and test data
        """
        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits=autofind_splits)

        # create a DataPairDataset for each of the train, test and dev files, if they are given
        train: FlairDataset = DataPairDataset(
            train_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if train_file is not None else None

        test: FlairDataset = DataPairDataset(
            test_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if test_file is not None else None

        dev: FlairDataset = DataPairDataset(
            dev_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if dev_file is not None else None

        super(DataPairCorpus, self).__init__(train, dev, test,
                                             sample_missing_splits=sample_missing_splits,
                                             name=str(data_folder))
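
# Example usage of DataPairCorpus (a sketch, assuming a folder with
# tab-separated files in which column 0 holds the first sentence, column 1
# the second sentence and column 2 the label):
#
#     corpus = DataPairCorpus("path/to/data", columns=[0, 1, 2], label_type="entailment")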


class DataPairDataset(FlairDataset):
    def __init__(
        self,
        path_to_data: Union[str, Path],
        columns: List[int] = [0, 1, 2],
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        label_type: Optional[str] = None,
        skip_first_line: bool = False,
        separator: str = '\t',
        encoding: str = 'utf-8',
        label: bool = True
    ):
        """
        Creates a Dataset for pairs of sentences/paragraphs. The file needs to be in column format,
        where each line has a column for the first sentence/paragraph, the second sentence/paragraph and the label,
        separated by e.g. '\t' (just like in the GLUE RTE dataset, https://gluebenchmark.com/tasks).
        For each data pair we create a flair.data.DataPair object.

        :param path_to_data: path to the data file
        :param columns: list of integers that indicate the respective columns. The first entry is the column
            for the first sentence, the second for the second sentence and the third for the label. Default: [0, 1, 2]
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param use_tokenizer: Whether or not to use the in-built tokenizer
        :param in_memory: If True, data will be saved as a list of flair.data.DataPair objects, otherwise as lists of plain strings, which need less space
        :param label_type: Name of the label of the data pairs
        :param skip_first_line: If True, the first line of the data file will be ignored
        :param separator: Separator between columns in the data file
        :param encoding: Encoding of the data file
        :param label: If False, the dataset expects unlabeled data
        """
        if isinstance(path_to_data, str):
            path_to_data = Path(path_to_data)

        # stop if the file does not exist
        assert path_to_data.exists()

        self.in_memory = in_memory
        self.use_tokenizer = use_tokenizer
        self.max_tokens_per_doc = max_tokens_per_doc
        self.label = label
        self.label_type = label_type

        self.total_data_count: int = 0

        if self.in_memory:
            self.data_pairs: List[DataPair] = []
        else:
            self.first_elements: List[str] = []
            self.second_elements: List[str] = []
            self.labels: List[str] = []

        with open(str(path_to_data), encoding=encoding) as source_file:

            source_line = source_file.readline()

            # discard the header line if requested
            if skip_first_line:
                source_line = source_file.readline()

            while source_line:

                source_line_list = source_line.strip().split(separator)

                first_element = source_line_list[columns[0]]
                second_element = source_line_list[columns[1]]

                if self.label:
                    pair_label = source_line_list[columns[2]]
                else:
                    pair_label = None

                if max_chars_per_doc > 0:
                    first_element = first_element[:max_chars_per_doc]
                    second_element = second_element[:max_chars_per_doc]

                if self.in_memory:
                    data_pair = self._make_data_pair(first_element, second_element, pair_label)
                    self.data_pairs.append(data_pair)
                else:
                    self.first_elements.append(first_element)
                    self.second_elements.append(second_element)
                    if self.label:
                        self.labels.append(pair_label)

                self.total_data_count += 1

                source_line = source_file.readline()

    # create a DataPair object from strings
    def _make_data_pair(self, first_element: str, second_element: str, label: Optional[str] = None):

        first_sentence = Sentence(first_element, use_tokenizer=self.use_tokenizer)
        second_sentence = Sentence(second_element, use_tokenizer=self.use_tokenizer)

        if self.max_tokens_per_doc > 0:
            first_sentence.tokens = first_sentence.tokens[: self.max_tokens_per_doc]
            second_sentence.tokens = second_sentence.tokens[: self.max_tokens_per_doc]

        data_pair = DataPair(first_sentence, second_sentence)

        if label:
            data_pair.add_label(typename=self.label_type, value=label)

        return data_pair

    def is_in_memory(self) -> bool:
        return self.in_memory

    def __len__(self):
        return self.total_data_count

    # if in_memory is True we return a DataPair directly, otherwise we create one from the stored strings
    def __getitem__(self, index: int = 0) -> DataPair:
        if self.in_memory:
            return self.data_pairs[index]
        elif self.label:
            return self._make_data_pair(
                self.first_elements[index], self.second_elements[index], self.labels[index]
            )
        else:
            return self._make_data_pair(
                self.first_elements[index], self.second_elements[index]
            )
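
# Example usage of DataPairDataset (a sketch; the file name is an illustrative
# placeholder):
#
#     dataset = DataPairDataset("dev.tsv", columns=[0, 1, 2], label_type="entailment")
#     pair = dataset[0]  # DataPair carrying the label read from column 2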


class GLUE_RTE(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the GLUE Recognizing Textual Entailment (RTE) data (https://gluebenchmark.com/tasks).
        In addition to the Corpus, this class provides an eval_dataset containing the test file of the GLUE data.
        This file contains unlabeled test data to evaluate models on the GLUE RTE task.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "RTE/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename the test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "RTE/test.tsv"), str(data_folder / "RTE/eval_dataset.tsv"))

        super(GLUE_RTE, self).__init__(
            data_folder / "RTE",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')).
        The resulting file is called RTE.tsv and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'RTE.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                tsv_file.write(str(index) + '\t' + datapoint.get_labels('textual_entailment')[0].value + '\n')
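
# Example submission workflow for GLUE_RTE (a sketch; `classifier` stands in
# for any trained flair model whose predict() writes labels onto the data):
#
#     corpus = GLUE_RTE()
#     classifier.predict(corpus.eval_dataset, label_name='textual_entailment')
#     corpus.tsv_from_eval_dataset("submission_folder")  # writes RTE.tsv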


class GLUE_MNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        evaluate_on_matched: bool = True,
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Multi-Genre Natural Language Inference Corpus (MNLI)
        from the GLUE benchmark (https://gluebenchmark.com/tasks). Entailment annotations are:
        entailment, contradiction, neutral. This corpus includes two dev sets (matched/mismatched)
        and two unlabeled test sets: eval_dataset_matched and eval_dataset_mismatched.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "MNLI/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # reorder the dev sets so that they have the same relevant columns as the train set: 8, 9 and 11
            # the dev sets include five annotations per pair, but we only keep the gold label
            for dev_filename in ["dev_matched.tsv", "dev_mismatched.tsv"]:

                temp_file = "temp_" + dev_filename
                os.rename(str(data_folder / "MNLI" / dev_filename),
                          str(data_folder / "MNLI" / temp_file))

                with open(data_folder / "MNLI" / dev_filename, "a") as out_file, open(
                        data_folder / "MNLI" / temp_file) as in_file:
                    for line in in_file:
                        fields = line.split('\t')
                        reordered_columns = '\t'.join(fields[column_id] for column_id in range(11))
                        # fields[15] is the gold label; it keeps the trailing newline
                        reordered_columns += '\t' + fields[15]
                        out_file.write(reordered_columns)
                os.remove(str(data_folder / "MNLI" / temp_file))

            # rename the test files to eval_dataset_*, since they have no labels
            os.rename(str(data_folder / "MNLI/test_matched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_matched.tsv"))
            os.rename(str(data_folder / "MNLI/test_mismatched.tsv"),
                      str(data_folder / "MNLI/eval_dataset_mismatched.tsv"))

        matched_suffix = "matched" if evaluate_on_matched else "mismatched"

        dev_dataset = "dev_" + matched_suffix + ".tsv"
        eval_dataset = "eval_dataset_" + matched_suffix + ".tsv"

        self.evaluate_on_matched = evaluate_on_matched

        super(GLUE_MNLI, self).__init__(
            data_folder / "MNLI",
            train_file=data_file,
            dev_file=dev_dataset,
            label_type=label_type,
            columns=[8, 9, 11],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        self.eval_dataset = DataPairDataset(
            data_folder / "MNLI" / eval_dataset,
            columns=[8, 9, 11],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')). The resulting file
        is called MNLI-m.tsv or MNLI-mm.tsv and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        glue_eval_tsv = "MNLI-m.tsv" if self.evaluate_on_matched else "MNLI-mm.tsv"
        folder_path = folder_path / glue_eval_tsv

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('textual_entailment')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')
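
# Example usage of GLUE_MNLI (a sketch): evaluate_on_matched selects which of
# the two genre splits is used for the dev and eval sets:
#
#     corpus = GLUE_MNLI(evaluate_on_matched=False)  # mismatched dev/eval sets
#     # after predicting on corpus.eval_dataset, tsv_from_eval_dataset("out")
#     # writes MNLI-mm.tsv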


class GLUE_MRPC(DataPairCorpus):
    def __init__(
        self,
        label_type="paraphrase",
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Microsoft Research Paraphrase Corpus (MRPC)
        from the GLUE benchmark (https://gluebenchmark.com/tasks). MRPC includes annotated
        train and test sets; a dev set is sampled from train each time the corpus is created.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "MRPC/train.tsv"

        mrpc_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/"
        original_filenames = ["msr_paraphrase_train.txt", "msr_paraphrase_test.txt"]

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # download the train and test sets
            for original_filename in original_filenames:
                cached_path(f"{mrpc_path}{original_filename}",
                            Path("datasets") / dataset_name / "MRPC")

            os.rename(str(data_folder / "MRPC/msr_paraphrase_train.txt"),
                      str(data_folder / "MRPC/train.tsv"))
            os.rename(str(data_folder / "MRPC/msr_paraphrase_test.txt"),
                      str(data_folder / "MRPC/test.tsv"))

        super(GLUE_MRPC, self).__init__(
            data_folder / "MRPC",
            label_type=label_type,
            columns=[3, 4, 0],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions made on the test set (after calling
        classifier.predict(corpus.test, label_name='paraphrase')). The dataset used
        for evaluation is the same as the test set. The resulting file is called MRPC.tsv
        and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'MRPC.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.test):
                label = datapoint.get_labels('paraphrase')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')


class GLUE_QNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the Question-answering Natural Language Inference dataset
        (QNLI) from the GLUE benchmark (https://gluebenchmark.com/tasks).
        In addition to the Corpus, this class provides an eval_dataset containing the test file of the GLUE data.
        This file contains unlabeled test data to evaluate models on the GLUE QNLI task.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "QNLI/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename the test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "QNLI/test.tsv"),
                      str(data_folder / "QNLI/eval_dataset.tsv"))

        super(GLUE_QNLI, self).__init__(
            data_folder / "QNLI",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        self.eval_dataset = DataPairDataset(
            data_folder / "QNLI/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')). The resulting
        file is called QNLI.tsv and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'QNLI.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('textual_entailment')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')


class GLUE_QQP(DataPairCorpus):
    def __init__(
        self,
        label_type="paraphrase",
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a Quora Question Pairs (QQP) Corpus from the GLUE benchmark (https://gluebenchmark.com/tasks).
        The task is to determine whether a pair of questions is semantically equivalent.
        In addition to the Corpus, this class provides an eval_dataset containing the test file of the GLUE data.
        This file contains unlabeled test data to evaluate models on the GLUE QQP task.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "QQP/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename the test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "QQP/test.tsv"),
                      str(data_folder / "QQP/eval_dataset.tsv"))

        super(GLUE_QQP, self).__init__(
            data_folder / "QQP",
            label_type=label_type,
            columns=[3, 4, 5],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        # the eval (test) file has a different column layout than the train file
        self.eval_dataset = DataPairDataset(
            data_folder / "QQP/eval_dataset.tsv",
            columns=[1, 2, 0],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='paraphrase')). The resulting file
        is called QQP.tsv and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'QQP.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                label = datapoint.get_labels('paraphrase')[0].value
                tsv_file.write(str(index) + '\t' + label + '\n')


class GLUE_WNLI(DataPairCorpus):
    def __init__(
        self,
        label_type="entailment",
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a Winograd Schema Challenge Corpus formatted as a Natural Language Inference task (WNLI).
        The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence.
        In addition to the Corpus, this class provides an eval_dataset containing the test file of the GLUE data.
        This file contains unlabeled test data to evaluate models on the GLUE WNLI task.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "glue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "WNLI/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # rename the test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "WNLI/test.tsv"),
                      str(data_folder / "WNLI/eval_dataset.tsv"))

        super(GLUE_WNLI, self).__init__(
            data_folder / "WNLI",
            label_type=label_type,
            columns=[1, 2, 3],
            skip_first_line=True,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            sample_missing_splits=sample_missing_splits
        )

        self.eval_dataset = DataPairDataset(
            data_folder / "WNLI/eval_dataset.tsv",
            columns=[1, 2, 3],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=True,
            label=False
        )

    def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a tsv file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')). The resulting file
        is called WNLI.tsv and is in the format required for submission to the GLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'WNLI.tsv'

        with open(folder_path, mode='w') as tsv_file:
            tsv_file.write("index\tprediction\n")
            for index, datapoint in enumerate(self.eval_dataset):
                tsv_file.write(str(index) + '\t' + datapoint.get_labels('textual_entailment')[0].value + '\n')


class SUPERGLUE_RTE(DataPairCorpus):
    def __init__(
        self,
        base_path: Optional[Union[str, Path]] = None,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        use_tokenizer=True,
        in_memory: bool = True,
        sample_missing_splits: bool = True
    ):
        """
        Creates a DataPairCorpus for the SuperGLUE Recognizing Textual Entailment (RTE) data (https://super.gluebenchmark.com/tasks).
        In addition to the Corpus, this class provides an eval_dataset containing the test file of the SuperGLUE data.
        This file contains unlabeled test data to evaluate models on the SuperGLUE RTE task.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        dataset_name = "superglue"

        # if no base_path is provided, take the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        data_file = data_folder / "RTE/train.tsv"

        # if the data is not downloaded yet, download it
        if not data_file.is_file():
            # get the zip file
            zipped_data_path = cached_path(
                'https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip',
                Path("datasets") / dataset_name
            )

            unpack_file(
                zipped_data_path,
                data_folder,
                mode="zip",
                keep=False
            )

            # the downloaded files are in jsonl format; transform them to tsv
            rte_jsonl_to_tsv(data_folder / "RTE/train.jsonl", remove=True)
            rte_jsonl_to_tsv(data_folder / "RTE/test.jsonl", remove=True, label=False)
            rte_jsonl_to_tsv(data_folder / "RTE/val.jsonl", remove=True)

            os.rename(str(data_folder / "RTE/val.tsv"), str(data_folder / "RTE/dev.tsv"))
            # rename the test file to eval_dataset, since it has no labels
            os.rename(str(data_folder / "RTE/test.tsv"), str(data_folder / "RTE/eval_dataset.tsv"))

        super(SUPERGLUE_RTE, self).__init__(
            data_folder / "RTE",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type='textual_entailment',
            sample_missing_splits=sample_missing_splits
        )

        self.eval_dataset = DataPairDataset(
            data_folder / "RTE/eval_dataset.tsv",
            columns=[0, 1, 2],
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_first_line=False,
            label=False
        )

    def jsonl_from_eval_dataset(self, folder_path: Union[str, Path]):
        """
        Creates a jsonl file with the predictions of the eval_dataset (after calling
        classifier.predict(corpus.eval_dataset, label_name='textual_entailment')).
        The resulting file is called RTE.jsonl and is in the format required for submission to the SuperGLUE benchmark.
        """
        if isinstance(folder_path, str):
            folder_path = Path(folder_path)
        folder_path = folder_path / 'RTE.jsonl'

        with open(folder_path, mode='w') as jsonl_file:
            for index, datapoint in enumerate(self.eval_dataset):
                entry = {"idx": index, "label": datapoint.get_labels('textual_entailment')[0].value}
                # json.dumps produces valid JSON (double quotes), unlike str(entry)
                jsonl_file.write(json.dumps(entry) + '\n')
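
# Example submission workflow for SUPERGLUE_RTE (a sketch; assumes a
# classifier has already predicted on corpus.eval_dataset with
# label_name='textual_entailment'):
#
#     corpus = SUPERGLUE_RTE()
#     corpus.jsonl_from_eval_dataset("submission_folder")  # writes RTE.jsonl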


# transforms a jsonl file into tsv format for the Recognizing Textual Entailment data
def rte_jsonl_to_tsv(file_path: Union[str, Path], label: bool = True, remove: bool = False, encoding='utf-8'):

    tsv_path = os.path.splitext(file_path)[0] + '.tsv'

    with open(file_path, 'r', encoding=encoding) as jsonl_file, \
            open(tsv_path, 'w', encoding=encoding) as tsv_file:

        for line in jsonl_file:
            obj = json.loads(line)

            new_line = obj["premise"] + '\t' + obj["hypothesis"]
            if label:
                new_line += '\t' + obj["label"]
            new_line += '\n'

            tsv_file.write(new_line)

    # remove the jsonl file
    if remove:
        os.remove(file_path)
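
# For reference, a SuperGLUE RTE jsonl line such as
#     {"premise": "...", "hypothesis": "...", "label": "entailment", "idx": 0}
# is converted by rte_jsonl_to_tsv into the tsv line
#     "...\t...\tentailment\n"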