Coverage for flair/flair/datasets/document_classification.py: 10%
1import csv
2import json
3import os
5from pathlib import Path
6from typing import List, Dict, Union, Callable
8import flair
9from flair.data import (
10 Sentence,
11 Corpus,
12 Token,
13 FlairDataset,
14 Tokenizer, DataPair
15)
16from flair.tokenization import SegtokTokenizer, SpaceTokenizer
17from flair.datasets.base import find_train_dev_test_files
18from flair.file_utils import cached_path, unzip_file, unpack_file
20import logging
21log = logging.getLogger("flair")
24class ClassificationCorpus(Corpus):
25 """
26 A classification corpus from FastText-formatted text files.
27 """
29 def __init__(
30 self,
31 data_folder: Union[str, Path],
32 label_type: str = 'class',
33 train_file=None,
34 test_file=None,
35 dev_file=None,
36 truncate_to_max_tokens: int = -1,
37 truncate_to_max_chars: int = -1,
38 filter_if_longer_than: int = -1,
39 tokenizer: Tokenizer = SegtokTokenizer(),
40 memory_mode: str = "partial",
41 label_name_map: Dict[str, str] = None,
42 skip_labels: List[str] = None,
43 allow_examples_without_labels=False,
44 sample_missing_splits: bool = True,
45 encoding: str = 'utf-8',
46 ):
47 """
48 Instantiates a Corpus from text classification-formatted task data
50 :param data_folder: base folder with the task data
51 :param label_type: name of the label
52 :param train_file: the name of the train file
53 :param test_file: the name of the test file
54 :param dev_file: the name of the dev file, if None, dev data is sampled from train
55 :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens
56 :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
57 :param filter_if_longer_than: If set, filters out documents that are longer than the specified number of tokens.
58 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer
59 :param memory_mode: Set to what degree to keep the corpus in memory ('full', 'partial' or 'disk'). Use 'full'
60 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if
61 even this is too much for your memory, use 'disk'.
62 :param label_name_map: Optionally map label names to different schema.
63 :param allow_examples_without_labels: set to True to allow Sentences without label in the corpus.
64 :param encoding: Default is 'utf-8' but some datasets are in 'latin-1'
65 :return: a Corpus with annotated train, dev and test data
66 """
68 # find train, dev and test files if not specified
69 dev_file, test_file, train_file = \
70 find_train_dev_test_files(data_folder, dev_file, test_file, train_file)
72 train: FlairDataset = ClassificationDataset(
73 train_file,
74 label_type=label_type,
75 tokenizer=tokenizer,
76 truncate_to_max_tokens=truncate_to_max_tokens,
77 truncate_to_max_chars=truncate_to_max_chars,
78 filter_if_longer_than=filter_if_longer_than,
79 memory_mode=memory_mode,
80 label_name_map=label_name_map,
81 skip_labels=skip_labels,
82 allow_examples_without_labels=allow_examples_without_labels,
83 encoding=encoding,
84 )
86 # use test_file to create test split if available
87 test: FlairDataset = ClassificationDataset(
88 test_file,
89 label_type=label_type,
90 tokenizer=tokenizer,
91 truncate_to_max_tokens=truncate_to_max_tokens,
92 truncate_to_max_chars=truncate_to_max_chars,
93 filter_if_longer_than=filter_if_longer_than,
94 memory_mode=memory_mode,
95 label_name_map=label_name_map,
96 skip_labels=skip_labels,
97 allow_examples_without_labels=allow_examples_without_labels,
98 encoding=encoding,
99 ) if test_file is not None else None
101 # use dev_file to create dev split if available
102 dev: FlairDataset = ClassificationDataset(
103 dev_file,
104 label_type=label_type,
105 tokenizer=tokenizer,
106 truncate_to_max_tokens=truncate_to_max_tokens,
107 truncate_to_max_chars=truncate_to_max_chars,
108 filter_if_longer_than=filter_if_longer_than,
109 memory_mode=memory_mode,
110 label_name_map=label_name_map,
111 skip_labels=skip_labels,
112 allow_examples_without_labels=allow_examples_without_labels,
113 encoding=encoding,
114 ) if dev_file is not None else None
116 super(ClassificationCorpus, self).__init__(
117 train, dev, test, name=str(data_folder), sample_missing_splits=sample_missing_splits
118 )
120 log.info(f"Initialized corpus {self.name} (label type name is '{label_type}')")
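# Illustrative usage sketch (not part of the original module). The folder path and label type
# below are hypothetical; the folder is expected to contain FastText-formatted train/dev/test files:
#
#     corpus = ClassificationCorpus(
#         "resources/tasks/my_task",   # contains train.txt, dev.txt, test.txt
#         label_type="topic",
#         memory_mode="partial",
#     )
#     print(corpus)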
123class ClassificationDataset(FlairDataset):
124 """
125 Dataset for classification instantiated from a single FastText-formatted file.
126 """
128 def __init__(
129 self,
130 path_to_file: Union[str, Path],
131 label_type: str,
132 truncate_to_max_tokens=-1,
133 truncate_to_max_chars=-1,
134 filter_if_longer_than: int = -1,
135 tokenizer: Tokenizer = SegtokTokenizer(),
136 memory_mode: str = "partial",
137 label_name_map: Dict[str, str] = None,
138 skip_labels: List[str] = None,
139 allow_examples_without_labels=False,
140 encoding: str = 'utf-8',
141 ):
142 """
143 Reads a data file for text classification. The file should contain one document/text per line.
144 The line should have the following format:
145 __label__<class_name> <text>
146 If you have a multi class task, you can have as many labels as you want at the beginning of the line, e.g.,
147 __label__<class_name_1> __label__<class_name_2> <text>
148 :param path_to_file: the path to the data file
149 :param label_type: name of the label
150 :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens
151 :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
152 :param filter_if_longer_than: If set, filters out documents that are longer than the specified number of tokens.
153 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
154 :param memory_mode: Set to what degree to keep the corpus in memory ('full', 'partial' or 'disk'). Use 'full'
155 if the full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial', and if
156 even this is too much for your memory, use 'disk'.
157 :param label_name_map: Optionally map label names to different schema.
158 :param allow_examples_without_labels: set to True to allow Sentences without label in the Dataset.
159 :param encoding: Default is 'utf-8' but some datasets are in 'latin-1'
160 :return: list of sentences
161 """
162 if type(path_to_file) == str:
163 path_to_file: Path = Path(path_to_file)
165 assert path_to_file.exists()
167 self.label_prefix = "__label__"
168 self.label_type = label_type
170 self.memory_mode = memory_mode
171 self.tokenizer = tokenizer
173 if self.memory_mode == 'full':
174 self.sentences = []
175 if self.memory_mode == 'partial':
176 self.lines = []
177 if self.memory_mode == 'disk':
178 self.indices = []
180 self.total_sentence_count: int = 0
181 self.truncate_to_max_chars = truncate_to_max_chars
182 self.truncate_to_max_tokens = truncate_to_max_tokens
183 self.filter_if_longer_than = filter_if_longer_than
184 self.label_name_map = label_name_map
185 self.allow_examples_without_labels = allow_examples_without_labels
187 self.path_to_file = path_to_file
self.encoding = encoding
189 with open(str(path_to_file), encoding=encoding) as f:
190 line = f.readline()
191 position = 0
192 while line:
193 if ("__label__" not in line and not allow_examples_without_labels) or (" " not in line and "\t" not in line):
194 position = f.tell()
195 line = f.readline()
196 continue
198 if 0 < self.filter_if_longer_than < len(line.split(' ')):
199 position = f.tell()
200 line = f.readline()
201 continue
203 # if data point contains black-listed label, do not use
204 if skip_labels:
205 skip = False
206 for skip_label in skip_labels:
207 if "__label__" + skip_label in line:
208 skip = True
209 if skip:
position = f.tell()
210 line = f.readline()
211 continue
213 if self.memory_mode == 'full':
214 sentence = self._parse_line_to_sentence(
215 line, self.label_prefix, tokenizer
216 )
217 if sentence is not None and len(sentence.tokens) > 0:
218 self.sentences.append(sentence)
219 self.total_sentence_count += 1
221 if self.memory_mode == 'partial' or self.memory_mode == 'disk':
223 # first check if valid sentence
224 words = line.split()
225 l_len = 0
226 label = False
227 for i in range(len(words)):
228 if words[i].startswith(self.label_prefix):
229 l_len += len(words[i]) + 1
230 label = True
231 else:
232 break
233 text = line[l_len:].strip()
235 # if so, add to indices
236 if text and (label or allow_examples_without_labels):
238 if self.memory_mode == 'partial':
239 self.lines.append(line)
240 self.total_sentence_count += 1
242 if self.memory_mode == 'disk':
243 self.indices.append(position)
244 self.total_sentence_count += 1
246 position = f.tell()
247 line = f.readline()
249 def _parse_line_to_sentence(
250 self, line: str, label_prefix: str, tokenizer: Union[Callable[[str], List[Token]], Tokenizer]
251 ):
252 words = line.split()
254 labels = []
255 l_len = 0
257 for i in range(len(words)):
258 if words[i].startswith(label_prefix):
259 l_len += len(words[i]) + 1
260 label = words[i].replace(label_prefix, "")
262 if self.label_name_map and label in self.label_name_map.keys():
263 label = self.label_name_map[label]
265 labels.append(label)
266 else:
267 break
269 text = line[l_len:].strip()
271 if self.truncate_to_max_chars > 0:
272 text = text[: self.truncate_to_max_chars]
274 if text and (labels or self.allow_examples_without_labels):
275 sentence = Sentence(text, use_tokenizer=tokenizer)
277 for label in labels:
278 sentence.add_label(self.label_type, label)
280 if (
281 sentence is not None
282 and 0 < self.truncate_to_max_tokens < len(sentence)
283 ):
284 sentence.tokens = sentence.tokens[: self.truncate_to_max_tokens]
286 return sentence
287 return None
289 def is_in_memory(self) -> bool:
290 if self.memory_mode == 'disk': return False
291 if self.memory_mode == 'partial': return False
292 return True
294 def __len__(self):
295 return self.total_sentence_count
297 def __getitem__(self, index: int = 0) -> Sentence:
299 if self.memory_mode == 'full':
300 return self.sentences[index]
302 if self.memory_mode == 'partial':
303 sentence = self._parse_line_to_sentence(
304 self.lines[index], self.label_prefix, self.tokenizer
305 )
306 return sentence
308 if self.memory_mode == 'disk':
309 with open(str(self.path_to_file), encoding=self.encoding) as file:
310 file.seek(self.indices[index])
311 line = file.readline()
312 sentence = self._parse_line_to_sentence(
313 line, self.label_prefix, self.tokenizer
314 )
315 return sentence
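# A minimal sketch of the expected FastText file format and of using the dataset directly
# (file path and labels are made up for illustration; multiple labels per line are allowed):
#
#     __label__POSITIVE I really enjoyed this film !
#     __label__NEGATIVE __label__LONG The plot never gets going .
#
#     dataset = ClassificationDataset("data/train.txt", label_type="sentiment", memory_mode="disk")
#     print(dataset[0])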
318class CSVClassificationCorpus(Corpus):
319 """
320 Classification corpus instantiated from CSV data files.
321 """
323 def __init__(
324 self,
325 data_folder: Union[str, Path],
326 column_name_map: Dict[int, str],
327 label_type: str,
328 train_file=None,
329 test_file=None,
330 dev_file=None,
331 max_tokens_per_doc=-1,
332 max_chars_per_doc=-1,
333 tokenizer: Tokenizer = SegtokTokenizer(),
334 in_memory: bool = False,
335 skip_header: bool = False,
336 encoding: str = 'utf-8',
337 no_class_label=None,
338 **fmtparams,
339 ):
340 """
341 Instantiates a Corpus for text classification from CSV column formatted data
343 :param data_folder: base folder with the task data
344 :param column_name_map: a column name map that indicates which column is text and which the label(s)
345 :param label_type: name of the label
346 :param train_file: the name of the train file
347 :param test_file: the name of the test file
348 :param dev_file: the name of the dev file, if None, dev data is sampled from train
349 :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
350 :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
351 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer
352 :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
353 :param skip_header: If True, skips first line because it is header
354 :param encoding: Default is 'utf-8' but some datasets are in 'latin-1'
355 :param fmtparams: additional parameters for the CSV file reader
356 :return: a Corpus with annotated train, dev and test data
357 """
359 # find train, dev and test files if not specified
360 dev_file, test_file, train_file = \
361 find_train_dev_test_files(data_folder, dev_file, test_file, train_file)
363 train: FlairDataset = CSVClassificationDataset(
364 train_file,
365 column_name_map,
366 label_type=label_type,
367 tokenizer=tokenizer,
368 max_tokens_per_doc=max_tokens_per_doc,
369 max_chars_per_doc=max_chars_per_doc,
370 in_memory=in_memory,
371 skip_header=skip_header,
372 encoding=encoding,
373 no_class_label=no_class_label,
374 **fmtparams,
375 )
377 test: FlairDataset = CSVClassificationDataset(
378 test_file,
379 column_name_map,
380 label_type=label_type,
381 tokenizer=tokenizer,
382 max_tokens_per_doc=max_tokens_per_doc,
383 max_chars_per_doc=max_chars_per_doc,
384 in_memory=in_memory,
385 skip_header=skip_header,
386 encoding=encoding,
387 no_class_label=no_class_label,
388 **fmtparams,
389 ) if test_file is not None else None
391 dev: FlairDataset = CSVClassificationDataset(
392 dev_file,
393 column_name_map,
394 label_type=label_type,
395 tokenizer=tokenizer,
396 max_tokens_per_doc=max_tokens_per_doc,
397 max_chars_per_doc=max_chars_per_doc,
398 in_memory=in_memory,
399 skip_header=skip_header,
400 encoding=encoding,
401 no_class_label=no_class_label,
402 **fmtparams,
403 ) if dev_file is not None else None
405 super(CSVClassificationCorpus, self).__init__(
406 train, dev, test, name=str(data_folder)
407 )
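# Hedged usage sketch: a CSV file with the label in column 0 and the text in column 1
# (the file layout, delimiter and label type are assumptions, not taken from the code above):
#
#     corpus = CSVClassificationCorpus(
#         "resources/tasks/my_csv_task",
#         column_name_map={0: "label", 1: "text"},
#         label_type="topic",
#         skip_header=True,
#         delimiter=",",   # forwarded to csv.reader via **fmtparams
#     )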
410class CSVClassificationDataset(FlairDataset):
411 """
412 Dataset for text classification from CSV column formatted data.
413 """
415 def __init__(
416 self,
417 path_to_file: Union[str, Path],
418 column_name_map: Dict[int, str],
419 label_type: str,
420 max_tokens_per_doc: int = -1,
421 max_chars_per_doc: int = -1,
422 tokenizer: Tokenizer = SegtokTokenizer(),
423 in_memory: bool = True,
424 skip_header: bool = False,
425 encoding: str = 'utf-8',
426 no_class_label=None,
427 **fmtparams,
428 ):
429 """
430 Instantiates a Dataset for text classification from CSV column formatted data
432 :param path_to_file: path to the file with the CSV data
433 :param column_name_map: a column name map that indicates which column is text and which the label(s)
434 :param label_type: name of the label
435 :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
436 :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
437 :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer
438 :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
439 :param skip_header: If True, skips first line because it is header
440 :param encoding: Most datasets are 'utf-8' but some are 'latin-1'
441 :param fmtparams: additional parameters for the CSV file reader
442 :return: a Dataset of labeled Sentences (or DataPairs)
443 """
445 if type(path_to_file) == str:
446 path_to_file: Path = Path(path_to_file)
448 assert path_to_file.exists()
450 # variables
451 self.path_to_file = path_to_file
452 self.in_memory = in_memory
453 self.tokenizer = tokenizer
454 self.column_name_map = column_name_map
455 self.max_tokens_per_doc = max_tokens_per_doc
456 self.max_chars_per_doc = max_chars_per_doc
457 self.no_class_label = no_class_label
459 self.label_type = label_type
461 # different handling of in_memory data than streaming data
462 if self.in_memory:
463 self.sentences = []
464 else:
465 self.raw_data = []
467 self.total_sentence_count: int = 0
469 # most datasets have the document text in the first column; if not, map the appropriate column(s) to 'text' in column_name_map
470 self.text_columns: List[int] = []
471 self.pair_columns: List[int] = []
472 for column in column_name_map:
473 if column_name_map[column] == "text":
474 self.text_columns.append(column)
475 if column_name_map[column] == "pair":
476 self.pair_columns.append(column)
478 with open(self.path_to_file, encoding=encoding) as csv_file:
480 csv_reader = csv.reader(csv_file, **fmtparams)
482 if skip_header:
483 next(csv_reader, None) # skip the headers
485 for row in csv_reader:
487 # test if format is OK
488 wrong_format = False
489 for text_column in self.text_columns:
490 if text_column >= len(row):
491 wrong_format = True
493 if wrong_format:
494 continue
496 # test if at least one label given
497 has_label = False
498 for column in self.column_name_map:
499 if self.column_name_map[column].startswith("label") and row[column]:
500 has_label = True
501 break
503 if not has_label:
504 continue
506 if self.in_memory:
508 sentence = self._make_labeled_data_point(row)
510 self.sentences.append(sentence)
512 else:
513 self.raw_data.append(row)
515 self.total_sentence_count += 1
517 def _make_labeled_data_point(self, row):
519 # make sentence from text (and filter for length)
520 text = " ".join(
521 [row[text_column] for text_column in self.text_columns]
522 )
524 if self.max_chars_per_doc > 0:
525 text = text[: self.max_chars_per_doc]
527 sentence = Sentence(text, use_tokenizer=self.tokenizer)
529 if 0 < self.max_tokens_per_doc < len(sentence):
530 sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
532 # if a pair column is defined, make a sentence pair object
533 if len(self.pair_columns) > 0:
535 text = " ".join(
536 [row[pair_column] for pair_column in self.pair_columns]
537 )
539 if self.max_chars_per_doc > 0:
540 text = text[: self.max_chars_per_doc]
542 pair = Sentence(text, use_tokenizer=self.tokenizer)
544 if 0 < self.max_tokens_per_doc < len(pair):
545 pair.tokens = pair.tokens[: self.max_tokens_per_doc]
547 data_point = DataPair(first=sentence, second=pair)
549 else:
550 data_point = sentence
552 for column in self.column_name_map:
553 column_value = row[column]
554 if (
555 self.column_name_map[column].startswith("label")
556 and column_value
557 ):
558 if column_value != self.no_class_label:
559 data_point.add_label(self.label_type, column_value)
561 return data_point
563 def is_in_memory(self) -> bool:
564 return self.in_memory
566 def __len__(self):
567 return self.total_sentence_count
569 def __getitem__(self, index: int = 0) -> Sentence:
570 if self.in_memory:
571 return self.sentences[index]
572 else:
573 row = self.raw_data[index]
575 sentence = self._make_labeled_data_point(row)
577 return sentence
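# Sketch of the sentence-pair case (column indices and label type are hypothetical): mapping one
# column to "text" and another to "pair" makes each data point a DataPair instead of a Sentence:
#
#     dataset = CSVClassificationDataset(
#         "data/pairs.tsv",
#         column_name_map={0: "label", 1: "text", 2: "pair"},
#         label_type="entailment",
#         delimiter="\t",
#     )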
580class AMAZON_REVIEWS(ClassificationCorpus):
581 """
582 A very large corpus of Amazon reviews with positivity ratings. Corpus is downloaded from and documented at
583 https://nijianmo.github.io/amazon/index.html. We download the 5-core subset which is still tens of millions of
584 reviews.
585 """
587 # noinspection PyDefaultArgument
588 def __init__(
589 self,
590 split_max: int = 30000,
591 label_name_map: Dict[str, str] = {
592 '1.0': 'NEGATIVE',
593 '2.0': 'NEGATIVE',
594 '3.0': 'NEGATIVE',
595 '4.0': 'POSITIVE',
596 '5.0': 'POSITIVE',
597 },
598 skip_labels=['3.0', '4.0'],
599 fraction_of_5_star_reviews: int = 10,
600 tokenizer: Tokenizer = SegtokTokenizer(),
601 memory_mode='partial',
602 **corpusargs
603 ):
604 """
605 Constructs the corpus object.
607 :param label_name_map: Optionally map label names to a different schema. By default, the 5-star ratings are
608 mapped onto two classes (POSITIVE, NEGATIVE), and 3- and 4-star reviews are skipped.
609 :param split_max: Indicates how many data points from each of the 28 splits are used, so set this higher or
610 lower to increase/decrease corpus size.
611 :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'
612 if full corpus and all embeddings fits into memory for speedups during training. Otherwise use 'partial' and if
613 even this is too much for your memory, use 'disk'.
614 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
615 :param corpusargs: Arguments for ClassificationCorpus
616 """
618 # dataset name includes the split size
619 dataset_name = self.__class__.__name__.lower() + '_' + str(split_max) + '_' + str(fraction_of_5_star_reviews)
621 # default dataset folder is the cache root
622 data_folder = flair.cache_root / "datasets" / dataset_name
624 # download data if necessary
625 if not (data_folder / "train.txt").is_file():
626 # download each of the 28 splits
627 self.download_and_prepare_amazon_product_file(data_folder, "AMAZON_FASHION_5.json.gz", split_max,
628 fraction_of_5_star_reviews)
629 self.download_and_prepare_amazon_product_file(data_folder, "All_Beauty_5.json.gz", split_max,
630 fraction_of_5_star_reviews)
631 self.download_and_prepare_amazon_product_file(data_folder, "Appliances_5.json.gz", split_max,
632 fraction_of_5_star_reviews)
633 self.download_and_prepare_amazon_product_file(data_folder, "Arts_Crafts_and_Sewing_5.json.gz", split_max,
634 fraction_of_5_star_reviews)
637 self.download_and_prepare_amazon_product_file(data_folder, "Automotive_5.json.gz", split_max,
638 fraction_of_5_star_reviews)
639 self.download_and_prepare_amazon_product_file(data_folder, "Books_5.json.gz", split_max,
640 fraction_of_5_star_reviews)
641 self.download_and_prepare_amazon_product_file(data_folder, "CDs_and_Vinyl_5.json.gz", split_max,
642 fraction_of_5_star_reviews)
643 self.download_and_prepare_amazon_product_file(data_folder, "Cell_Phones_and_Accessories_5.json.gz",
644 split_max, fraction_of_5_star_reviews)
645 self.download_and_prepare_amazon_product_file(data_folder, "Clothing_Shoes_and_Jewelry_5.json.gz",
646 split_max, fraction_of_5_star_reviews)
647 self.download_and_prepare_amazon_product_file(data_folder, "Digital_Music_5.json.gz", split_max,
648 fraction_of_5_star_reviews)
649 self.download_and_prepare_amazon_product_file(data_folder, "Electronics_5.json.gz", split_max,
650 fraction_of_5_star_reviews)
651 self.download_and_prepare_amazon_product_file(data_folder, "Gift_Cards_5.json.gz", split_max,
652 fraction_of_5_star_reviews)
653 self.download_and_prepare_amazon_product_file(data_folder, "Grocery_and_Gourmet_Food_5.json.gz", split_max,
654 fraction_of_5_star_reviews)
655 self.download_and_prepare_amazon_product_file(data_folder, "Home_and_Kitchen_5.json.gz", split_max,
656 fraction_of_5_star_reviews)
657 self.download_and_prepare_amazon_product_file(data_folder, "Industrial_and_Scientific_5.json.gz", split_max,
658 fraction_of_5_star_reviews)
659 self.download_and_prepare_amazon_product_file(data_folder, "Kindle_Store_5.json.gz", split_max,
660 fraction_of_5_star_reviews)
661 self.download_and_prepare_amazon_product_file(data_folder, "Luxury_Beauty_5.json.gz", split_max,
662 fraction_of_5_star_reviews)
663 self.download_and_prepare_amazon_product_file(data_folder, "Magazine_Subscriptions_5.json.gz", split_max,
664 fraction_of_5_star_reviews)
665 self.download_and_prepare_amazon_product_file(data_folder, "Movies_and_TV_5.json.gz", split_max,
666 fraction_of_5_star_reviews)
667 self.download_and_prepare_amazon_product_file(data_folder, "Musical_Instruments_5.json.gz", split_max,
668 fraction_of_5_star_reviews)
669 self.download_and_prepare_amazon_product_file(data_folder, "Office_Products_5.json.gz", split_max,
670 fraction_of_5_star_reviews)
671 self.download_and_prepare_amazon_product_file(data_folder, "Patio_Lawn_and_Garden_5.json.gz", split_max,
672 fraction_of_5_star_reviews)
673 self.download_and_prepare_amazon_product_file(data_folder, "Pet_Supplies_5.json.gz", split_max,
674 fraction_of_5_star_reviews)
675 self.download_and_prepare_amazon_product_file(data_folder, "Prime_Pantry_5.json.gz", split_max,
676 fraction_of_5_star_reviews)
677 self.download_and_prepare_amazon_product_file(data_folder, "Software_5.json.gz", split_max,
678 fraction_of_5_star_reviews)
679 self.download_and_prepare_amazon_product_file(data_folder, "Sports_and_Outdoors_5.json.gz", split_max,
680 fraction_of_5_star_reviews)
681 self.download_and_prepare_amazon_product_file(data_folder, "Tools_and_Home_Improvement_5.json.gz",
682 split_max, fraction_of_5_star_reviews)
683 self.download_and_prepare_amazon_product_file(data_folder, "Toys_and_Games_5.json.gz", split_max,
684 fraction_of_5_star_reviews)
685 self.download_and_prepare_amazon_product_file(data_folder, "Video_Games_5.json.gz", split_max,
686 fraction_of_5_star_reviews)
688 super(AMAZON_REVIEWS, self).__init__(
689 data_folder,
690 label_type='sentiment',
691 label_name_map=label_name_map,
692 skip_labels=skip_labels,
693 tokenizer=tokenizer,
694 memory_mode=memory_mode,
695 **corpusargs
696 )
698 def download_and_prepare_amazon_product_file(self, data_folder, part_name, max_data_points=None,
699 fraction_of_5_star_reviews=None):
700 amazon__path = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall"
701 cached_path(f"{amazon__path}/{part_name}", Path("datasets") / 'Amazon_Product_Reviews')
702 import gzip
703 # create dataset directory if necessary
704 if not os.path.exists(data_folder):
705 os.makedirs(data_folder)
706 with open(data_folder / "train.txt", "a") as train_file:
708 write_count = 0
709 review_5_count = 0
710 # read the downloaded gzipped review file line by line
711 with gzip.open(flair.cache_root / "datasets" / 'Amazon_Product_Reviews' / part_name, "rb", ) as f_in:
712 for line in f_in:
713 parsed_json = json.loads(line)
714 if 'reviewText' not in parsed_json:
715 continue
716 if parsed_json['reviewText'].strip() == '':
717 continue
718 text = parsed_json['reviewText'].replace('\n', '')
720 if fraction_of_5_star_reviews and str(parsed_json['overall']) == '5.0':
721 review_5_count += 1
722 if review_5_count != fraction_of_5_star_reviews:
723 continue
724 else:
725 review_5_count = 0
727 train_file.write(f"__label__{parsed_json['overall']} {text}\n")
729 write_count += 1
730 if max_data_points and write_count >= max_data_points:
731 break
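# Hedged example: build a smaller corpus by limiting how many reviews are taken from each of the
# 28 product-category files (the values below are arbitrary, not the defaults):
#
#     corpus = AMAZON_REVIEWS(split_max=5000, fraction_of_5_star_reviews=10, memory_mode="partial")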
734class IMDB(ClassificationCorpus):
735 """
736 Corpus of IMDB movie reviews labeled by sentiment (POSITIVE, NEGATIVE). Downloaded from and documented at
737 http://ai.stanford.edu/~amaas/data/sentiment/.
738 """
740 def __init__(self,
741 base_path: Union[str, Path] = None,
742 rebalance_corpus: bool = True,
743 tokenizer: Tokenizer = SegtokTokenizer(),
744 memory_mode='partial',
745 **corpusargs):
746 """
748 :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
749 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
750 :param rebalance_corpus: The original splits for this corpus use an impractical 50/50 train/test split.
751 With rebalance_corpus=True (the default), the corpus is rebalanced to an 80/10/10 train/dev/test split. If you
752 want to use the original splits, set this to False.
753 :param memory_mode: Set to 'partial' because this is a huge corpus, but you can also set to 'full' for faster
754 processing or 'disk' for less memory.
755 :param corpusargs: Other args for ClassificationCorpus.
756 """
758 if type(base_path) == str:
759 base_path: Path = Path(base_path)
761 # this dataset name
762 dataset_name = self.__class__.__name__.lower() + '_v4'
764 # default dataset folder is the cache root
765 if not base_path:
766 base_path = flair.cache_root / "datasets"
768 # download data if necessary
769 imdb_acl_path = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
771 if rebalance_corpus:
772 dataset_name = dataset_name + '-rebalanced'
773 data_folder = base_path / dataset_name
774 data_path = flair.cache_root / "datasets" / dataset_name
775 train_data_file = data_path / "train.txt"
776 test_data_file = data_path / "test.txt"
778 if not train_data_file.is_file() or (not rebalance_corpus and not test_data_file.is_file()):
779 [os.remove(file_path) for file_path in [train_data_file, test_data_file] if file_path.is_file()]
781 cached_path(imdb_acl_path, Path("datasets") / dataset_name)
782 import tarfile
784 with tarfile.open(
785 flair.cache_root
786 / "datasets"
787 / dataset_name
788 / "aclImdb_v1.tar.gz",
789 "r:gz",
790 ) as f_in:
791 datasets = ["train", "test"]
792 labels = ["pos", "neg"]
794 for label in labels:
795 for dataset in datasets:
796 f_in.extractall(
797 data_path,
798 members=[
799 m
800 for m in f_in.getmembers()
801 if f"{dataset}/{label}" in m.name
802 ],
803 )
804 data_file = train_data_file
805 if not rebalance_corpus and dataset == "test":
806 data_file = test_data_file
808 with open(data_file, "at") as f_p:
809 current_path = data_path / "aclImdb" / dataset / label
810 for file_name in current_path.iterdir():
811 if file_name.is_file() and file_name.name.endswith(
812 ".txt"
813 ):
814 if label == "pos": sentiment_label = 'POSITIVE'
815 if label == "neg": sentiment_label = 'NEGATIVE'
816 f_p.write(
817 f"__label__{sentiment_label} "
818 + file_name.open("rt", encoding="utf-8").read()
819 + "\n"
820 )
822 super(IMDB, self).__init__(
823 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs
824 )
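# Minimal usage sketch (assuming the default download location; no arguments are required):
#
#     corpus = IMDB()
#     print(corpus)   # shows train/dev/test sizes after the default 80/10/10 rebalancing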
827class NEWSGROUPS(ClassificationCorpus):
828 """
829 20 newsgroups corpus available at "http://qwone.com/~jason/20Newsgroups", classifying
830 news items into one of 20 categories. Each data point is a full news article so documents may be very long.
831 """
833 def __init__(self,
834 base_path: Union[str, Path] = None,
835 tokenizer: Tokenizer = SegtokTokenizer(),
836 memory_mode: str = 'partial',
837 **corpusargs
838 ):
839 """
840 Instantiates 20 newsgroups corpus.
841 :param base_path: Provide this only if you store the 20 newsgroups corpus in a specific folder, otherwise use default.
842 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
843 :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster
844 processing or 'disk' for less memory.
845 :param corpusargs: Other args for ClassificationCorpus.
846 """
848 if type(base_path) == str:
849 base_path: Path = Path(base_path)
851 # this dataset name
852 dataset_name = self.__class__.__name__.lower()
854 # default dataset folder is the cache root
855 if not base_path:
856 base_path = flair.cache_root / "datasets"
857 data_folder = base_path / dataset_name
859 # download data if necessary
860 twenty_newsgroups_path = (
861 "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"
862 )
863 data_path = flair.cache_root / "datasets" / dataset_name
864 data_file = data_path / "20news-bydate-train.txt"
865 if not data_file.is_file():
866 cached_path(
867 twenty_newsgroups_path, Path("datasets") / dataset_name / "original"
868 )
870 import tarfile
872 with tarfile.open(
873 flair.cache_root
874 / "datasets"
875 / dataset_name
876 / "original"
877 / "20news-bydate.tar.gz",
878 "r:gz",
879 ) as f_in:
880 datasets = ["20news-bydate-test", "20news-bydate-train"]
881 labels = [
882 "alt.atheism",
883 "comp.graphics",
884 "comp.os.ms-windows.misc",
885 "comp.sys.ibm.pc.hardware",
886 "comp.sys.mac.hardware",
887 "comp.windows.x",
888 "misc.forsale",
889 "rec.autos",
890 "rec.motorcycles",
891 "rec.sport.baseball",
892 "rec.sport.hockey",
893 "sci.crypt",
894 "sci.electronics",
895 "sci.med",
896 "sci.space",
897 "soc.religion.christian",
898 "talk.politics.guns",
899 "talk.politics.mideast",
900 "talk.politics.misc",
901 "talk.religion.misc",
902 ]
904 for label in labels:
905 for dataset in datasets:
906 f_in.extractall(
907 data_path / "original",
908 members=[
909 m
910 for m in f_in.getmembers()
911 if f"{dataset}/{label}" in m.name
912 ],
913 )
914 with open(
915 f"{data_path}/{dataset}.txt", "at", encoding="utf-8"
916 ) as f_p:
917 current_path = data_path / "original" / dataset / label
918 for file_name in current_path.iterdir():
919 if file_name.is_file():
920 f_p.write(
921 f"__label__{label} "
922 + file_name.open("rt", encoding="latin1")
923 .read()
924 .replace("\n", " <n> ")
925 + "\n"
926 )
928 super(NEWSGROUPS, self).__init__(
929 data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
930 )
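# Usage sketch (the first call downloads and unpacks the 20news-bydate archive):
#
#     corpus = NEWSGROUPS(memory_mode="partial")
#     print(corpus.train[0].labels)   # one of the 20 newsgroup labels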
933class SENTIMENT_140(ClassificationCorpus):
934 """
935 Twitter sentiment corpus downloaded from and documented at http://help.sentiment140.com/for-students. Two sentiments
936 in train data (POSITIVE, NEGATIVE) and three sentiments in test data (POSITIVE, NEGATIVE, NEUTRAL).
937 """
939 def __init__(
940 self,
941 label_name_map=None,
942 tokenizer: Tokenizer = SegtokTokenizer(),
943 memory_mode: str = 'partial',
944 **corpusargs,
945 ):
946 """
947 Instantiates twitter sentiment corpus.
948 :param label_name_map: By default, the numeric values are mapped to ('NEGATIVE', 'POSITIVE' and 'NEUTRAL')
949 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
950 :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster
951 processing or 'disk' for less memory.
952 :param corpusargs: Other args for ClassificationCorpus.
953 """
955 # by default, map the numeric scores to NEGATIVE / NEUTRAL / POSITIVE values
956 if label_name_map is None:
957 label_name_map = {'0': 'NEGATIVE',
958 '2': 'NEUTRAL',
959 '4': 'POSITIVE'}
961 # this dataset name
962 dataset_name = self.__class__.__name__.lower()
964 # default dataset folder is the cache root
965 data_folder = flair.cache_root / "datasets" / dataset_name
967 # download data if necessary
968 if not (data_folder / "train.txt").is_file():
970 # download the sentiment140 data if necessary and unzip
971 sentiment_url = "https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"
972 cached_path(sentiment_url, Path("datasets") / dataset_name / 'raw')
973 senteval_folder = flair.cache_root / "datasets" / dataset_name / 'raw'
974 unzip_file(senteval_folder / "trainingandtestdata.zip", senteval_folder)
976 # create dataset directory if necessary
977 if not os.path.exists(data_folder):
978 os.makedirs(data_folder)
980 # create train.txt file from CSV
981 with open(data_folder / "train.txt", "w") as train_file:
983 with open(senteval_folder / "training.1600000.processed.noemoticon.csv",
984 encoding='latin-1') as csv_train:
985 csv_reader = csv.reader(csv_train)
987 for row in csv_reader:
988 label = row[0]
989 text = row[5]
990 train_file.write(f"__label__{label} {text}\n")
992 # create test.txt file from CSV
993 with open(data_folder / "test.txt", "w") as test_file:
995 with open(senteval_folder / "testdata.manual.2009.06.14.csv", encoding='latin-1') as csv_test:
996 csv_reader = csv.reader(csv_test)
998 for row in csv_reader:
999 label = row[0]
1000 text = row[5]
1001 test_file.write(f"__label__{label} {text}\n")
1003 super(SENTIMENT_140, self).__init__(
1004 data_folder, label_type='sentiment', tokenizer=tokenizer,
1005 memory_mode=memory_mode, label_name_map=label_name_map, **corpusargs,
1006 )
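# Usage sketch (the first call downloads the sentiment140 CSVs and converts them to FastText format):
#
#     corpus = SENTIMENT_140(memory_mode="partial")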
1009class SENTEVAL_CR(ClassificationCorpus):
1010 """
1011 The customer reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into
1012 NEGATIVE or POSITIVE sentiment.
1013 """
1015 def __init__(
1016 self,
1017 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1018 memory_mode: str = 'full',
1019 **corpusargs,
1020 ):
1021 """
1022 Instantiates SentEval customer reviews dataset.
1023 :param corpusargs: Other args for ClassificationCorpus.
1024 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer())
1025 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1026 """
1028 # this dataset name
1029 dataset_name = self.__class__.__name__.lower()
1031 # default dataset folder is the cache root
1032 data_folder = flair.cache_root / "datasets" / dataset_name
1034 # download data if necessary
1035 if not (data_folder / "train.txt").is_file():
1037 # download senteval datasets if necessary and unzip
1038 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip"
1039 cached_path(senteval_path, Path("datasets") / "senteval")
1040 senteval_folder = flair.cache_root / "datasets" / "senteval"
1041 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder)
1043 # create dataset directory if necessary
1044 if not os.path.exists(data_folder):
1045 os.makedirs(data_folder)
1047 # create train.txt file by iterating over pos and neg file
1048 with open(data_folder / "train.txt", "a") as train_file:
1050 with open(senteval_folder / "data" / "customerr" / "custrev.pos", encoding="latin1") as file:
1051 for line in file:
1052 train_file.write(f"__label__POSITIVE {line}")
1054 with open(senteval_folder / "data" / "customerr" / "custrev.neg", encoding="latin1") as file:
1055 for line in file:
1056 train_file.write(f"__label__NEGATIVE {line}")
1058 super(SENTEVAL_CR, self).__init__(
1059 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1060 )
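# The SENTEVAL_* corpora below all follow this same download-and-convert pattern; a hedged example:
#
#     corpus = SENTEVAL_CR()                        # small corpus, kept fully in memory by default
#     corpus = SENTEVAL_CR(memory_mode="partial")   # or stream lines instead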
1063class SENTEVAL_MR(ClassificationCorpus):
1064 """
1065 The movie reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into
1066 NEGATIVE or POSITIVE sentiment.
1067 """
1069 def __init__(
1070 self,
1071 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1072 memory_mode: str = 'full',
1073 **corpusargs
1074 ):
1075 """
1076 Instantiates SentEval movie reviews dataset.
1077 :param corpusargs: Other args for ClassificationCorpus.
1078 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1079 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1080 """
1082 # this dataset name
1083 dataset_name = self.__class__.__name__.lower()
1085 # default dataset folder is the cache root
1086 data_folder = flair.cache_root / "datasets" / dataset_name
1088 # download data if necessary
1089 if not (data_folder / "train.txt").is_file():
1091 # download senteval datasets if necessary and unzip
1092 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip"
1093 cached_path(senteval_path, Path("datasets") / "senteval")
1094 senteval_folder = flair.cache_root / "datasets" / "senteval"
1095 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder)
1097 # create dataset directory if necessary
1098 if not os.path.exists(data_folder):
1099 os.makedirs(data_folder)
1101 # create train.txt file by iterating over pos and neg file
1102 with open(data_folder / "train.txt", "a") as train_file:
1104 with open(senteval_folder / "data" / "rt10662" / "rt-polarity.pos", encoding="latin1") as file:
1105 for line in file:
1106 train_file.write(f"__label__POSITIVE {line}")
1108 with open(senteval_folder / "data" / "rt10662" / "rt-polarity.neg", encoding="latin1") as file:
1109 for line in file:
1110 train_file.write(f"__label__NEGATIVE {line}")
1112 super(SENTEVAL_MR, self).__init__(
1113 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs
1114 )
1117class SENTEVAL_SUBJ(ClassificationCorpus):
1118 """
1119 The subjectivity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into
1120 SUBJECTIVE or OBJECTIVE.
1121 """
1123 def __init__(
1124 self,
1125 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1126 memory_mode: str = 'full',
1127 **corpusargs,
1128 ):
1129 """
1130 Instantiates SentEval subjectivity dataset.
1131 :param corpusargs: Other args for ClassificationCorpus.
1132 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1133 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1134 """
1136 # this dataset name
1137 dataset_name = self.__class__.__name__.lower()
1139 # default dataset folder is the cache root
1140 data_folder = flair.cache_root / "datasets" / dataset_name
1142 # download data if necessary
1143 if not (data_folder / "train.txt").is_file():
1145 # download senteval datasets if necessary and unzip
1146 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip"
1147 cached_path(senteval_path, Path("datasets") / "senteval")
1148 senteval_folder = flair.cache_root / "datasets" / "senteval"
1149 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder)
1151 # create dataset directory if necessary
1152 if not os.path.exists(data_folder):
1153 os.makedirs(data_folder)
1155 # create train.txt file by iterating over pos and neg file
1156 with open(data_folder / "train.txt", "a") as train_file:
1158 with open(senteval_folder / "data" / "subj" / "subj.subjective", encoding="latin1") as file:
1159 for line in file:
1160 train_file.write(f"__label__SUBJECTIVE {line}")
1162 with open(senteval_folder / "data" / "subj" / "subj.objective", encoding="latin1") as file:
1163 for line in file:
1164 train_file.write(f"__label__OBJECTIVE {line}")
1166 super(SENTEVAL_SUBJ, self).__init__(
1167 data_folder, label_type='objectivity', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1168 )
1171class SENTEVAL_MPQA(ClassificationCorpus):
1172 """
1173 The opinion-polarity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into
1174 NEGATIVE or POSITIVE polarity.
1175 """
1177 def __init__(
1178 self,
1179 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1180 memory_mode: str = 'full',
1181 **corpusargs,
1182 ):
1183 """
1184 Instantiates SentEval opinion polarity dataset.
1185 :param corpusargs: Other args for ClassificationCorpus.
1186 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1187 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1188 """
1190 # this dataset name
1191 dataset_name = self.__class__.__name__.lower()
1193 # default dataset folder is the cache root
1194 data_folder = flair.cache_root / "datasets" / dataset_name
1196 # download data if necessary
1197 if not (data_folder / "train.txt").is_file():
1199 # download senteval datasets if necessary and unzip
1200 senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip"
1201 cached_path(senteval_path, Path("datasets") / "senteval")
1202 senteval_folder = flair.cache_root / "datasets" / "senteval"
1203 unzip_file(senteval_folder / "datasmall_NB_ACL12.zip", senteval_folder)
1205 # create dataset directory if necessary
1206 if not os.path.exists(data_folder):
1207 os.makedirs(data_folder)
1209 # create train.txt file by iterating over pos and neg file
1210 with open(data_folder / "train.txt", "a") as train_file:
1212 with open(senteval_folder / "data" / "mpqa" / "mpqa.pos", encoding="latin1") as file:
1213 for line in file:
1214 train_file.write(f"__label__POSITIVE {line}")
1216 with open(senteval_folder / "data" / "mpqa" / "mpqa.neg", encoding="latin1") as file:
1217 for line in file:
1218 train_file.write(f"__label__NEGATIVE {line}")
1220 super(SENTEVAL_MPQA, self).__init__(
1221 data_folder, label_type='sentiment', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1222 )
1225class SENTEVAL_SST_BINARY(ClassificationCorpus):
1226 """
1227 The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified
1228 into NEGATIVE or POSITIVE sentiment.
1229 """
1231 def __init__(
1232 self,
1233 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1234 memory_mode: str = 'full',
1235 **corpusargs,
1236 ):
1237 """
1238 Instantiates SentEval Stanford sentiment treebank dataset.
1239 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1240 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1241 :param corpusargs: Other args for ClassificationCorpus.
1242 """
1244 # this dataset name
1245 dataset_name = self.__class__.__name__.lower() + '_v2'
1247 # default dataset folder is the cache root
1248 data_folder = flair.cache_root / "datasets" / dataset_name
1250 # download data if necessary
1251 if not (data_folder / "train.txt").is_file():
1253 # download senteval datasets if necessary and unzip
1254 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-train',
1255 Path("datasets") / dataset_name / 'raw')
1256 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-test',
1257 Path("datasets") / dataset_name / 'raw')
1258 cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-dev',
1259 Path("datasets") / dataset_name / 'raw')
1261 original_filenames = ["sentiment-train", "sentiment-dev", "sentiment-test"]
1262 new_filenames = ["train.txt", "dev.txt", "test.txt"]
1264 # create train dev and test files in fasttext format
1265 for new_filename, original_filename in zip(new_filenames, original_filenames):
1266 with open(data_folder / new_filename, "a") as out_file, open(
1267 data_folder / 'raw' / original_filename) as in_file:
1268 for line in in_file:
1269 fields = line.split('\t')
1270 label = 'POSITIVE' if fields[1].rstrip() == '1' else 'NEGATIVE'
1271 out_file.write(f"__label__{label} {fields[0]}\n")
1273 super(SENTEVAL_SST_BINARY, self).__init__(
1274 data_folder,
1275 tokenizer=tokenizer,
1276 memory_mode=memory_mode,
1277 **corpusargs,
1278 )
1281class SENTEVAL_SST_GRANULAR(ClassificationCorpus):
1282 """
1283 The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified
1284 into 5 sentiment classes.
1285 """
1287 def __init__(
1288 self,
1289 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1290 memory_mode: str = 'full',
1291 **corpusargs,
1292 ):
1293 """
1294 Instantiates SentEval Stanford sentiment treebank dataset.
1295 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1296 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1297 :param corpusargs: Other args for ClassificationCorpus.
1298 """
1300 # this dataset name
1301 dataset_name = self.__class__.__name__.lower()
1303 # default dataset folder is the cache root
1304 data_folder = flair.cache_root / "datasets" / dataset_name
1306 # download data if necessary
1307 if not (data_folder / "train.txt").is_file():
1309 # download senteval datasets if necessary and unzip
1310 cached_path(
1311 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.train',
1312 Path("datasets") / dataset_name / 'raw')
1313 cached_path(
1314 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.test',
1315 Path("datasets") / dataset_name / 'raw')
1316 cached_path(
1317 'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.dev',
1318 Path("datasets") / dataset_name / 'raw')
1320 # convert to FastText format
1321 for split in ['train', 'dev', 'test']:
1322 with open(data_folder / f"{split}.txt", "w") as out_file:
1324 with open(data_folder / 'raw' / f'stsa.fine.{split}', encoding="latin1") as file:
1325 for line in file:
1326 out_file.write(f"__label__{line[0]} {line[2:]}")
1328 super(SENTEVAL_SST_GRANULAR, self).__init__(
1329 data_folder,
1330 tokenizer=tokenizer,
1331 memory_mode=memory_mode,
1332 **corpusargs,
1333 )
1336class GLUE_COLA(ClassificationCorpus):
1337 """
1338 Corpus of Linguistic Acceptability from GLUE benchmark (https://gluebenchmark.com/tasks).
1339 The task is to predict whether an English sentence is grammatically correct.
1340 In addition to the Corpus, an eval_dataset is provided containing the unlabeled test data for GLUE evaluation.
1341 """
1343 def __init__(self,
1344 label_type="acceptability",
1345 base_path: Union[str, Path] = None,
1346 tokenizer: Tokenizer = SegtokTokenizer(),
1347 **corpusargs):
1348 """
1349 Instantiates CoLA dataset
1350 :param base_path: Provide this only if you store the COLA corpus in a specific folder.
1351 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
1352 :param corpusargs: Other args for ClassificationCorpus.
1353 """
1355 if type(base_path) == str:
1356 base_path: Path = Path(base_path)
1358 dataset_name = "glue"
1360 # if no base_path provided take cache root
1361 if not base_path:
1362 base_path = flair.cache_root / "datasets"
1363 data_folder = base_path / dataset_name
1365 # download data if necessary
1366 cola_path = "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip"
1368 data_file = data_folder / "CoLA/train.txt"
1370 # if data is not downloaded yet, download it
1371 if not data_file.is_file():
1372 # get the zip file
1373 zipped_data_path = cached_path(cola_path, Path("datasets") / dataset_name)
1375 unpack_file(zipped_data_path, data_folder, mode="zip", keep=False)
1377 # move original .tsv files to another folder
1378 Path(data_folder / "CoLA/train.tsv").rename(data_folder / "CoLA/original/train.tsv")
1379 Path(data_folder / "CoLA/dev.tsv").rename(data_folder / "CoLA/original/dev.tsv")
1380 Path(data_folder / "CoLA/test.tsv").rename(data_folder / "CoLA/original/test.tsv")
1382 label_map = {0: 'not_grammatical', 1: 'grammatical'}
1384 # create train and dev splits in fasttext format
1385 for split in ["train", "dev"]:
1386 with open(data_folder / "CoLA" / (split + ".txt"), "a") as out_file, open(
1387 data_folder / "CoLA" / "original" / (split + ".tsv")) as in_file:
1388 for line in in_file:
1389 fields = line.rstrip().split('\t')
1390 label = int(fields[1])
1391 sentence = fields[3]
1392 out_file.write(f"__label__{label_map[label]} {sentence}\n")
1394 # create eval_dataset file with no labels
1395 with open(data_folder / "CoLA" / "eval_dataset.txt", "a") as out_file, open(
1396 data_folder / "CoLA" / "original" / "test.tsv",) as in_file:
1397 for line in in_file:
1398 fields = line.rstrip().split('\t')
1399 sentence = fields[1]
1400 out_file.write(f"{sentence}\n")
1402 super(GLUE_COLA, self).__init__(
1403 data_folder / "CoLA",
1404 label_type=label_type,
1405 tokenizer=tokenizer,
1406 **corpusargs,
1407 )
1409 self.eval_dataset = ClassificationDataset(
1410 data_folder / "CoLA/eval_dataset.txt",
1411 label_type=label_type,
1412 allow_examples_without_labels=True,
1413 tokenizer=tokenizer,
1414 memory_mode="full",
1415 )
1417 """
1418 This function creates a tsv file with predictions of the eval_dataset (after calling
1419 classifier.predict(corpus.eval_dataset, label_name='acceptability')). The resulting file
1420 is called CoLA.tsv and is in the format required for submission to the Glue Benchmark.
1421 """
1423 def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):
1425 if type(folder_path) == str:
1426 folder_path = Path(folder_path)
1427 folder_path = folder_path / 'CoLA.tsv'
1429 with open(folder_path, mode='w') as tsv_file:
1430 tsv_file.write("index\tprediction\n")
1431 for index, datapoint in enumerate(self.eval_dataset):
1432 reverse_label_map = {'grammatical': 1, 'not_grammatical': 0}
1433 predicted_label = reverse_label_map[datapoint.get_labels('acceptability')[0].value]
1434 tsv_file.write(f"{index}\t{predicted_label}\n")
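# Hedged end-to-end sketch for producing a GLUE submission file; `classifier` stands for any
# trained flair text classifier and is not defined in this module. The output folder must exist:
#
#     corpus = GLUE_COLA()
#     classifier.predict(corpus.eval_dataset, label_name="acceptability")
#     corpus.tsv_from_eval_dataset("glue_submission")   # writes glue_submission/CoLA.tsv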
1437class GO_EMOTIONS(ClassificationCorpus):
1438 """
1439 GoEmotions dataset containing 58k Reddit comments labeled with 27 emotion categories plus NEUTRAL, see https://github.com/google-research/google-research/tree/master/goemotions
1440 """
1441 def __init__(
1442 self,
1443 base_path: Union[str, Path] = None,
1444 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SegtokTokenizer(),
1445 memory_mode: str = 'partial',
1446 **corpusargs,
1447 ):
1448 """
1449 Parameters
1450 ----------
1451 base_path : Provide this only if you want to store the corpus in a specific folder, otherwise use default.
1452 tokenizer : Default is SegtokTokenizer().
1453 memory_mode : Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'
1454 if full corpus and all embeddings fits into memory for speedups during training. Otherwise use 'partial' and if
1455 even this is too much for your memory, use 'disk'.
1456 **corpusargs : Other args for ClassificationCorpus.
1458 """
1460 label_name_map = {'0': 'ADMIRATION',
1461 '1': 'AMUSEMENT',
1462 '2': 'ANGER',
1463 '3': 'ANNOYANCE',
1464 '4': 'APPROVAL',
1465 '5': 'CARING',
1466 '6': 'CONFUSION',
1467 '7': 'CURIOSITY',
1468 '8': 'DESIRE',
1469 '9': 'DISAPPOINTMENT',
1470 '10': 'DISAPPROVAL',
1471 '11': 'DISGUST',
1472 '12': 'EMBARRASSMENT',
1473 '13': 'EXCITEMENT',
1474 '14': 'FEAR',
1475 '15': 'GRATITUDE',
1476 '16': 'GRIEF',
1477 '17': 'JOY',
1478 '18': 'LOVE',
1479 '19': 'NERVOUSNESS',
1480 '20': 'OPTIMISM',
1481 '21': 'PRIDE',
1482 '22': 'REALIZATION',
1483 '23': 'RELIEF',
1484 '24': 'REMORSE',
1485 '25': 'SADNESS',
1486 '26': 'SURPRISE',
1487 '27': 'NEUTRAL'}
1489 if type(base_path) == str:
1490 base_path: Path = Path(base_path)
1492 # default dataset folder is the cache root
1493 if not base_path:
1494 base_path = flair.cache_root / "datasets"
1496 # this dataset name
1497 dataset_name = self.__class__.__name__.lower()
1499 # default dataset folder is the cache root
1500 data_folder = base_path / dataset_name
1502 # download data if necessary
1503 if not (data_folder / "train.txt").is_file():
1505 # download datasets if necessary
1506 goemotions_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/"
1507 for name in ["train.tsv", "test.tsv", "dev.tsv"]:
1508 cached_path(goemotions_url + name, Path("datasets") / dataset_name / 'raw')
1510 # create dataset directory if necessary
1511 if not os.path.exists(data_folder):
1512 os.makedirs(data_folder)
1514 data_path = flair.cache_root / "datasets" / dataset_name / 'raw'
1515 # create correctly formatted txt files
1516 for name in ["train", "test", "dev"]:
1517 with open(data_folder / (name + '.txt'), "w", encoding='utf-8') as txt_file:
1518 with open(data_path / (name + ".tsv"), "r", encoding='utf-8') as tsv_file:
1520 lines = tsv_file.readlines()
1521 for line in lines:
1522 row = line.split('\t')
1523 text = row[0]
1524 # multiple labels are possible
1525 labels = row[1].split(',')
1526 label_string = ""
1527 for label in labels:
1528 label_string += '__label__'
1529 label_string += label
1530 label_string += ' '
1531 txt_file.write(f"{label_string}{text}\n")
1533 super(GO_EMOTIONS, self).__init__(
1534 data_folder, label_type='emotion', tokenizer=tokenizer,
1535 memory_mode=memory_mode, label_name_map=label_name_map, **corpusargs,
1536 )
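# GoEmotions is multi-label: a single comment can carry several emotion labels. Illustrative use
# (the printed labels are only an example of what such output might look like):
#
#     corpus = GO_EMOTIONS(memory_mode="partial")
#     sentence = corpus.train[0]
#     print(sentence.labels)   # may contain more than one label, e.g. ADMIRATION and GRATITUDE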
1539class TREC_50(ClassificationCorpus):
1540 """
1541 The TREC Question Classification Corpus, classifying questions into 50 fine-grained answer types.
1542 """
1544 def __init__(self,
1545 base_path: Union[str, Path] = None,
1546 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1547 memory_mode='full',
1548 **corpusargs
1549 ):
1550 """
1551 Instantiates TREC Question Classification Corpus with 50 fine-grained classes.
1552 :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default.
1553 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1554 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1555 :param corpusargs: Other args for ClassificationCorpus.
1556 """
1558 if type(base_path) == str:
1559 base_path: Path = Path(base_path)
1561 # this dataset name
1562 dataset_name = self.__class__.__name__.lower()
1564 # default dataset folder is the cache root
1565 if not base_path:
1566 base_path = flair.cache_root / "datasets"
1567 data_folder = base_path / dataset_name
1569 # download data if necessary
1570 trec_path = "https://cogcomp.seas.upenn.edu/Data/QA/QC/"
1572 original_filenames = ["train_5500.label", "TREC_10.label"]
1573 new_filenames = ["train.txt", "test.txt"]
1574 for original_filename in original_filenames:
1575 cached_path(
1576 f"{trec_path}{original_filename}",
1577 Path("datasets") / dataset_name / "original",
1578 )
1580 data_file = data_folder / new_filenames[0]
1582 if not data_file.is_file():
1583 for original_filename, new_filename in zip(
1584 original_filenames, new_filenames
1585 ):
1586 with open(
1587 data_folder / "original" / original_filename,
1588 "rt",
1589 encoding="latin1",
1590 ) as open_fp:
1591 with open(
1592 data_folder / new_filename, "wt", encoding="utf-8"
1593 ) as write_fp:
1594 for line in open_fp:
1595 line = line.rstrip()
1596 fields = line.split()
1597 old_label = fields[0]
1598 question = " ".join(fields[1:])
1600 # Create flair compatible labels
1601 # TREC-6 : NUM:dist -> __label__NUM
1602 # TREC-50: NUM:dist -> __label__NUM:dist
1603 new_label = "__label__"
1604 new_label += old_label
1606 write_fp.write(f"{new_label} {question}\n")
1608 super(TREC_50, self).__init__(
1609 data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1610 )
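
# --- Usage sketch (illustrative only, not part of the original module) ---
# TREC_50 keeps the full "COARSE:fine" label written above (e.g. "NUM:dist"),
# in contrast to TREC_6 below, which keeps only the coarse part.
# The function name is hypothetical.
def _example_trec_50_usage():
    corpus = TREC_50()
    sentence = corpus.test[0]
    print(sentence.labels)  # fine-grained label such as "NUM:dist"
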
1613class TREC_6(ClassificationCorpus):
1614 """
1615 The TREC Question Classification Corpus, classifying questions into 6 coarse-grained answer types
1616 (DESC, HUM, LOC, ENTY, NUM, ABBR).
1617 """
1619 def __init__(self,
1620 base_path: Union[str, Path] = None,
1621 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1622 memory_mode='full',
1623 **corpusargs
1624 ):
1625 """
1626 Instantiates TREC Question Classification Corpus with 6 classes.
1627 :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default.
1628 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1629 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1630 :param corpusargs: Other args for ClassificationCorpus.
1631 """
1633 if type(base_path) == str:
1634 base_path: Path = Path(base_path)
1636 # this dataset name
1637 dataset_name = self.__class__.__name__.lower()
1639 # default dataset folder is the cache root
1640 if not base_path:
1641 base_path = flair.cache_root / "datasets"
1642 data_folder = base_path / dataset_name
1644 # download data if necessary
1645 trec_path = "https://cogcomp.seas.upenn.edu/Data/QA/QC/"
1647 original_filenames = ["train_5500.label", "TREC_10.label"]
1648 new_filenames = ["train.txt", "test.txt"]
1649 for original_filename in original_filenames:
1650 cached_path(
1651 f"{trec_path}{original_filename}",
1652 Path("datasets") / dataset_name / "original",
1653 )
1655 data_file = data_folder / new_filenames[0]
1657 if not data_file.is_file():
1658 for original_filename, new_filename in zip(
1659 original_filenames, new_filenames
1660 ):
1661 with open(
1662 data_folder / "original" / original_filename,
1663 "rt",
1664 encoding="latin1",
1665 ) as open_fp:
1666 with open(
1667 data_folder / new_filename, "wt", encoding="utf-8"
1668 ) as write_fp:
1669 for line in open_fp:
1670 line = line.rstrip()
1671 fields = line.split()
1672 old_label = fields[0]
1673 question = " ".join(fields[1:])
1675 # Create flair compatible labels
1676 # TREC-6 : NUM:dist -> __label__NUM
1677 # TREC-50: NUM:dist -> __label__NUM:dist
1678 new_label = "__label__"
1679 new_label += old_label.split(":")[0]
1681 write_fp.write(f"{new_label} {question}\n")
1683 super(TREC_6, self).__init__(
1684 data_folder, label_type='question_class', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1685 )
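
# --- Usage sketch (illustrative only, not part of the original module) ---
# TREC_6 reduces each label to its coarse part (e.g. "NUM:dist" becomes "NUM")
# and registers it under the label type 'question_class' set above.
# The function name is hypothetical.
def _example_trec_6_usage():
    corpus = TREC_6()
    sentence = corpus.train[0]
    print(sentence.get_labels('question_class'))  # coarse label such as "NUM"
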
1688class YAHOO_ANSWERS(ClassificationCorpus):
1689 """
1690 The YAHOO Question Classification Corpus, classifying questions into 10 coarse-grained topic categories (e.g. Health, Sports).
1691 """
1693 def __init__(self,
1694 base_path: Union[str, Path] = None,
1695 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1696 memory_mode='partial',
1697 **corpusargs
1698 ):
1699 """
1700 Instantiates YAHOO Question Classification Corpus with 10 classes.
1701 :param base_path: Provide this only if you store the YAHOO corpus in a specific folder, otherwise use default.
1702 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1703 :param memory_mode: Set to 'partial' by default since this is a rather big corpus. Can also be 'full' or 'disk'.
1704 :param corpusargs: Other args for ClassificationCorpus.
1705 """
1707 if type(base_path) == str:
1708 base_path: Path = Path(base_path)
1710 # this dataset name
1711 dataset_name = self.__class__.__name__.lower()
1713 # default dataset folder is the cache root
1714 if not base_path:
1715 base_path = flair.cache_root / "datasets"
1716 data_folder = base_path / dataset_name
1718 # download data if necessary
1719 url = "https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz"
1721 label_map = {'1': 'Society_&_Culture',
1722 '2': 'Science_&_Mathematics',
1723 '3': 'Health',
1724 '4': 'Education_&_Reference',
1725 '5': 'Computers_&_Internet',
1726 '6': 'Sports',
1727 '7': 'Business_&_Finance',
1728 '8': 'Entertainment_&_Music',
1729 '9': 'Family_&_Relationships',
1730 '10': 'Politics_&_Government'}
1732 original = flair.cache_root / "datasets" / dataset_name / "original"
1734 if not (data_folder / "train.txt").is_file():
1735 cached_path(url, original)
1738 import tarfile
1740 tar = tarfile.open(original / "yahoo_answers_csv.tgz", "r:gz")
1741 members = []
1743 for member in tar.getmembers():
1744 if("test.csv" in member.name or "train.csv" in member.name):
1745 members.append(member)
1747 tar.extractall(original, members=members)
1749 for name in ["train", "test"]:
1750 file = open(original / "yahoo_answers_csv" / (name+".csv"), encoding="utf-8")
1751 reader = csv.reader(file)
1752 writer = open(data_folder / (name+".txt"), "wt", encoding="utf-8")
1753 for row in reader:
1754 writer.write("__label__"+label_map.get(row[0])+" "+row[1]+"\n")
1756 file.close()
1757 writer.close()
1759 super(YAHOO_ANSWERS, self).__init__(
1760 data_folder, label_type='question_type', tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1761 )
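
# --- Usage sketch (illustrative only, not part of the original module) ---
# YAHOO_ANSWERS is comparatively large, so 'partial' is the default memory mode;
# 'disk' trades speed for an even smaller memory footprint, while 'full' keeps
# everything in memory. The function name is hypothetical.
def _example_yahoo_answers_usage():
    corpus = YAHOO_ANSWERS(memory_mode='partial')
    print(corpus)
    sentence = corpus.train[0]
    print(sentence.get_labels('question_type'))  # e.g. "Health" or "Sports"
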
1764class GERMEVAL_2018_OFFENSIVE_LANGUAGE(ClassificationCorpus):
1765 """
1766 GermEval 2018 corpus for identification of offensive language.
1767 Classifying German tweets into 2 coarse-grained categories OFFENSE and OTHER
1768 or 4 fine-grained categories ABUSE, INSULT, PROFANITY and OTHER.
1769 """
1771 def __init__(self,
1772 base_path: Union[str, Path] = None,
1773 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SegtokTokenizer(),
1774 memory_mode: str = 'full',
1775 fine_grained_classes: bool = False,
1776 **corpusargs):
1777 """
1778 Instantiates GermEval 2018 Offensive Language Classification Corpus.
1779 :param base_path: Provide this only if you store the Offensive Language corpus in a specific folder, otherwise use default.
1780 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
1781 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1782 :param fine_grained_classes: Set to True to load the dataset with 4 fine-grained classes
1783 :param corpusargs: Other args for ClassificationCorpus.
1784 """
1786 if type(base_path) == str:
1787 base_path: Path = Path(base_path)
1789 # this dataset name
1790 dataset_name = self.__class__.__name__.lower()
1792 # default dataset folder is the cache root
1793 if not base_path:
1794 base_path = flair.cache_root / "datasets"
1795 data_folder = base_path / dataset_name
1797 # download data if necessary
1798 offlang_path = "https://raw.githubusercontent.com/uds-lsv/GermEval-2018-Data/master/"
1800 original_filenames = ["germeval2018.training.txt", "germeval2018.test.txt"]
1801 new_filenames = ["train.txt", "test.txt"]
1802 for original_filename in original_filenames:
1803 cached_path(
1804 f"{offlang_path}{original_filename}",
1805 Path("datasets") / dataset_name / "original",
1806 )
1808 task_setting = "coarse_grained"
1809 if fine_grained_classes:
1810 task_setting = "fine_grained"
1812 task_folder = data_folder / task_setting
1813 data_file = task_folder / new_filenames[0]
1815 # create a separate directory for different tasks
1816 if not os.path.exists(task_folder):
1817 os.makedirs(task_folder)
1819 if not data_file.is_file():
1820 for original_filename, new_filename in zip(
1821 original_filenames, new_filenames
1822 ):
1823 with open(
1824 data_folder / "original" / original_filename,
1825 "rt",
1826 encoding="utf-8",
1827 ) as open_fp:
1828 with open(
1829 data_folder / task_setting / new_filename, "wt", encoding="utf-8"
1830 ) as write_fp:
1831 for line in open_fp:
1832 line = line.rstrip()
1833 fields = line.split('\t')
1834 tweet = fields[0]
1835 if task_setting == "fine_grained":
1836 old_label = fields[2]
1837 else:
1838 old_label = fields[1]
1839 new_label = '__label__' + old_label
1840 write_fp.write(f"{new_label} {tweet}\n")
1842 super(GERMEVAL_2018_OFFENSIVE_LANGUAGE, self).__init__(
1843 data_folder=task_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs,
1844 )
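
# --- Usage sketch (illustrative only, not part of the original module) ---
# The same raw download is converted into two task settings: the default binary
# ('coarse_grained') split and, with fine_grained_classes=True, the 4-class
# ('fine_grained') split written to a separate sub-folder above.
# The variable and function names are hypothetical.
def _example_germeval_2018_usage():
    coarse_corpus = GERMEVAL_2018_OFFENSIVE_LANGUAGE()
    fine_corpus = GERMEVAL_2018_OFFENSIVE_LANGUAGE(fine_grained_classes=True)
    print(coarse_corpus)  # 2-class setting
    print(fine_corpus)    # 4-class setting
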
1847class COMMUNICATIVE_FUNCTIONS(ClassificationCorpus):
1848 """
1849 The Communicative Functions Classification Corpus.
1850 Classifying sentences from scientific papers into 39 communicative functions.
1851 """
1853 def __init__(self,
1854 base_path: Union[str, Path] = None,
1855 memory_mode: str = 'full',
1856 tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SpaceTokenizer(),
1857 **corpusargs):
1858 """
1859 Instantiates Communicative Functions Classification Corpus with 39 classes.
1860 :param base_path: Provide this only if you store the Communicative Functions data in a specific folder, otherwise use default.
1861 :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
1862 :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'disk'.
1863 :param corpusargs: Other args for ClassificationCorpus.
1864 """
1866 if type(base_path) == str:
1867 base_path: Path = Path(base_path)
1869 # this dataset name
1870 dataset_name = self.__class__.__name__.lower()
1872 # default dataset folder is the cache root
1873 if not base_path:
1874 base_path = flair.cache_root / "datasets"
1875 data_folder = base_path / dataset_name
1877 original_filenames = ["background.tsv", "discussion.tsv", "introduction.tsv", "method.tsv", "result.tsv"]
1879 # download data if necessary
1880 comm_path = "https://raw.githubusercontent.com/Alab-NII/FECFevalDataset/master/sentences/"
1882 for original_filename in original_filenames:
1883 cached_path(f"{comm_path}{original_filename}", Path("datasets") / dataset_name / "original")
1885 data_file = data_folder / "train.txt"
1887 if not data_file.is_file(): # check if new file already exists
1888 with open(data_folder / "train.txt", 'a+', encoding="utf-8") as write_fp:
1889 for original_filename in original_filenames[:4]:
1890 with open(data_folder / "original" / original_filename, 'rt', encoding="utf-8") as open_fp:
1891 for line in open_fp:
1892 liste = line.split('\t')
1893 write_fp.write('__label__' + liste[0].replace(' ', '_') + ' ' + liste[2] + '\n')
1894 with open(data_folder / "original" / "result.tsv", 'rt', encoding="utf-8") as open_fp:
1895 for line in open_fp:
1896 liste = line.split('\t')
1897 if liste[0].split(' ')[-1] == "(again)":
1898 write_fp.write('__label__' + liste[0][:-8].replace(' ', '_') + ' ' + liste[2] + '\n')
1899 else:
1900 write_fp.write('__label__' + liste[0].replace(' ', '_') + ' ' + liste[2] + '\n')
1902 super(COMMUNICATIVE_FUNCTIONS, self).__init__(
1903 data_folder, label_type='communicative_function', tokenizer=tokenizer, memory_mode=memory_mode,
1904 **corpusargs,
1905 )
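
# --- Usage sketch (illustrative only, not part of the original module) ---
# The preprocessing above replaces spaces in label names with underscores, so the
# labels retrieved under 'communicative_function' are single tokens.
# The function name is hypothetical.
def _example_communicative_functions_usage():
    corpus = COMMUNICATIVE_FUNCTIONS()
    sentence = corpus.train[0]
    print(sentence.get_labels('communicative_function'))
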
1908def _download_wassa_if_not_there(emotion, data_folder, dataset_name):
1909 for split in ["train", "dev", "test"]:
1911 data_file = data_folder / f"{emotion}-{split}.txt"
1913 if not data_file.is_file():
1915 if split == "train":
1916 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt"
1917 if split == "dev":
1918 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt"
1919 if split == "test":
1920 url = f"http://saifmohammad.com/WebDocs/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt"
1922 path = cached_path(url, Path("datasets") / dataset_name)
1924 with open(path, "r", encoding="UTF-8") as f:
1925 with open(data_file, "w", encoding="UTF-8") as out:
1926 next(f)
1927 for line in f:
1928 fields = line.split("\t")
1929 out.write(f"__label__{fields[3].rstrip()} {fields[1]}\n")
1931 os.remove(path)
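
# --- Illustration of the conversion above (not part of the original module) ---
# The sample row below is made up; the column order (id, tweet, emotion, score)
# is only inferred from the indices fields[1] and fields[3] used in the helper.
def _example_wassa_line_conversion():
    line = "10001\tI am furious right now\tanger\t0.833\n"
    fields = line.split("\t")
    converted = f"__label__{fields[3].rstrip()} {fields[1]}\n"
    print(converted)  # "__label__0.833 I am furious right now"
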
1934class WASSA_ANGER(ClassificationCorpus):
1935 """
1936 WASSA-2017 anger emotion-intensity dataset downloaded from and documented at
1937 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
1938 """
1940 def __init__(self,
1941 base_path: Union[str, Path] = None,
1942 tokenizer: Tokenizer = SegtokTokenizer(),
1943 **corpusargs):
1944 """
1945 Instantiates WASSA-2017 anger emotion-intensity dataset
1946 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default.
1947 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
1948 :param corpusargs: Other args for ClassificationCorpus.
1949 """
1951 if type(base_path) == str:
1952 base_path: Path = Path(base_path)
1954 # this dataset name
1955 dataset_name = self.__class__.__name__.lower()
1957 # default dataset folder is the cache root
1958 if not base_path:
1959 base_path = flair.cache_root / "datasets"
1960 data_folder = base_path / dataset_name
1962 # download data if necessary
1963 _download_wassa_if_not_there("anger", data_folder, dataset_name)
1965 super(WASSA_ANGER, self).__init__(
1966 data_folder, tokenizer=tokenizer, **corpusargs,
1967 )
1970class WASSA_FEAR(ClassificationCorpus):
1971 """
1972 WASSA-2017 fear emotion-intensity dataset downloaded from and documented at
1973 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
1974 """
1976 def __init__(self,
1977 base_path: Union[str, Path] = None,
1978 tokenizer: Tokenizer = SegtokTokenizer(),
1979 **corpusargs):
1980 """
1981 Instantiates WASSA-2017 fear emotion-intensity dataset
1982 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default.
1983 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
1984 :param corpusargs: Other args for ClassificationCorpus.
1985 """
1987 if type(base_path) == str:
1988 base_path: Path = Path(base_path)
1990 # this dataset name
1991 dataset_name = self.__class__.__name__.lower()
1993 # default dataset folder is the cache root
1994 if not base_path:
1995 base_path = flair.cache_root / "datasets"
1996 data_folder = base_path / dataset_name
1998 # download data if necessary
1999 _download_wassa_if_not_there("fear", data_folder, dataset_name)
2001 super(WASSA_FEAR, self).__init__(
2002 data_folder, tokenizer=tokenizer, **corpusargs
2003 )
2006class WASSA_JOY(ClassificationCorpus):
2007 """
2008 WASSA-2017 joy emotion-intensity dataset downloaded from and documented at
2009 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
2010 """
2012 def __init__(self,
2013 base_path: Union[str, Path] = None,
2014 tokenizer: Tokenizer = SegtokTokenizer(),
2015 **corpusargs):
2016 """
2017 Instantiates WASSA-2017 joy emotion-intensity dataset
2018 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default.
2019 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
2020 :param corpusargs: Other args for ClassificationCorpus.
2021 """
2023 if type(base_path) == str:
2024 base_path: Path = Path(base_path)
2026 # this dataset name
2027 dataset_name = self.__class__.__name__.lower()
2029 # default dataset folder is the cache root
2030 if not base_path:
2031 base_path = flair.cache_root / "datasets"
2032 data_folder = base_path / dataset_name
2034 # download data if necessary
2035 _download_wassa_if_not_there("joy", data_folder, dataset_name)
2037 super(WASSA_JOY, self).__init__(
2038 data_folder, tokenizer=tokenizer, **corpusargs,
2039 )
2042class WASSA_SADNESS(ClassificationCorpus):
2043 """
2044 WASSA-2017 sadness emotion-intensity dataset downloaded from and documented at
2045 https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
2046 """
2048 def __init__(self,
2049 base_path: Union[str, Path] = None,
2050 tokenizer: Tokenizer = SegtokTokenizer(),
2051 **corpusargs):
2052 """
2053 Instantiates WASSA-2017 sadness emotion-intensity dataset
2054 :param base_path: Provide this only if you store the WASSA corpus in a specific folder, otherwise use default.
2055 :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
2056 :param corpusargs: Other args for ClassificationCorpus.
2057 """
2059 if type(base_path) == str:
2060 base_path: Path = Path(base_path)
2062 # this dataset name
2063 dataset_name = self.__class__.__name__.lower()
2065 # default dataset folder is the cache root
2066 if not base_path:
2067 base_path = flair.cache_root / "datasets"
2068 data_folder = base_path / dataset_name
2070 # download data if necessary
2071 _download_wassa_if_not_there("sadness", data_folder, dataset_name)
2073 super(WASSA_SADNESS, self).__init__(
2074 data_folder, tokenizer=tokenizer, **corpusargs,
2075 )
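
# --- Usage sketch (illustrative only, not part of the original module) ---
# The four WASSA-2017 corpora above share the same format; each Sentence carries
# the emotion-intensity score written by _download_wassa_if_not_there as its label.
# The function name is hypothetical.
def _example_wassa_usage():
    for corpus_class in (WASSA_ANGER, WASSA_FEAR, WASSA_JOY, WASSA_SADNESS):
        corpus = corpus_class()
        print(corpus_class.__name__, corpus)
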