Coverage for /home/ubuntu/Documents/Research/mut_p1/flair/flair/datasets/treebanks.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

870 statements  

1import logging 

2import re 

3from pathlib import Path 

4from typing import List, Union 

5 

6import flair 

7from flair.data import ( 

8 Sentence, 

9 Corpus, 

10 Token, 

11 FlairDataset, 

12) 

13from flair.datasets.base import find_train_dev_test_files 

14from flair.file_utils import cached_path, unzip_file 

15 

# Logger registered under the name "flair".
log = logging.getLogger("flair")

17 

18 

class UniversalDependenciesCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        train_file=None,
        test_file=None,
        dev_file=None,
        in_memory: bool = True,
        split_multiwords: bool = True,
    ):
        """
        Builds a Corpus out of CoNLL-U column-formatted files (for example the UD treebanks).

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file; when no dev file is found, None is handed
            to the base Corpus (presumably it then samples dev data from train - confirm in Corpus)
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        :return: a Corpus with annotated train, dev and test data
        """

        # resolve whichever of the three split files were not given explicitly
        dev_file, test_file, train_file = find_train_dev_test_files(
            data_folder, dev_file, test_file, train_file
        )

        def load_split(conll_file):
            # every split is read with the same settings
            return UniversalDependenciesDataset(
                conll_file, in_memory=in_memory, split_multiwords=split_multiwords
            )

        # train is always loaded; test and dev only when a file is available
        train = load_split(train_file)
        test = None if test_file is None else load_split(test_file)
        dev = None if dev_file is None else load_split(dev_file)

        super(UniversalDependenciesCorpus, self).__init__(
            train, dev, test, name=str(data_folder)
        )

59 

60 

class UniversalDependenciesDataset(FlairDataset):
    """Dataset of sentences read from a single CoNLL-U formatted file."""

    def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True, split_multiwords: bool = True):
        """
        Instantiates a column dataset in CoNLL-U format.

        :param path_to_conll_file: Path to the CoNLL-U formatted file
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiword lines are skipped and their component words
            become the tokens; otherwise the multiword becomes one token and its components are skipped
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(path_to_conll_file, str):
            path_to_conll_file = Path(path_to_conll_file)
        assert path_to_conll_file.exists()

        self.in_memory: bool = in_memory
        self.split_multiwords: bool = split_multiwords

        self.path_to_conll_file = path_to_conll_file
        self.total_sentence_count: int = 0

        with open(str(self.path_to_conll_file), encoding="utf-8") as file:

            # option 1: read only sentence boundaries as offset positions
            if not self.in_memory:
                self.indices: List[int] = []

                # readline() is used (not iteration over the file) so that tell() can be called
                line = file.readline()
                position = 0
                while line:
                    line = line.strip()
                    if line == "":
                        # a blank line closes a sentence: remember where that sentence started,
                        # the next one starts right after the blank line
                        # NOTE(review): a final sentence that is not followed by a blank line
                        # is not indexed
                        self.indices.append(position)
                        position = file.tell()
                    line = file.readline()

                self.total_sentence_count = len(self.indices)

            # option 2: keep everything in memory
            else:
                self.sentences: List[Sentence] = []

                while True:
                    sentence = self._read_next_sentence(file)
                    # a falsy sentence signals that no further sentence could be read
                    if not sentence:
                        break
                    self.sentences.append(sentence)

                self.total_sentence_count = len(self.sentences)

    def is_in_memory(self) -> bool:
        """Returns True if all sentences were parsed and stored at construction time."""
        return self.in_memory

    def __len__(self):
        """Returns the number of sentences counted while reading the file."""
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:
        """
        Returns the sentence at the given position.

        :param index: position of the sentence in the file
        :return: the stored Sentence (in-memory mode) or a Sentence freshly parsed from disk
        """

        # if in memory, retrieve parsed sentence
        if self.in_memory:
            sentence = self.sentences[index]

        # else skip to position in file where sentence begins
        else:
            with open(str(self.path_to_conll_file), encoding="utf-8") as file:
                file.seek(self.indices[index])
                sentence = self._read_next_sentence(file)

        return sentence

    def _read_next_sentence(self, file):
        """
        Reads lines from the current file position until one sentence is complete.

        :param file: open text file positioned at (or before) the start of a sentence
        :return: the parsed Sentence; it has no tokens if the end of the file was reached first
        """
        line = file.readline()
        sentence: Sentence = Sentence()

        # current token ID
        token_idx = 0

        # handling for the awful UD multiword format
        current_multiword_text = ''
        current_multiword_sequence = ''
        current_multiword_first_token = 0
        current_multiword_last_token = 0

        while line:
            line = line.strip()
            fields: List[str] = re.split("\t+", line)

            # end of sentence (blank lines before the first token are skipped)
            if line == "":
                if len(sentence) > 0:
                    break

            # comments
            elif line.startswith("#"):
                line = file.readline()
                continue

            # ellipsis
            elif "." in fields[0]:
                line = file.readline()
                continue

            # if token is a multi-word
            elif "-" in fields[0]:
                line = file.readline()

                id_range = fields[0].split('-')
                current_multiword_first_token = int(id_range[0])
                current_multiword_last_token = int(id_range[1])
                current_multiword_text = fields[1]
                current_multiword_sequence = ''

                if self.split_multiwords:
                    continue
                else:
                    token = Token(fields[1])
                    token.add_label("lemma", str(fields[2]))
                    if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                        token.whitespace_after = False
                    sentence.add_token(token)
                    token_idx += 1
                    # no continue here: the readline at the bottom of the loop runs as well and
                    # thereby drops the first component line; the remaining component lines are
                    # dropped by the token_idx check in the branch below

            # normal single-word tokens
            else:

                # if we don't split multiwords, skip over component words
                if not self.split_multiwords and token_idx < current_multiword_last_token:
                    token_idx += 1
                    line = file.readline()
                    continue

                # add token
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", str(fields[2]))
                token.add_label("upos", str(fields[3]))
                token.add_label("pos", str(fields[4]))
                token.add_label("dependency", str(fields[7]))

                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False

                # add morphological tags (only "key=value" entries carry a label)
                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    morph_parts = morph.split("=")
                    token.add_label(morph_parts[0].lower(), morph_parts[1])

                # the frame label lives at index 11, so the line needs at least 12 fields;
                # the previous "> 10" check raised IndexError on lines with exactly 11 fields
                if len(fields) > 11 and str(fields[10]) == "Y":
                    token.add_label("frame", str(fields[11]))

                token_idx += 1

                # derive whitespace logic for multiwords
                if token_idx <= current_multiword_last_token:
                    current_multiword_sequence += token.text

                # if multi-word equals component tokens, there should be no whitespace
                if token_idx == current_multiword_last_token and current_multiword_sequence == current_multiword_text:
                    # go through all tokens in subword and set whitespace_after information;
                    # the current (last) component is not in the sentence yet, so only the
                    # preceding components are touched
                    for i in range(current_multiword_last_token - current_multiword_first_token):
                        sentence[-(i + 1)].whitespace_after = False

                sentence.add_token(token)

            line = file.readline()
        return sentence

226 

227 

class UD_ENGLISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_English-EWT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master"
        for file_name in ("en_ewt-ud-dev.conllu", "en_ewt-ud-test.conllu", "en_ewt-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ENGLISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

253 

254 

class UD_GALICIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Galician-TreeGal treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the test and train CoNLL-U files (no dev file is fetched) and initialises the corpus.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Galician-TreeGal/master"
        for file_name in ("gl_treegal-ud-test.conllu", "gl_treegal-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_GALICIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

279 

280 

class UD_ANCIENT_GREEK(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Ancient_Greek-PROIEL treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master"
        for file_name in ("grc_proiel-ud-dev.conllu", "grc_proiel-ud-test.conllu", "grc_proiel-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ANCIENT_GREEK, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

306 

307 

class UD_KAZAKH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Kazakh-KTB treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the test and train CoNLL-U files (no dev file is fetched) and initialises the corpus.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/master"
        for file_name in ("kk_ktb-ud-test.conllu", "kk_ktb-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_KAZAKH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

332 

333 

class UD_OLD_CHURCH_SLAVONIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Old_Church_Slavonic-PROIEL treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/master"
        for file_name in ("cu_proiel-ud-dev.conllu", "cu_proiel-ud-test.conllu", "cu_proiel-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_OLD_CHURCH_SLAVONIC, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

359 

360 

class UD_ARMENIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Armenian-ArmTDP treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # web_path carries no trailing slash: the file name is appended with "/" below, and the
        # previous ".../master/" value produced URLs containing "master//"
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Armenian-ArmTDP/master"
        for file_name in ("hy_armtdp-ud-dev.conllu", "hy_armtdp-ud-test.conllu", "hy_armtdp-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ARMENIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

386 

class UD_ESTONIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Estonian-EDT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/master"
        for file_name in ("et_edt-ud-dev.conllu", "et_edt-ud-test.conllu", "et_edt-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ESTONIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

412 

413 

class UD_GERMAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_German-GSD treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/master"
        for file_name in ("de_gsd-ud-dev.conllu", "de_gsd-ud-test.conllu", "de_gsd-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_GERMAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

437 

438 

class UD_GERMAN_HDT(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_German-HDT treebank files (dev branch)."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False, split_multiwords: bool = True):
        """
        Fetches the dev and test files plus four train parts, merges the parts into one train file
        and initialises the corpus.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus (defaults to False for this corpus)
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = (
            "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev"
        )
        cached_path(f"{ud_path}/de_hdt-ud-dev.conllu", Path("datasets") / dataset_name)
        cached_path(f"{ud_path}/de_hdt-ud-test.conllu", Path("datasets") / dataset_name)

        train_filenames = [
            "de_hdt-ud-train-a-1.conllu",
            "de_hdt-ud-train-a-2.conllu",
            "de_hdt-ud-train-b-1.conllu",
            "de_hdt-ud-train-b-2.conllu",
        ]

        # the train parts go into an "original" subfolder, separate from the merged file
        for train_file in train_filenames:
            cached_path(
                f"{ud_path}/{train_file}", Path("datasets") / dataset_name / "original"
            )

        # NOTE(review): the merged file is always written below flair.cache_root, while the corpus is
        # read from data_folder - confirm both agree when a custom base_path is given
        data_path = Path(flair.cache_root) / "datasets" / dataset_name

        new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu"

        # concatenate the parts only once; an existing merged file is reused as is
        if not new_train_file.is_file():
            # explicit UTF-8 on both sides: the dataset reader opens the files as UTF-8, and the
            # platform default encoding could otherwise fail on or corrupt non-ASCII text
            with open(new_train_file, "wt", encoding="utf-8") as f_out:
                for train_filename in train_filenames:
                    with open(data_path / "original" / train_filename, "rt", encoding="utf-8") as f_in:
                        f_out.write(f_in.read())

        super(UD_GERMAN_HDT, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

483 

484 

class UD_DUTCH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Dutch-Alpino treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Dutch-Alpino/master"
        for file_name in ("nl_alpino-ud-dev.conllu", "nl_alpino-ud-test.conllu", "nl_alpino-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_DUTCH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

512 

class UD_FAROESE(UniversalDependenciesCorpus):
    """ This treebank includes the Faroese treebank dataset from the following link:
    https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master

    Faroese is a small Western Scandinavian language with 60.000-100.000 speakers, related to Icelandic and Old Norse"""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Faroese-FarPaHC/master"
        for file_name in ("fo_farpahc-ud-dev.conllu", "fo_farpahc-ud-test.conllu", "fo_farpahc-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_FAROESE, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

544 

545 

class UD_FRENCH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_French-GSD treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master"
        for file_name in ("fr_gsd-ud-dev.conllu", "fr_gsd-ud-test.conllu", "fr_gsd-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_FRENCH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

568 

569 

class UD_ITALIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Italian-ISDT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Italian-ISDT/master"
        for file_name in ("it_isdt-ud-dev.conllu", "it_isdt-ud-test.conllu", "it_isdt-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ITALIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

594 

595 

class UD_LATIN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Latin-LLCT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # web_path carries no trailing slash: the file name is appended with "/" below, and the
        # previous ".../master/" value produced URLs containing "master//"
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-LLCT/master"
        for file_name in ("la_llct-ud-dev.conllu", "la_llct-ud-test.conllu", "la_llct-ud-train.conllu"):
            cached_path(f"{web_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_LATIN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

621 

622 

623 

class UD_SPANISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Spanish-GSD treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master"
        for file_name in ("es_gsd-ud-dev.conllu", "es_gsd-ud-test.conllu", "es_gsd-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_SPANISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

646 

647 

class UD_PORTUGUESE(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Portuguese-Bosque treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Portuguese-Bosque/master"
        for file_name in ("pt_bosque-ud-dev.conllu", "pt_bosque-ud-test.conllu", "pt_bosque-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_PORTUGUESE, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

674 

675 

class UD_ROMANIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Romanian-RRT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Romanian-RRT/master"
        for file_name in ("ro_rrt-ud-dev.conllu", "ro_rrt-ud-test.conllu", "ro_rrt-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_ROMANIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

698 

699 

class UD_CATALAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Catalan-AnCora treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Catalan-AnCora/master"
        for file_name in ("ca_ancora-ud-dev.conllu", "ca_ancora-ud-test.conllu", "ca_ancora-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_CATALAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

726 

727 

class UD_POLISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Polish-LFG treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Polish-LFG/master"
        for file_name in ("pl_lfg-ud-dev.conllu", "pl_lfg-ud-test.conllu", "pl_lfg-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_POLISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

751 

752 

class UD_CZECH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Czech-PDT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False, split_multiwords: bool = True):
        """
        Fetches the dev and test files plus four train parts, merges the parts into one train file
        and initialises the corpus.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus (defaults to False for this corpus)
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Czech-PDT/master"
        cached_path(f"{ud_path}/cs_pdt-ud-dev.conllu", Path("datasets") / dataset_name)
        cached_path(f"{ud_path}/cs_pdt-ud-test.conllu", Path("datasets") / dataset_name)

        train_filenames = [
            "cs_pdt-ud-train-c.conllu",
            "cs_pdt-ud-train-l.conllu",
            "cs_pdt-ud-train-m.conllu",
            "cs_pdt-ud-train-v.conllu",
        ]

        # the train parts go into an "original" subfolder, separate from the merged file
        for train_filename in train_filenames:
            cached_path(
                f"{ud_path}/{train_filename}",
                Path("datasets") / dataset_name / "original",
            )

        # NOTE(review): the merged file is always written below flair.cache_root, while the corpus is
        # read from data_folder - confirm both agree when a custom base_path is given
        data_path = Path(flair.cache_root) / "datasets" / dataset_name

        new_train_file: Path = data_path / "cs_pdt-ud-train-all.conllu"

        # concatenate the parts only once; an existing merged file is reused as is
        if not new_train_file.is_file():
            # explicit UTF-8 on both sides: the dataset reader opens the files as UTF-8, and the
            # platform default encoding could otherwise fail on or corrupt non-ASCII text
            with open(new_train_file, "wt", encoding="utf-8") as f_out:
                for train_filename in train_filenames:
                    with open(data_path / "original" / train_filename, "rt", encoding="utf-8") as f_in:
                        f_out.write(f_in.read())

        super(UD_CZECH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

804 

805 

class UD_SLOVAK(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Slovak-SNK treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovak-SNK/master"
        for file_name in ("sk_snk-ud-dev.conllu", "sk_snk-ud-test.conllu", "sk_snk-ud-train.conllu"):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_SLOVAK, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

829 

830 

class UD_SWEDISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus backed by the UD_Swedish-Talbanken treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files and initialises the corpus from them.

        :param base_path: folder containing the dataset folder; if not given, "datasets" under flair.cache_root
        :param in_memory: passed through to UniversalDependenciesCorpus
        :param split_multiwords: passed through to UniversalDependenciesCorpus
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): files are cached under the relative folder datasets/<dataset_name>, while the
        # corpus is read from data_folder - confirm both agree when a custom base_path is given
        ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/master"
        for file_name in (
            "sv_talbanken-ud-dev.conllu",
            "sv_talbanken-ud-test.conllu",
            "sv_talbanken-ud-train.conllu",
        ):
            cached_path(f"{ud_path}/{file_name}", Path("datasets") / dataset_name)

        super(UD_SWEDISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

858 

859 

class UD_DANISH(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Danish-DDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Danish-DDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/da_ddt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

883 

884 

class UD_NORWEGIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Norwegian-Bokmaal repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Norwegian-Bokmaal/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/no_bokmaal-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

912 

913 

class UD_FINNISH(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Finnish-TDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Finnish-TDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/fi_tdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

937 

938 

class UD_SLOVENIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Slovenian-SSJ repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Slovenian-SSJ/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/sl_ssj-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

962 

963 

class UD_CROATIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Croatian-SET repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Croatian-SET/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/hr_set-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

987 

988 

class UD_SERBIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Serbian-SET repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/sr_set-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1012 

1013 

class UD_BULGARIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Bulgarian-BTB repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/bg_btb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1037 

1038 

class UD_ARABIC(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Arabic-PADT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Arabic-PADT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ar_padt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1063 

1064 

class UD_HEBREW(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Hebrew-HTB repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hebrew-HTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/he_htb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1087 

1088 

class UD_TURKISH(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Turkish-IMST repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-IMST/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/tr_imst-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1114 

1115 

class UD_PERSIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Persian-Seraji repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-Seraji/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/fa_seraji-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1143 

1144 

class UD_RUSSIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Russian-SynTagRus repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ru_syntagrus-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1172 

1173 

class UD_HINDI(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Hindi-HDTB repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/hi_hdtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1199 

1200 

class UD_INDONESIAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Indonesian-GSD repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/id_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1224 

1225 

class UD_JAPANESE(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Japanese-GSD repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Japanese-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ja_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1249 

1250 

class UD_CHINESE(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Chinese-GSD repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/zh_gsd-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1274 

1275 

class UD_KOREAN(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Korean-Kaist repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Korean-Kaist/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ko_kaist-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1303 

1304 

class UD_BASQUE(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Basque-BDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Basque-BDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/eu_bdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1328 

1329 

class UD_CHINESE_KYOTO(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Classical_Chinese-Kyoto repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lzh_kyoto-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1355 

1356 

class UD_GREEK(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Greek-GDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Greek-GDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/el_gdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1382 

1383 

class UD_NAIJA(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Naija-NSC repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Naija-NSC/master"
        for split in ("dev", "test", "train"):
            # joined with a single slash so the URL contains no empty path segment
            cached_path(f"{web_path}/pcm_nsc-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1409 

1410 

class UD_LIVVI(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Livvi-KKPP repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the test and train files through cached_path and builds the corpus from the dataset folder.

        No dev file is requested for this treebank.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Livvi-KKPP/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/olo_kkpp-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1431 

1432 

class UD_BURYAT(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Buryat-BDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the test and train files through cached_path and builds the corpus from the dataset folder.

        No dev file is requested for this treebank.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Buryat-BDT/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/bxr_bdt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1457 

1458 

class UD_NORTH_SAMI(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_North_Sami-Giella repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the test and train files through cached_path and builds the corpus from the dataset folder.

        No dev file is requested for this treebank.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master"
        for split in ("test", "train"):
            cached_path(f"{web_path}/sme_giella-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1483 

1484 

class UD_MARATHI(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Marathi-UFAL repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Marathi-UFAL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/mr_ufal-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1510 

1511 

class UD_MALTESE(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Maltese-MUDT repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/mt_mudt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1537 

1538 

class UD_AFRIKAANS(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Afrikaans-AfriBooms repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Afrikaans-AfriBooms/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/af_afribooms-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1564 

1565 

class UD_GOTHIC(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Gothic-PROIEL repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Gothic-PROIEL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/got_proiel-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1591 

1592 

class UD_OLD_FRENCH(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Old_French-SRCMF repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_French-SRCMF/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/fro_srcmf-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1618 

1619 

class UD_WOLOF(UniversalDependenciesCorpus):
    """Corpus backed by the CoNLL-U files of the UD_Wolof-WTB repository."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Requests the dev, test and train files through cached_path and builds the corpus from the dataset folder.

        :param base_path: parent folder of the dataset folder; defaults to flair.cache_root / "datasets"
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance instead of an exact type comparison, so str subclasses are converted as well
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # the dataset folder carries the lower-cased class name of the instance
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = flair.cache_root / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the folder handed to cached_path does not depend on base_path - confirm this is intended
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Wolof-WTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/wo_wtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1645 

1646 

class UD_BELARUSIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Belarusian-HSE treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Belarusian-HSE and instantiates the corpus

        :param base_path: folder under which the dataset folder (the lowercased class name) is resolved;
            a str is converted to a Path, and if not given, the "datasets" folder under flair.cache_root is used
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; wrapping it in Path keeps the "/" join
        # valid whether flair.cache_root is a str or already a Path
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target below does not depend on base_path - confirm that
        # cached_path stores files where data_folder expects them when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/be_hse-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1672 

1673 

class UD_COPTIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Coptic-Scriptorium treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Coptic-Scriptorium and instantiates the corpus

        :param base_path: folder under which the dataset folder (the lowercased class name) is resolved;
            a str is converted to a Path, and if not given, the "datasets" folder under flair.cache_root is used
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; wrapping it in Path keeps the "/" join
        # valid whether flair.cache_root is a str or already a Path
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target below does not depend on base_path - confirm that
        # cached_path stores files where data_folder expects them when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/cop_scriptorium-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1699 

class UD_IRISH(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Irish-IDT treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Irish-IDT and instantiates the corpus

        :param base_path: folder under which the dataset folder (the lowercased class name) is resolved;
            a str is converted to a Path, and if not given, the "datasets" folder under flair.cache_root is used
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root; wrapping it in Path keeps the "/" join
        # valid whether flair.cache_root is a str or already a Path
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target below does not depend on base_path - confirm that
        # cached_path stores files where data_folder expects them when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Irish-IDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/ga_idt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1725 

class UD_LATVIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Latvian-LVTB treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Latvian-LVTB and instantiates the corpus

        :param base_path: folder under which the dataset folder (the lowercased class name) is resolved;
            a str is converted to a Path, and if not given, the "datasets" folder under flair.cache_root is used
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target below does not depend on base_path - confirm that
        # cached_path stores files where data_folder expects them when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Latvian-LVTB/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lv_lvtb-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1751 

class UD_LITHUANIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Lithuanian-ALKSNIS treebank files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """
        Fetches the dev, test and train CoNLL-U files of UD_Lithuanian-ALKSNIS and instantiates the corpus

        :param base_path: folder under which the dataset folder (the lowercased class name) is resolved;
            a str is converted to a Path, and if not given, the "datasets" folder under flair.cache_root is used
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens
        """
        # isinstance (instead of an exact type comparison) also accepts str subclasses
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the download target below does not depend on base_path - confirm that
        # cached_path stores files where data_folder expects them when a custom base_path is given
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Lithuanian-ALKSNIS/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/lt_alksnis-ud-{split}.conllu", Path("datasets") / dataset_name)

        super().__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

1777 

1778