Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 1f17c0a

Browse files
authored
migrate IWSLT2017 to datapipes. (#1547)
* migrate IWSLT2017 to datapipes. * refactor IWSLT2017 to use feedback from IWSLT2016. * remove unused import. * fix flake. * fix typo in comment. * add TODOs to IWSLT datasets. * refactor common code out of IWSLTs and convert single quotes to double. * fix typo.
1 parent 2372682 commit 1f17c0a

File tree

3 files changed

+336
-332
lines changed

3 files changed

+336
-332
lines changed

torchtext/data/datasets_utils.py

Lines changed: 64 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -392,6 +392,70 @@ def __str__(self):
392392
return self.description
393393

394394

395+
def _generate_iwslt_files_for_lang_and_split(year, src_language, tgt_language, valid_set, test_set):
396+
train_filenames = (
397+
"train.{}-{}.{}".format(src_language, tgt_language, src_language),
398+
"train.{}-{}.{}".format(src_language, tgt_language, tgt_language)
399+
)
400+
valid_filenames = (
401+
"IWSLT{}.TED.{}.{}-{}.{}".format(year, valid_set, src_language, tgt_language, src_language),
402+
"IWSLT{}.TED.{}.{}-{}.{}".format(year, valid_set, src_language, tgt_language, tgt_language)
403+
)
404+
test_filenames = (
405+
"IWSLT{}.TED.{}.{}-{}.{}".format(year, test_set, src_language, tgt_language, src_language),
406+
"IWSLT{}.TED.{}.{}-{}.{}".format(year, test_set, src_language, tgt_language, tgt_language)
407+
)
408+
409+
src_train, tgt_train = train_filenames
410+
src_eval, tgt_eval = valid_filenames
411+
src_test, tgt_test = test_filenames
412+
413+
uncleaned_train_filenames = (
414+
"train.tags.{}-{}.{}".format(src_language, tgt_language, src_language),
415+
"train.tags.{}-{}.{}".format(src_language, tgt_language, tgt_language)
416+
)
417+
uncleaned_valid_filenames = (
418+
"IWSLT{}.TED.{}.{}-{}.{}.xml".format(year, valid_set, src_language, tgt_language, src_language),
419+
"IWSLT{}.TED.{}.{}-{}.{}.xml".format(year, valid_set, src_language, tgt_language, tgt_language)
420+
)
421+
uncleaned_test_filenames = (
422+
"IWSLT{}.TED.{}.{}-{}.{}.xml".format(year, test_set, src_language, tgt_language, src_language),
423+
"IWSLT{}.TED.{}.{}-{}.{}.xml".format(year, test_set, src_language, tgt_language, tgt_language)
424+
)
425+
426+
uncleaned_src_train, uncleaned_tgt_train = uncleaned_train_filenames
427+
uncleaned_src_eval, uncleaned_tgt_eval = uncleaned_valid_filenames
428+
uncleaned_src_test, uncleaned_tgt_test = uncleaned_test_filenames
429+
430+
file_path_by_lang_and_split = {
431+
src_language: {
432+
"train": src_train,
433+
"valid": src_eval,
434+
"test": src_test,
435+
},
436+
tgt_language: {
437+
"train": tgt_train,
438+
"valid": tgt_eval,
439+
"test": tgt_test,
440+
}
441+
}
442+
443+
uncleaned_filenames_by_lang_and_split = {
444+
src_language: {
445+
"train": uncleaned_src_train,
446+
"valid": uncleaned_src_eval,
447+
"test": uncleaned_src_test,
448+
},
449+
tgt_language: {
450+
"train": uncleaned_tgt_train,
451+
"valid": uncleaned_tgt_eval,
452+
"test": uncleaned_tgt_test,
453+
}
454+
}
455+
456+
return file_path_by_lang_and_split, uncleaned_filenames_by_lang_and_split
457+
458+
395459
@functional_datapipe("read_squad")
396460
class _ParseSQuADQAData(IterDataPipe):
397461
r"""Iterable DataPipe to parse the contents of a stream of JSON objects

0 commit comments

Comments
 (0)