From c7c5dbe7646a110c675bff871d0c9eb65346b297 Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Sat, 5 Feb 2022 09:05:56 -0500
Subject: [PATCH 01/19] start from outermost tar for consistency and better
 testing.

---
 test/datasets/test_iwslt2016.py | 99 +++++++++++++++++++++++++++------
 1 file changed, 81 insertions(+), 18 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index e352f9ef1b..aceacb4d42 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -1,6 +1,7 @@
 import os
 import random
 import string
+import tarfile
 from collections import defaultdict
 from unittest.mock import patch
 
@@ -12,35 +13,97 @@
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
+def _generate_uncleaned_train():
+    """Generate tags files"""
+    file_contents = []
+    examples = []
+    xml_tags = [
+        '<url', '<keywords', '<talkid', '<description',
+        '<reviewer', '<translator', '<title', '<speaker'
+    ]
+    for i in range(100):
+        rand_string = " ".join(
+            random.choice(string.ascii_letters) for i in range(10)
+        )
+        if random.random() < 0.1:
+            open_tag = random.choice(xml_tags) + ">"
+            close_tag = "</" + open_tag[1:] + ">"
+            file_contents.append(open_tag + rand_string + close_tag)
+        else:
+            examples.append(rand_string + "\n")
+            file_contents.append(rand_string)
+    return examples, "\n".join(file_contents)
+
+
+def _generate_uncleaned_valid():
+    file_contents = ["<root>"]
+    examples = []
+
+    for doc_id in range(5):
+        file_contents.append(f'<doc docid="{doc_id}">')
+        for seg_id in range(100):
+            # Write one of the XML tags randomly to make sure we clean appropriately
+            rand_string = " ".join(
+                random.choice(string.ascii_letters) for i in range(10)
+            )
+            examples.append(rand_string)
+            file_contents.append(f"<seg>{rand_string} </seg>" + "\n")
+        file_contents.append("</doc>")
+    file_contents.append("</root>")
+    return examples, " ".join(file_contents)
+
+
+def _generate_uncleaned_test():
+    return _generate_uncleaned_valid()
+
+
+def _generate_uncleaned_contents(split):
+    return {
+        "train": _generate_uncleaned_train(),
+        "valid": _generate_uncleaned_valid(),
+        "test": _generate_uncleaned_test(),
+    }[split]
+
+
 def _get_mock_dataset(root_dir, split, src, tgt):
     """
     root_dir: directory to the mocked dataset
     """
-    temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/{src}-{tgt}/")
-    os.makedirs(temp_dataset_dir, exist_ok=True)
+    inner_temp_dataset_dir = f"{src}-{tgt}"
+    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
+    os.makedirs(inner_temp_dataset_dir, exist_ok=True)
+    os.makedirs(outer_temp_dataset_dir, exist_ok=True)
 
-    seed = 1
     mocked_data = defaultdict(lambda: defaultdict(list))
     valid_set = "tst2013"
     test_set = "tst2014"
-    files_for_split, _ = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
-    src_file = files_for_split[src][split]
-    tgt_file = files_for_split[tgt][split]
+    _, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
+    src_file = uncleaned_file_names[src][split]
+    tgt_file = uncleaned_file_names[tgt][split]
 
     for file_name in (src_file, tgt_file):
-        txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
+        out_file = os.path.join(inner_temp_dataset_dir, file_name)
+        with open(out_file, "w") as f:
             # Get file extension (i.e., the language) without the . prefix (.en -> en)
             lang = os.path.splitext(file_name)[1][1:]
-            for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
-                dataset_line = f"{rand_string} {rand_string}\n"
-                # append line to correct dataset split
-                mocked_data[split][lang].append(dataset_line)
-                f.write(f'{rand_string} {rand_string}\n')
-                seed += 1
+            mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
+            mocked_data[split][lang] = mocked_data_for_split
+            f.write(file_contents)
+
+    inner_compressed_dataset_path = os.path.join(
+        outer_temp_dataset_dir, f"{src}-{tgt}.tgz"
+    )
+
+    # create tar file from dataset folder
+    with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
+        tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")
+
+    outer_temp_dataset_path = os.path.join(
+        root_dir, "2016-01.tgz"
+    )
+    with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
+        tar.add(outer_temp_dataset_dir, arcname="2016-01")
 
     return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
 
@@ -54,7 +117,7 @@ def setUpClass(cls):
         super().setUpClass()
         cls.root_dir = cls.get_base_temp_dir()
         cls.patcher = patch(
-            "torchdata.datapipes.iter.util.cacheholder.OnDiskCacheHolderIterDataPipe._cache_check_fn", return_value=True
+            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
        )
         cls.patcher.start()
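A note on the archive layout this first patch builds: the IWSLT2016 download is an outer 2016-01.tgz whose payload contains a per-language-pair inner {src}-{tgt}.tgz, so the mock has to nest one tarball inside another. Below is a minimal standalone sketch of the same structure using only the standard library; the de-en pair and the file name are illustrative, not taken from the patch:

    import os
    import tarfile
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        # Raw text files live inside a de-en directory.
        inner_dir = os.path.join(root, "texts/de/en/de-en")
        os.makedirs(inner_dir)
        with open(os.path.join(inner_dir, "train.tags.de-en.de"), "w") as f:
            f.write("ein beispiel\n")

        # Inner archive: de-en.tgz sits next to the directory it was made from.
        inner_tgz = os.path.join(root, "texts/de/en/de-en.tgz")
        with tarfile.open(inner_tgz, "w:gz") as tar:
            tar.add(inner_dir, arcname="de-en")

        # Outer archive: 2016-01.tgz wraps everything under a 2016-01/ prefix.
        outer_tgz = os.path.join(root, "2016-01.tgz")
        with tarfile.open(outer_tgz, "w:gz") as tar:
            tar.add(os.path.join(root, "texts"), arcname="2016-01/texts")

        with tarfile.open(outer_tgz) as tar:
            print(tar.getnames())  # includes '2016-01/texts/de/en/de-en.tgz'

Starting the mock from this outermost tarball, as the subject line says, exercises the full extraction chain instead of only the innermost files.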
From 25b10dc79d1ad3dbdc76b17c49748e3bef4c18bf Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Sat, 5 Feb 2022 09:08:55 -0500
Subject: [PATCH 02/19] fix bug with inner path location and add test splits
 for better coverage.

---
 test/datasets/test_iwslt2016.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index aceacb4d42..1e30936a59 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -70,10 +70,11 @@ def _get_mock_dataset(root_dir, split, src, tgt):
     """
     root_dir: directory to the mocked dataset
     """
-    inner_temp_dataset_dir = f"{src}-{tgt}"
     outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
+    inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
+
+    os.makedirs(outer_temp_dataset_dir, exist_ok=True)
+    os.makedirs(inner_temp_dataset_dir, exist_ok=True)
 
     mocked_data = defaultdict(lambda: defaultdict(list))
     valid_set = "tst2013"
     test_set = "tst2014"
@@ -126,7 +127,11 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()
 
-    @parameterized.expand([("train", "de", "en"), ("valid", "de", "en")])
+    @parameterized.expand([
+        ("train", "de", "en"),
+        ("valid", "de", "en"),
+        ("test", "de", "en"),
+    ])
     def test_iwslt2016(self, split, src, tgt):
         expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
 
@@ -137,7 +142,7 @@ def test_iwslt2016(self, split, src, tgt):
         for sample, expected_sample in zip_equal(samples, expected_samples):
             self.assertEqual(sample, expected_sample)
 
-    @parameterized.expand(["train", "valid"])
+    @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
         dataset1 = IWSLT2016(root=self.root_dir, split=split)
         (dataset2,) = IWSLT2016(root=self.root_dir, split=(split,))
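For readers unfamiliar with the parameterized package used throughout these tests: @parameterized.expand generates one test method per entry, unpacking each tuple into the test's positional arguments. A self-contained sketch of the mechanism; the class and method names are invented for illustration:

    import unittest
    from parameterized import parameterized

    class ExpandDemo(unittest.TestCase):
        # Expands into three separate test methods (named roughly
        # test_pair_0_train, test_pair_1_valid, test_pair_2_test),
        # one per tuple in the list.
        @parameterized.expand([
            ("train", "de", "en"),
            ("valid", "de", "en"),
            ("test", "de", "en"),
        ])
        def test_pair(self, split, src, tgt):
            self.assertIn(split, ("train", "valid", "test"))

    if __name__ == "__main__":
        unittest.main()

Because each entry becomes its own test case, adding the "test" split above triples neither runtime bookkeeping nor assertions by hand; the decorator does the fan-out.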
From 50f9f7e5c6b68731b401e4c5dbfe93f432b6212d Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Sun, 6 Feb 2022 10:25:05 -0500
Subject: [PATCH 03/19] fix comment.

---
 test/datasets/test_iwslt2016.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 1e30936a59..e950fb1c7d 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -22,10 +22,11 @@ def _generate_uncleaned_train():
         '<reviewer', '<translator', '<title', '<speaker'
     ]
     for i in range(100):
         rand_string = " ".join(
             random.choice(string.ascii_letters) for i in range(10)
         )
+        # Write one of the XML tags randomly to make sure we clean appropriately
         if random.random() < 0.1:
             open_tag = random.choice(xml_tags) + ">"
             close_tag = "</" + open_tag[1:] + ">"
@@ -43,7 +44,6 @@ def _generate_uncleaned_valid():
     for doc_id in range(5):
         file_contents.append(f'<doc docid="{doc_id}">')
         for seg_id in range(100):
-            # Write one of the XML tags randomly to make sure we clean appropriately
             rand_string = " ".join(
                 random.choice(string.ascii_letters) for i in range(10)
             )

From dbd99b46918fc5b8bc5c94c5b03ffa9cbb90ee84 Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Mon, 7 Feb 2022 11:30:07 -0500
Subject: [PATCH 04/19] incorporate feedback from review.

---
 test/datasets/test_iwslt2016.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index e950fb1c7d..455d6d41ce 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -70,7 +70,7 @@ def _get_mock_dataset(root_dir, split, src, tgt):
     """
     root_dir: directory to the mocked dataset
     """
-    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
+    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/temp_dataset_dir/2016-01/texts/{src}/{tgt}/")
     inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
 
     os.makedirs(outer_temp_dataset_dir, exist_ok=True)
From d50270955a01f172cda05a6df64de5a051229e5f Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Mon, 7 Feb 2022 18:40:32 -0500
Subject: [PATCH 05/19] revert temp_dataset_dir temporarily but add test
 support for all support langpairs.

---
 test/datasets/test_iwslt2016.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 455d6d41ce..c39458c491 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -6,12 +6,13 @@
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2016 import IWSLT2016
+from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
 
 from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
+SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
 
 def _generate_uncleaned_train():
     """Generate tags files"""
@@ -70,7 +71,7 @@ def _get_mock_dataset(root_dir, split, src, tgt):
     """
     root_dir: directory to the mocked dataset
     """
-    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/temp_dataset_dir/2016-01/texts/{src}/{tgt}/")
+    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
     inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
 
     os.makedirs(outer_temp_dataset_dir, exist_ok=True)
@@ -128,10 +129,10 @@ def tearDownClass(cls):
         super().tearDownClass()
 
     @parameterized.expand([
-        ("train", "de", "en"),
-        ("valid", "de", "en"),
-        ("test", "de", "en"),
-    ])
+        ("train", src, tgt),
+        ("valid", src, tgt),
+        ("test", src, tgt),
+    ] for src, tgt in SUPPORTED_LANGPAIRS)
     def test_iwslt2016(self, split, src, tgt):
         expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)

From 2fb3275a9c7aaa916daece9866ab7ed92d558b3e Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Mon, 7 Feb 2022 18:47:48 -0500
Subject: [PATCH 06/19] fix flake.

---
 test/datasets/test_iwslt2016.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index c39458c491..b0a18c02f0 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -14,6 +14,7 @@
 
 SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
 
+
 def _generate_uncleaned_train():
     """Generate tags files"""
     file_contents = []

From c4df43ea9cf7996dd7878a43aa42bf19df82286f Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Mon, 7 Feb 2022 20:01:47 -0500
Subject: [PATCH 07/19] expand split, src, and tgt appropriately.

---
 test/datasets/test_iwslt2016.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index b0a18c02f0..a6e3964ed1 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -130,10 +130,10 @@ def tearDownClass(cls):
         super().tearDownClass()
 
     @parameterized.expand([
-        ("train", src, tgt),
-        ("valid", src, tgt),
-        ("test", src, tgt),
-    ] for src, tgt in SUPPORTED_LANGPAIRS)
+        (split, src, tgt)
+        for split in ("train", "valid", "test")
+        for src, tgt in SUPPORTED_LANGPAIRS
+    ])
     def test_iwslt2016(self, split, src, tgt):
         expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
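Patches 05-07 converge on a single generator expression that crosses every split with every supported language pair. Nested comprehensions of this shape are easy to misread, so here is a toy run of the same flattening and cross product; the dictionary below is a made-up stand-in for SUPPORTED_DATASETS["language_pair"]:

    # Stand-in: maps each source language to its possible target languages.
    language_pair = {"de": ["en", "ar"], "en": ["de", "fr"]}

    # Same shape as SUPPORTED_LANGPAIRS: flatten the dict-of-lists into (src, tgt) tuples.
    langpairs = [(k, e) for k, v in language_pair.items() for e in v]
    # -> [('de', 'en'), ('de', 'ar'), ('en', 'de'), ('en', 'fr')]

    # Same shape as the @parameterized.expand argument after PATCH 07.
    cases = [
        (split, src, tgt)
        for split in ("train", "valid", "test")
        for src, tgt in langpairs
    ]
    print(len(cases))  # 3 splits x 4 pairs = 12 generated test cases

Note the subtle bug PATCH 07 fixes: in PATCH 05 the list literal of three tuples was itself wrapped in a generator over SUPPORTED_LANGPAIRS, yielding lists of tuples rather than flat (split, src, tgt) entries; the rewritten single comprehension produces the flat cross product.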
From 9ae1fd36b8598c6e078f7f60857b0836cce5f222 Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Tue, 8 Feb 2022 07:51:29 -0500
Subject: [PATCH 08/19] pass langpair to constructor so appropriate files are
 searched.

---
 test/datasets/test_iwslt2016.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index a6e3964ed1..31aaa42189 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -137,7 +137,7 @@ def tearDownClass(cls):
     def test_iwslt2016(self, split, src, tgt):
         expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
 
-        dataset = IWSLT2016(root=self.root_dir, split=split)
+        dataset = IWSLT2016(root=self.root_dir, split=split, language_pair=(src, tgt))
 
         samples = list(dataset)
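The language_pair argument matters here because IWSLT2016 otherwise falls back to its default pair, so the mocked files written for any other pair would never be looked up. A usage sketch of the constructor as these patches call it; the root path is illustrative:

    from torchtext.datasets import IWSLT2016

    # Yields (src_sentence, tgt_sentence) pairs; the language pair selects
    # which train.tags.{src}-{tgt}.* files inside the archive are read.
    train_iter = IWSLT2016(root=".data", split="train", language_pair=("de", "en"))
    src_sentence, tgt_sentence = next(iter(train_iter))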
From 6156ffe8f88673f31bfdfc7f6a43378f9574a85f Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Tue, 8 Feb 2022 08:14:58 -0500
Subject: [PATCH 09/19] parameterize dev and test sets.

---
 test/datasets/test_iwslt2016.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 31aaa42189..bd4fbc0e19 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -2,18 +2,20 @@
 import random
 import string
 import tarfile
+import itertools
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS
+from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
 
 from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
 SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
+SUPPORTED_DEVTEST_SPLITS = SUPPORTED_DATASETS["valid_test"]
+DEV_TEST_SPLITS = [(dev, test) for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2) if dev != test]
 
 
 def _generate_uncleaned_train():
@@ -68,7 +70,7 @@ def _generate_uncleaned_contents(split):
     }[split]
 
 
-def _get_mock_dataset(root_dir, split, src, tgt):
+def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     """
     root_dir: directory to the mocked dataset
     """
@@ -79,12 +81,11 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     os.makedirs(inner_temp_dataset_dir, exist_ok=True)
 
     mocked_data = defaultdict(lambda: defaultdict(list))
-    valid_set = "tst2013"
-    test_set = "tst2014"
 
     _, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
     src_file = uncleaned_file_names[src][split]
     tgt_file = uncleaned_file_names[tgt][split]
+
     for file_name in (src_file, tgt_file):
         out_file = os.path.join(inner_temp_dataset_dir, file_name)
         with open(out_file, "w") as f:
@@ -131,14 +132,18 @@ def tearDownClass(cls):
         super().tearDownClass()
 
     @parameterized.expand([
-        (split, src, tgt)
+        (split, src, tgt, dev_set, test_set)
         for split in ("train", "valid", "test")
+        for dev_set, test_set in DEV_TEST_SPLITS
         for src, tgt in SUPPORTED_LANGPAIRS
+        if (dev_set not in SET_NOT_EXISTS[(src, tgt)] and test_set not in SET_NOT_EXISTS[(src, tgt)])
     ])
-    def test_iwslt2016(self, split, src, tgt):
-        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
+    def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
+        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt, dev_set, test_set)
 
-        dataset = IWSLT2016(root=self.root_dir, split=split, language_pair=(src, tgt))
+        dataset = IWSLT2016(
+            root=self.root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
+        )
 
         samples = list(dataset)

From d242b4fc870fa76c699dca17ef7881d56361cfa4 Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Tue, 8 Feb 2022 08:21:44 -0500
Subject: [PATCH 10/19] fix flake.

---
 test/datasets/test_iwslt2016.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index bd4fbc0e19..68484e2011 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -17,6 +17,7 @@
 SUPPORTED_DEVTEST_SPLITS = SUPPORTED_DATASETS["valid_test"]
 DEV_TEST_SPLITS = [(dev, test) for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2) if dev != test]
 
+
 def _generate_uncleaned_train():
     """Generate tags files"""
     file_contents = []
@@ -138,6 +139,7 @@ def tearDownClass(cls):
         if (dev_set not in SET_NOT_EXISTS[(src, tgt)] and test_set not in SET_NOT_EXISTS[(src, tgt)])
     ])
     def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
+
         expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt, dev_set, test_set)
 
         dataset = IWSLT2016(
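The DEV_TEST_SPLITS product in PATCH 09 pairs every dev set with every different test set, and the SET_NOT_EXISTS filter then prunes combinations a given language pair does not ship. A toy run of the same filtering logic; the set names and the exclusion table below are examples, not the real SUPPORTED_DATASETS contents:

    import itertools

    supported_devtest = ["dev2010", "tst2010", "tst2011"]
    set_not_exists = {("de", "en"): ["tst2011"]}

    # Ordered pairs with dev != test: 3 * 3 - 3 = 6 combinations.
    dev_test_splits = [
        (dev, test)
        for dev, test in itertools.product(supported_devtest, repeat=2)
        if dev != test
    ]

    # Drop combinations whose dev or test set does not exist for this pair.
    src, tgt = "de", "en"
    usable = [
        (dev, test)
        for dev, test in dev_test_splits
        if dev not in set_not_exists[(src, tgt)] and test not in set_not_exists[(src, tgt)]
    ]
    print(usable)  # [('dev2010', 'tst2010'), ('tst2010', 'dev2010')]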
From 264a2980ff52d44c4716cad07b8e6f417a204143 Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Tue, 8 Feb 2022 09:38:32 -0500
Subject: [PATCH 11/19] refactor logic so we read previously-cleaned files if
 they exist so expectations match.

---
 test/datasets/test_iwslt2016.py | 42 ++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 68484e2011..6dabaeb508 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -83,18 +83,32 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
 
     mocked_data = defaultdict(lambda: defaultdict(list))
 
-    _, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
-    src_file = uncleaned_file_names[src][split]
-    tgt_file = uncleaned_file_names[tgt][split]
-
-    for file_name in (src_file, tgt_file):
-        out_file = os.path.join(inner_temp_dataset_dir, file_name)
-        with open(out_file, "w") as f:
-            # Get file extension (i.e., the language) without the . prefix (.en -> en)
-            lang = os.path.splitext(file_name)[1][1:]
-            mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
-            mocked_data[split][lang] = mocked_data_for_split
-            f.write(file_contents)
+    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
+    uncleaned_src_file = uncleaned_file_names[src][split]
+    uncleaned_tgt_file = uncleaned_file_names[tgt][split]
+
+    cleaned_src_file = cleaned_file_names[src][split]
+    cleaned_tgt_file = cleaned_file_names[tgt][split]
+
+    for (unclean_file_name, clean_file_name) in [
+        (uncleaned_src_file, cleaned_src_file),
+        (uncleaned_tgt_file, cleaned_tgt_file)
+    ]:
+        # Get file extension (i.e., the language) without the . prefix (.en -> en)
+        lang = os.path.splitext(unclean_file_name)[1][1:]
+        expected_clean_filename = os.path.join(inner_temp_dataset_dir, clean_file_name)
+
+        # If we've already written a clean file, read it, so we don't generate
+        # new random strings. Otherwise generate new files and clean when read.
+        if os.path.exists(expected_clean_filename):
+            with open(expected_clean_filename, encoding="utf-8") as f:
+                mocked_data[(split, valid_set, test_set)][lang] = f.readlines()
+        else:
+            out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
+            with open(out_file, "w") as f:
+                mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
+                mocked_data[(split, valid_set, test_set)][lang] = mocked_data_for_split
+                f.write(file_contents)
 
     inner_compressed_dataset_path = os.path.join(
         outer_temp_dataset_dir, f"{src}-{tgt}.tgz"
@@ -105,12 +119,12 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
         tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")
 
     outer_temp_dataset_path = os.path.join(
-        root_dir, "2016-01.tgz"
+        root_dir, "IWSLT2016", "2016-01.tgz"
     )
     with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
         tar.add(outer_temp_dataset_dir, arcname="2016-01")
 
-    return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
+    return list(zip(mocked_data[(split, valid_set, test_set)][src], mocked_data[(split, valid_set, test_set)][tgt]))
From 91e2cf2c72e067c14362098013f68f3ffa76f20d Mon Sep 17 00:00:00 2001
From: Elijah Rippeth
Date: Tue, 8 Feb 2022 09:41:19 -0500
Subject: [PATCH 12/19] revert experiment which uses triple as a key since it
 is unnecessary.

---
 test/datasets/test_iwslt2016.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 6dabaeb508..c73172ef46 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -102,12 +102,12 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
         # new random strings. Otherwise generate new files and clean when read.
         if os.path.exists(expected_clean_filename):
             with open(expected_clean_filename, encoding="utf-8") as f:
-                mocked_data[(split, valid_set, test_set)][lang] = f.readlines()
+                mocked_data[split][lang] = f.readlines()
         else:
             out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
             with open(out_file, "w") as f:
                 mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
-                mocked_data[(split, valid_set, test_set)][lang] = mocked_data_for_split
+                mocked_data[split][lang] = mocked_data_for_split
                 f.write(file_contents)
 
     inner_compressed_dataset_path = os.path.join(
@@ -124,7 +124,7 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
         tar.add(outer_temp_dataset_dir, arcname="2016-01")
 
-    return list(zip(mocked_data[(split, valid_set, test_set)][src], mocked_data[(split, valid_set, test_set)][tgt]))
+    return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
From 4446af4c1190a331af1c5b944add021d1ae759d5 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 00:49:57 -0500
Subject: [PATCH 13/19] fix testing issues

---
 test/datasets/test_iwslt2016.py | 42 ++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index c73172ef46..4d947027c9 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -1,5 +1,6 @@
 import os
 import random
+import shutil
 import string
 import tarfile
 import itertools
@@ -7,7 +8,7 @@
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
+from torchtext.datasets.iwslt2016 import DATASET_NAME, IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
 
 from ..common.case_utils import TempDirMixin, zip_equal
@@ -75,7 +76,9 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     """
     root_dir: directory to the mocked dataset
     """
-    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
+
+    base_dir = os.path.join(root_dir, DATASET_NAME)
+    outer_temp_dataset_dir = os.path.join(base_dir, f"2016-01/texts/{src}/{tgt}/")
     inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
 
     os.makedirs(outer_temp_dataset_dir, exist_ok=True)
@@ -96,19 +99,12 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     ]:
         # Get file extension (i.e., the language) without the . prefix (.en -> en)
         lang = os.path.splitext(unclean_file_name)[1][1:]
-        expected_clean_filename = os.path.join(inner_temp_dataset_dir, clean_file_name)
 
-        # If we've already written a clean file, read it, so we don't generate
-        # new random strings. Otherwise generate new files and clean when read.
-        if os.path.exists(expected_clean_filename):
-            with open(expected_clean_filename, encoding="utf-8") as f:
-                mocked_data[split][lang] = f.readlines()
-        else:
-            out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
-            with open(out_file, "w") as f:
-                mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
-                mocked_data[split][lang] = mocked_data_for_split
-                f.write(file_contents)
+        out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
+        with open(out_file, "w") as f:
+            mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
+            mocked_data[split][lang] = mocked_data_for_split
+            f.write(file_contents)
 
     inner_compressed_dataset_path = os.path.join(
         outer_temp_dataset_dir, f"{src}-{tgt}.tgz"
@@ -118,12 +114,13 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
         tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")
 
-    outer_temp_dataset_path = os.path.join(
-        root_dir, "IWSLT2016", "2016-01.tgz"
-    )
+    shutil.rmtree(inner_temp_dataset_dir)
+    outer_temp_dataset_path = os.path.join(base_dir, "2016-01.tgz")
+
     with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
         tar.add(outer_temp_dataset_dir, arcname="2016-01")
 
+    shutil.rmtree(outer_temp_dataset_dir)
     return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
 
 
@@ -134,7 +131,6 @@ class TestIWSLT2016(TempDirMixin, TorchtextTestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.root_dir = cls.get_base_temp_dir()
         cls.patcher = patch(
             "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
         )
         cls.patcher.start()
@@ -154,10 +150,11 @@ def tearDownClass(cls):
     ])
     def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
-        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt, dev_set, test_set)
+        root_dir = self.get_base_temp_dir()
+        expected_samples = _get_mock_dataset(root_dir, split, src, tgt, dev_set, test_set)
 
         dataset = IWSLT2016(
-            root=self.root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
+            root=root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
         )
 
         samples = list(dataset)
@@ -167,8 +164,9 @@ def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
     @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
-        dataset1 = IWSLT2016(root=self.root_dir, split=split)
-        (dataset2,) = IWSLT2016(root=self.root_dir, split=(split,))
+        root_dir = self.get_base_temp_dir()
+        dataset1 = IWSLT2016(root=root_dir, split=split)
+        (dataset2,) = IWSLT2016(root=root_dir, split=(split,))
 
         for d1, d2 in zip_equal(dataset1, dataset2):
             self.assertEqual(d1, d2)
From 386b40bc43b58110e3a350742ee8904987b2e05e Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 13:25:01 -0500
Subject: [PATCH 14/19] fix issues with temporary directory

---
 test/datasets/test_iwslt2016.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 4d947027c9..1f399112e0 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -4,6 +4,7 @@
 import string
 import tarfile
 import itertools
+import tempfile
 from collections import defaultdict
 from unittest.mock import patch
 
@@ -78,7 +79,8 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     """
 
     base_dir = os.path.join(root_dir, DATASET_NAME)
-    outer_temp_dataset_dir = os.path.join(base_dir, f"2016-01/texts/{src}/{tgt}/")
+    temp_dataset_dir = os.path.join(base_dir, 'temp_dataset_dir')
+    outer_temp_dataset_dir = os.path.join(temp_dataset_dir, f"texts/{src}/{tgt}/")
     inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
 
     os.makedirs(outer_temp_dataset_dir, exist_ok=True)
@@ -118,9 +120,9 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     outer_temp_dataset_path = os.path.join(base_dir, "2016-01.tgz")
 
     with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
-        tar.add(outer_temp_dataset_dir, arcname="2016-01")
+        tar.add(temp_dataset_dir, arcname="2016-01")
 
-    shutil.rmtree(outer_temp_dataset_dir)
+    shutil.rmtree(temp_dataset_dir)
     return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
 
 
@@ -150,7 +152,7 @@ def tearDownClass(cls):
     def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
-        root_dir = self.get_base_temp_dir()
+        root_dir = tempfile.TemporaryDirectory().name
         expected_samples = _get_mock_dataset(root_dir, split, src, tgt, dev_set, test_set)
 
         dataset = IWSLT2016(
@@ -165,8 +167,12 @@ def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
     @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
         root_dir = self.get_base_temp_dir()
-        dataset1 = IWSLT2016(root=root_dir, split=split)
-        (dataset2,) = IWSLT2016(root=root_dir, split=(split,))
+        language_pair = ("de", "en")
+        valid_set = "tst2013"
+        test_set = "tst2014"
+        _ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
+        dataset1 = IWSLT2016(root=root_dir, split=split, language_pair=language_pair, valid_set=valid_set, test_set=test_set)
+        (dataset2,) = IWSLT2016(root=root_dir, split=(split,), language_pair=language_pair, valid_set=valid_set, test_set=test_set)
 
         for d1, d2 in zip_equal(dataset1, dataset2):
             self.assertEqual(d1, d2)

From 1ea385e8a50176e4134eb2fccf3acbedb23f7187 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 14:24:07 -0500
Subject: [PATCH 15/19] create unique base directories for
 test_iwslt2016_split_argument

---
 test/datasets/test_iwslt2016.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 1f399112e0..c80c91b9b7 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -166,7 +166,7 @@ def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
     @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
-        root_dir = self.get_base_temp_dir()
+        root_dir = tempfile.TemporaryDirectory().name
         language_pair = ("de", "en")
         valid_set = "tst2013"
         test_set = "tst2014"
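A caveat about the tempfile.TemporaryDirectory().name idiom that PATCH 14 and PATCH 15 introduce: no reference to the TemporaryDirectory object is kept, so on CPython its finalizer usually runs immediately and deletes the directory. The tests then depend on _get_mock_dataset re-creating the path via os.makedirs, and nothing cleans up what gets written afterwards. The next patch moves to the context-manager form, which is the robust pattern:

    import os
    import tempfile

    # Fragile: the unreferenced TemporaryDirectory is typically finalized
    # right away, so the directory is usually already deleted here.
    path = tempfile.TemporaryDirectory().name
    print(os.path.exists(path))  # usually False on CPython

    # Robust: the directory exists for the whole block and is removed on exit.
    with tempfile.TemporaryDirectory() as root_dir:
        print(os.path.exists(root_dir))  # True
    print(os.path.exists(root_dir))  # False: deterministic cleanup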
From 05a042ad9b2ecac6c386963453beb2abd014dfb6 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 17:00:54 -0500
Subject: [PATCH 16/19] fix comments

---
 test/datasets/test_iwslt2016.py | 41 +++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index c80c91b9b7..802d043708 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -116,17 +116,18 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
         tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")
 
+    # this is necessary so that the outer tarball only includes the inner tarball
     shutil.rmtree(inner_temp_dataset_dir)
+
     outer_temp_dataset_path = os.path.join(base_dir, "2016-01.tgz")
 
     with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
         tar.add(temp_dataset_dir, arcname="2016-01")
     shutil.rmtree(temp_dataset_dir)
 
     return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
 
 
-class TestIWSLT2016(TempDirMixin, TorchtextTestCase):
+class TestIWSLT2016(TorchtextTestCase):
     root_dir = None
     patcher = None
@@ -152,27 +153,27 @@ def tearDownClass(cls):
     ])
     def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
-        root_dir = tempfile.TemporaryDirectory().name
-        expected_samples = _get_mock_dataset(root_dir, split, src, tgt, dev_set, test_set)
+        with tempfile.TemporaryDirectory() as root_dir:
+            expected_samples = _get_mock_dataset(root_dir, split, src, tgt, dev_set, test_set)
 
-        dataset = IWSLT2016(
-            root=root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
-        )
+            dataset = IWSLT2016(
+                root=root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
+            )
 
-        samples = list(dataset)
+            samples = list(dataset)
 
-        for sample, expected_sample in zip_equal(samples, expected_samples):
-            self.assertEqual(sample, expected_sample)
+            for sample, expected_sample in zip_equal(samples, expected_samples):
+                self.assertEqual(sample, expected_sample)
 
     @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
-        root_dir = tempfile.TemporaryDirectory().name
-        language_pair = ("de", "en")
-        valid_set = "tst2013"
-        test_set = "tst2014"
-        _ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
-        dataset1 = IWSLT2016(root=root_dir, split=split, language_pair=language_pair, valid_set=valid_set, test_set=test_set)
-        (dataset2,) = IWSLT2016(root=root_dir, split=(split,), language_pair=language_pair, valid_set=valid_set, test_set=test_set)
-
-        for d1, d2 in zip_equal(dataset1, dataset2):
-            self.assertEqual(d1, d2)
+        with tempfile.TemporaryDirectory() as root_dir:
+            language_pair = ("de", "en")
+            valid_set = "tst2013"
+            test_set = "tst2014"
+            _ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
+            dataset1 = IWSLT2016(root=root_dir, split=split, language_pair=language_pair, valid_set=valid_set, test_set=test_set)
+            (dataset2,) = IWSLT2016(root=root_dir, split=(split,), language_pair=language_pair, valid_set=valid_set, test_set=test_set)
+
+            for d1, d2 in zip_equal(dataset1, dataset2):
+                self.assertEqual(d1, d2)

From caac50889c5221386bf87551db2afe0261d9901b Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 17:31:02 -0500
Subject: [PATCH 17/19] fix flake

---
 test/datasets/test_iwslt2016.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/datasets/test_iwslt2016.py b/test/datasets/test_iwslt2016.py
index 802d043708..f2e71e8595 100644
--- a/test/datasets/test_iwslt2016.py
+++ b/test/datasets/test_iwslt2016.py
@@ -12,7 +12,7 @@
 from torchtext.datasets.iwslt2016 import DATASET_NAME, IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
 
-from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.case_utils import zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
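The comment added in PATCH 16 is worth unpacking: tarfile's add() recurses into directories by default, so if the extracted {src}-{tgt} directory were still on disk next to {src}-{tgt}.tgz, the outer archive would capture both the inner tarball and the raw tree. A small demonstration of that recursion; all paths here are illustrative:

    import os
    import tarfile
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        os.makedirs(os.path.join(root, "texts/de-en"))
        open(os.path.join(root, "texts/de-en/file.de"), "w").close()
        open(os.path.join(root, "texts/de-en.tgz"), "w").close()  # pretend inner tarball

        archive = os.path.join(root, "outer.tgz")
        with tarfile.open(archive, "w:gz") as tar:
            # add() walks the whole tree: without an rmtree of texts/de-en first,
            # both the raw directory contents and de-en.tgz land in the archive.
            tar.add(os.path.join(root, "texts"), arcname="texts")

        with tarfile.open(archive) as tar:
            print(tar.getnames())
            # includes both 'texts/de-en/file.de' and 'texts/de-en.tgz'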
From 7e93369a542bf341cdb7f606de28b129495ee8f2 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 18:31:35 -0500
Subject: [PATCH 18/19] fix flake

---
 test/datasets/test_iwslt2017.py | 172 ++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 test/datasets/test_iwslt2017.py

diff --git a/test/datasets/test_iwslt2017.py b/test/datasets/test_iwslt2017.py
new file mode 100644
index 0000000000..e5595821b0
--- /dev/null
+++ b/test/datasets/test_iwslt2017.py
@@ -0,0 +1,172 @@
+import os
+import random
+import shutil
+import string
+import tarfile
+import tempfile
+from collections import defaultdict
+from unittest.mock import patch
+
+from parameterized import parameterized
+from torchtext.datasets.iwslt2017 import DATASET_NAME, IWSLT2017, SUPPORTED_DATASETS, _PATH
+from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
+
+from ..common.case_utils import zip_equal
+from ..common.torchtext_test_case import TorchtextTestCase
+
+SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
+
+
+def _generate_uncleaned_train():
+    """Generate tags files"""
+    file_contents = []
+    examples = []
+    xml_tags = [
+        '<url', '<keywords', '<talkid', '<description',
+        '<reviewer', '<translator', '<title', '<speaker'
+    ]
+    for i in range(100):
+        rand_string = " ".join(
+            random.choice(string.ascii_letters) for i in range(10)
+        )
+        # Write one of the XML tags randomly to make sure we clean appropriately
+        if random.random() < 0.1:
+            open_tag = random.choice(xml_tags) + ">"
+            close_tag = "</" + open_tag[1:] + ">"
+            file_contents.append(open_tag + rand_string + close_tag)
+        else:
+            examples.append(rand_string + "\n")
+            file_contents.append(rand_string)
+    return examples, "\n".join(file_contents)
+
+
+def _generate_uncleaned_valid():
+    file_contents = ["<root>"]
+    examples = []
+
+    for doc_id in range(5):
+        file_contents.append(f'<doc docid="{doc_id}">')
+        for seg_id in range(100):
+            rand_string = " ".join(
+                random.choice(string.ascii_letters) for i in range(10)
+            )
+            examples.append(rand_string)
+            file_contents.append(f"<seg>{rand_string} </seg>" + "\n")
+        file_contents.append("</doc>")
+    file_contents.append("</root>")
+    return examples, " ".join(file_contents)
+
+
+def _generate_uncleaned_test():
+    return _generate_uncleaned_valid()
+
+
+def _generate_uncleaned_contents(split):
+    return {
+        "train": _generate_uncleaned_train(),
+        "valid": _generate_uncleaned_valid(),
+        "test": _generate_uncleaned_test(),
+    }[split]
+
+
+def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
+    """
+    root_dir: directory to the mocked dataset
+    """
+
+    base_dir = os.path.join(root_dir, DATASET_NAME)
+    temp_dataset_dir = os.path.join(base_dir, 'temp_dataset_dir')
+    outer_temp_dataset_dir = os.path.join(temp_dataset_dir, "texts/DeEnItNlRo/DeEnItNlRo")
+    inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo")
+
+    os.makedirs(outer_temp_dataset_dir, exist_ok=True)
+    os.makedirs(inner_temp_dataset_dir, exist_ok=True)
+
+    mocked_data = defaultdict(lambda: defaultdict(list))
+
+    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(17, src, tgt, valid_set, test_set)
+    uncleaned_src_file = uncleaned_file_names[src][split]
+    uncleaned_tgt_file = uncleaned_file_names[tgt][split]
+
+    cleaned_src_file = cleaned_file_names[src][split]
+    cleaned_tgt_file = cleaned_file_names[tgt][split]
+
+    for (unclean_file_name, clean_file_name) in [
+        (uncleaned_src_file, cleaned_src_file),
+        (uncleaned_tgt_file, cleaned_tgt_file)
+    ]:
+        # Get file extension (i.e., the language) without the . prefix (.en -> en)
+        lang = os.path.splitext(unclean_file_name)[1][1:]
+
+        out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
+        with open(out_file, "w") as f:
+            mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
+            mocked_data[split][lang] = mocked_data_for_split
+            f.write(file_contents)
+
+    inner_compressed_dataset_path = os.path.join(
+        outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo.tgz"
+    )
+
+    # create tar file from dataset folder
+    with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
+        tar.add(inner_temp_dataset_dir, arcname="DeEnItNlRo-DeEnItNlRo")
+
+    # this is necessary so that the outer tarball only includes the inner tarball
+    shutil.rmtree(inner_temp_dataset_dir)
+
+    outer_temp_dataset_path = os.path.join(base_dir, _PATH)
+
+    with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
+        tar.add(temp_dataset_dir, arcname=os.path.splitext(_PATH)[0])
+
+    return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
+
+
+class TestIWSLT2017(TorchtextTestCase):
+    root_dir = None
+    patcher = None
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.patcher = patch(
+            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
+        )
+        cls.patcher.start()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.patcher.stop()
+        super().tearDownClass()
+
+    @parameterized.expand([
+        (split, src, tgt)
+        for split in ("train", "valid", "test")
+        for src, tgt in SUPPORTED_LANGPAIRS
+    ])
+    def test_iwslt2017(self, split, src, tgt):
+
+        with tempfile.TemporaryDirectory() as root_dir:
+            expected_samples = _get_mock_dataset(root_dir, split, src, tgt, "dev2010", "tst2010")
+
+            dataset = IWSLT2017(root=root_dir, split=split, language_pair=(src, tgt))
+
+            samples = list(dataset)
+
+            for sample, expected_sample in zip_equal(samples, expected_samples):
+                self.assertEqual(sample, expected_sample)
+
+    @parameterized.expand(["train", "valid", "test"])
+    def test_iwslt2017_split_argument(self, split):
+        with tempfile.TemporaryDirectory() as root_dir:
+            language_pair = ("de", "en")
+            valid_set = "dev2010"
+            test_set = "tst2010"
+            _ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
+            dataset1 = IWSLT2017(root=root_dir, split=split, language_pair=language_pair)
+            (dataset2,) = IWSLT2017(root=root_dir, split=(split,), language_pair=language_pair)
+
+            for d1, d2 in zip_equal(dataset1, dataset2):
+                self.assertEqual(d1, d2)
From f66ae99cf4998ef2108a35c0415b910a84fa6efa Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 9 Feb 2022 18:38:39 -0500
Subject: [PATCH 19/19] Revert "fix flake"

This reverts commit 7e93369a542bf341cdb7f606de28b129495ee8f2.
---
 test/datasets/test_iwslt2017.py | 172 --------------------------------
 1 file changed, 172 deletions(-)
 delete mode 100644 test/datasets/test_iwslt2017.py

diff --git a/test/datasets/test_iwslt2017.py b/test/datasets/test_iwslt2017.py
deleted file mode 100644
index e5595821b0..0000000000
--- a/test/datasets/test_iwslt2017.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import os
-import random
-import shutil
-import string
-import tarfile
-import tempfile
-from collections import defaultdict
-from unittest.mock import patch
-
-from parameterized import parameterized
-from torchtext.datasets.iwslt2017 import DATASET_NAME, IWSLT2017, SUPPORTED_DATASETS, _PATH
-from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
-
-from ..common.case_utils import zip_equal
-from ..common.torchtext_test_case import TorchtextTestCase
-
-SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
-
-
-def _generate_uncleaned_train():
-    """Generate tags files"""
-    file_contents = []
-    examples = []
-    xml_tags = [
-        '<url', '<keywords', '<talkid', '<description',
-        '<reviewer', '<translator', '<title', '<speaker'
-    ]
-    for i in range(100):
-        rand_string = " ".join(
-            random.choice(string.ascii_letters) for i in range(10)
-        )
-        # Write one of the XML tags randomly to make sure we clean appropriately
-        if random.random() < 0.1:
-            open_tag = random.choice(xml_tags) + ">"
-            close_tag = "</" + open_tag[1:] + ">"
-            file_contents.append(open_tag + rand_string + close_tag)
-        else:
-            examples.append(rand_string + "\n")
-            file_contents.append(rand_string)
-    return examples, "\n".join(file_contents)
-
-
-def _generate_uncleaned_valid():
-    file_contents = ["<root>"]
-    examples = []
-
-    for doc_id in range(5):
-        file_contents.append(f'<doc docid="{doc_id}">')
-        for seg_id in range(100):
-            rand_string = " ".join(
-                random.choice(string.ascii_letters) for i in range(10)
-            )
-            examples.append(rand_string)
-            file_contents.append(f"<seg>{rand_string} </seg>" + "\n")
-        file_contents.append("</doc>")
-    file_contents.append("</root>")
-    return examples, " ".join(file_contents)
-
-
-def _generate_uncleaned_test():
-    return _generate_uncleaned_valid()
-
-
-def _generate_uncleaned_contents(split):
-    return {
-        "train": _generate_uncleaned_train(),
-        "valid": _generate_uncleaned_valid(),
-        "test": _generate_uncleaned_test(),
-    }[split]
-
-
-def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
-    """
-    root_dir: directory to the mocked dataset
-    """
-
-    base_dir = os.path.join(root_dir, DATASET_NAME)
-    temp_dataset_dir = os.path.join(base_dir, 'temp_dataset_dir')
-    outer_temp_dataset_dir = os.path.join(temp_dataset_dir, "texts/DeEnItNlRo/DeEnItNlRo")
-    inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo")
-
-    os.makedirs(outer_temp_dataset_dir, exist_ok=True)
-    os.makedirs(inner_temp_dataset_dir, exist_ok=True)
-
-    mocked_data = defaultdict(lambda: defaultdict(list))
-
-    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(17, src, tgt, valid_set, test_set)
-    uncleaned_src_file = uncleaned_file_names[src][split]
-    uncleaned_tgt_file = uncleaned_file_names[tgt][split]
-
-    cleaned_src_file = cleaned_file_names[src][split]
-    cleaned_tgt_file = cleaned_file_names[tgt][split]
-
-    for (unclean_file_name, clean_file_name) in [
-        (uncleaned_src_file, cleaned_src_file),
-        (uncleaned_tgt_file, cleaned_tgt_file)
-    ]:
-        # Get file extension (i.e., the language) without the . prefix (.en -> en)
-        lang = os.path.splitext(unclean_file_name)[1][1:]
-
-        out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
-        with open(out_file, "w") as f:
-            mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
-            mocked_data[split][lang] = mocked_data_for_split
-            f.write(file_contents)
-
-    inner_compressed_dataset_path = os.path.join(
-        outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo.tgz"
-    )
-
-    # create tar file from dataset folder
-    with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
-        tar.add(inner_temp_dataset_dir, arcname="DeEnItNlRo-DeEnItNlRo")
-
-    # this is necessary so that the outer tarball only includes the inner tarball
-    shutil.rmtree(inner_temp_dataset_dir)
-
-    outer_temp_dataset_path = os.path.join(base_dir, _PATH)
-
-    with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
-        tar.add(temp_dataset_dir, arcname=os.path.splitext(_PATH)[0])
-
-    return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
-
-
-class TestIWSLT2017(TorchtextTestCase):
-    root_dir = None
-    patcher = None
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.patcher = patch(
-            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
-        )
-        cls.patcher.start()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.patcher.stop()
-        super().tearDownClass()
-
-    @parameterized.expand([
-        (split, src, tgt)
-        for split in ("train", "valid", "test")
-        for src, tgt in SUPPORTED_LANGPAIRS
-    ])
-    def test_iwslt2017(self, split, src, tgt):
-
-        with tempfile.TemporaryDirectory() as root_dir:
-            expected_samples = _get_mock_dataset(root_dir, split, src, tgt, "dev2010", "tst2010")
-
-            dataset = IWSLT2017(root=root_dir, split=split, language_pair=(src, tgt))
-
-            samples = list(dataset)
-
-            for sample, expected_sample in zip_equal(samples, expected_samples):
-                self.assertEqual(sample, expected_sample)
-
-    @parameterized.expand(["train", "valid", "test"])
-    def test_iwslt2017_split_argument(self, split):
-        with tempfile.TemporaryDirectory() as root_dir:
-            language_pair = ("de", "en")
-            valid_set = "dev2010"
-            test_set = "tst2010"
-            _ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
-            dataset1 = IWSLT2017(root=root_dir, split=split, language_pair=language_pair)
-            (dataset2,) = IWSLT2017(root=root_dir, split=(split,), language_pair=language_pair)
-
-            for d1, d2 in zip_equal(dataset1, dataset2):
-                self.assertEqual(d1, d2)
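One piece of scaffolding every revision in this series shares deserves a closing note: the unittest.mock.patch of torchdata's _hash_check. The datasets run their downloads through an on-disk cache that validates archives by hash, so pointing the tests at locally forged tarballs only works if that check is forced to pass. A reduced sketch of the pattern; the patch target string is the one used in the patches above, while the class name is illustrative:

    from unittest.mock import patch

    class MockedDatasetTest:
        patcher = None

        @classmethod
        def setUpClass(cls):
            # Stub out torchdata's hash validation so the hand-built archives
            # are accepted as if they were verified downloads.
            cls.patcher = patch(
                "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
            )
            cls.patcher.start()

        @classmethod
        def tearDownClass(cls):
            # Always stop a class-level patcher, or the stub leaks into other tests.
            cls.patcher.stop()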