mock up IWSLT2016 test for faster testing. #1563
Changes from all commits
New file, 83 added lines (`@@ -0,0 +1,83 @@`):

```python
import os
import random
import string
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.iwslt2016 import IWSLT2016
from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.torchtext_test_case import TorchtextTestCase


def _get_mock_dataset(root_dir, split, src, tgt):
    """
    root_dir: directory where the mocked dataset files are written
    """
    temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/{src}-{tgt}/")
    os.makedirs(temp_dataset_dir, exist_ok=True)

    # `seed` doubles as the number of letters per mocked line, so every
    # generated line is longer than the previous one.
    seed = 1
    mocked_data = defaultdict(lambda: defaultdict(list))
    valid_set = "tst2013"
    test_set = "tst2014"

    files_for_split, _ = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
    src_file = files_for_split[src][split]
    tgt_file = files_for_split[tgt][split]
    for file_name in (src_file, tgt_file):
        txt_file = os.path.join(temp_dataset_dir, file_name)
        with open(txt_file, "w") as f:
            # Get file extension (i.e., the language) without the . prefix (.en -> en)
            lang = os.path.splitext(file_name)[1][1:]
            for _ in range(5):
                rand_string = " ".join(
                    random.choice(string.ascii_letters) for _ in range(seed)
                )
                dataset_line = f"{rand_string} {rand_string}\n"
                # Append the line to the correct dataset split and write it to disk.
                mocked_data[split][lang].append(dataset_line)
                f.write(dataset_line)
                seed += 1

    # Pair source and target lines positionally to form the expected samples.
    return list(zip(mocked_data[split][src], mocked_data[split][tgt]))


class TestIWSLT2016(TempDirMixin, TorchtextTestCase):
    root_dir = None
    patcher = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        # Patch the on-disk cache check to always report a hit, so the
        # dataset pipeline reads the mocked files instead of downloading.
        cls.patcher = patch(
            "torchdata.datapipes.iter.util.cacheholder.OnDiskCacheHolderIterDataPipe._cache_check_fn",
            return_value=True,
        )
        cls.patcher.start()

    @classmethod
    def tearDownClass(cls):
        cls.patcher.stop()
        super().tearDownClass()

    @parameterized.expand([("train", "de", "en"), ("valid", "de", "en")])
```
|
**Contributor:** IWSLT2016 also consists of a test split.

**Author:** Oops, yes. I had made a change and forgot to re-incorporate the test split. I can cut a PR to fix it in the morning.
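The promised fix is not part of this diff, but re-adding the test split would presumably just extend the expansion; a minimal sketch (the extra tuple is my assumption, not the actual follow-up PR):

```python
# Hypothetical follow-up (not in this diff): cover the test split too.
@parameterized.expand([
    ("train", "de", "en"),
    ("valid", "de", "en"),
    ("test", "de", "en"),
])
def test_iwslt2016(self, split, src, tgt):
    ...
```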
```python
    def test_iwslt2016(self, split, src, tgt):
        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
```
**Contributor:** Is there any specific reason why we don't want to generate all the mocked data within the `setUpClass` method?
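For illustration, the reviewer's suggestion could look roughly like the sketch below, assuming the `(split, src, tgt)` combinations are fixed up front; this is not the PR's implementation:

```python
# Hypothetical alternative (not the PR's implementation): build every
# expected sample set once, assuming the combinations are known up front.
@classmethod
def setUpClass(cls):
    super().setUpClass()
    cls.root_dir = cls.get_base_temp_dir()
    cls.expected = {
        (split, src, tgt): _get_mock_dataset(cls.root_dir, split, src, tgt)
        for (split, src, tgt) in [("train", "de", "en"), ("valid", "de", "en")]
    }
    # The cache patching would stay the same as in the PR's setUpClass.
```

As the thread further down points out, this is less straightforward than it looks: `_get_mock_dataset` would need to know every language pair, and the order in which the mocked files are generated matters.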
```python
        dataset = IWSLT2016(root=self.root_dir, split=split)

        samples = list(dataset)

        for sample, expected_sample in zip_equal(samples, expected_samples):
```
Comment on lines +68 to +74:

**Contributor:** NIT: can we organize …

**Author:** Unfortunately it's not quite as straightforward unless we want to hardcode the language pairs in …

**Author:** One thing we could do is make …

**Contributor:** Gotcha, I think it's okay to keep your current implementation. I didn't catch the fact that the ordering mattered here.
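For context on why ordering matters here (my illustration, not from the thread): `_get_mock_dataset` builds the expected samples positionally with `zip`, so the i-th source line must correspond to the i-th target line, and the generation order cannot be shuffled freely:

```python
# Illustrative only: expected samples pair src line i with tgt line i.
src_lines = ["a a\n", "bc bc\n"]
tgt_lines = ["d d\n", "ef ef\n"]
assert list(zip(src_lines, tgt_lines)) == [("a a\n", "d d\n"), ("bc bc\n", "ef ef\n")]
```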
```python
            self.assertEqual(sample, expected_sample)

    @parameterized.expand(["train", "valid"])
    def test_iwslt2016_split_argument(self, split):
        dataset1 = IWSLT2016(root=self.root_dir, split=split)
        (dataset2,) = IWSLT2016(root=self.root_dir, split=(split,))

        for d1, d2 in zip_equal(dataset1, dataset2):
            self.assertEqual(d1, d2)
```
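`zip_equal` is imported from the test suite's `case_utils`; its implementation isn't shown in this diff, but judging by its use it behaves like a strict `zip`. A sketch of the assumed semantics:

```python
# Assumed behavior of the zip_equal helper (sketch, not the actual
# torchtext implementation): zip strictly, erroring on length mismatch.
from itertools import zip_longest

def zip_equal(*iterables):
    sentinel = object()
    for group in zip_longest(*iterables, fillvalue=sentinel):
        if sentinel in group:
            raise ValueError("iterables have different lengths")
        yield group
```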
**Contributor:** Is there a reason we are not creating a download archive `2016-01.tgz` like we are doing for other datasets?

Edit: I think it's quite important to start from the download archive; otherwise we can get into hard-to-find bugs, especially when the compression pattern is as complex as it is here.

**Contributor:** @erip just wanted to check if you do plan to follow up on this as well?

**Author:** Yes, I can follow up on this. It will take a lot more thought since, as you mention, the cleanup is quite involved. That said, I think it should be doable.

**Contributor:** Sure, thanks @erip!
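For reference, a minimal sketch of such a follow-up, assuming the directory layout written by `_get_mock_dataset` above; the nested inner archives of the real IWSLT release (the "complex compression pattern" mentioned) are not reproduced here:

```python
import os
import tarfile

def _create_mock_archive(root_dir):
    # Hypothetical helper: pack the mocked texts into 2016-01.tgz so the
    # test exercises the real extraction path instead of a patched cache.
    inner_dir = os.path.join(root_dir, "IWSLT2016", "2016-01")
    archive_path = os.path.join(root_dir, "IWSLT2016", "2016-01.tgz")
    with tarfile.open(archive_path, "w:gz") as tar:
        # Store the mocked tree under the top-level "2016-01/" prefix.
        tar.add(inner_dir, arcname="2016-01")
    return archive_path
```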