Fix IWSLT2016 testing #1585
```diff
@@ -1,46 +1,128 @@
 import os
 import random
 import string
 import tarfile
+import itertools
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2016 import IWSLT2016
+from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
 
 from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
-def _get_mock_dataset(root_dir, split, src, tgt):
+SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
+SUPPORTED_DEVTEST_SPLITS = SUPPORTED_DATASETS["valid_test"]
+DEV_TEST_SPLITS = [(dev, test) for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2) if dev != test]
+
+
+def _generate_uncleaned_train():
+    """Generate tags files"""
+    file_contents = []
+    examples = []
+    xml_tags = [
+        '<url', '<keywords', '<talkid', '<description', '<reviewer',
+        '<translator', '<title', '<speaker', '<doc', '</doc'
+    ]
+    for i in range(100):
+        rand_string = " ".join(
+            random.choice(string.ascii_letters) for i in range(10)
+        )
+        # With a 10% chance, add one of the XML tags which is cleaned
+        # to ensure cleaning happens appropriately
+        if random.random() < 0.1:
+            open_tag = random.choice(xml_tags) + ">"
+            close_tag = "</" + open_tag[1:] + ">"
+            file_contents.append(open_tag + rand_string + close_tag)
+        else:
+            examples.append(rand_string + "\n")
+            file_contents.append(rand_string)
+    return examples, "\n".join(file_contents)
+
+
+def _generate_uncleaned_valid():
+    file_contents = ["<root>"]
+    examples = []
+
+    for doc_id in range(5):
+        file_contents.append(f'<doc docid="{doc_id}" genre="lectures">')
+        for seg_id in range(100):
+            rand_string = " ".join(
+                random.choice(string.ascii_letters) for i in range(10)
+            )
+            examples.append(rand_string)
+            file_contents.append(f"<seg>{rand_string} </seg>" + "\n")
+        file_contents.append("</doc>")
+    file_contents.append("</root>")
+    return examples, " ".join(file_contents)
+
+
+def _generate_uncleaned_test():
+    return _generate_uncleaned_valid()
+
+
+def _generate_uncleaned_contents(split):
+    return {
+        "train": _generate_uncleaned_train(),
+        "valid": _generate_uncleaned_valid(),
+        "test": _generate_uncleaned_test(),
+    }[split]
+
+
+def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
     """
     root_dir: directory to the mocked dataset
     """
-    temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/{src}-{tgt}/")
-    os.makedirs(temp_dataset_dir, exist_ok=True)
+    outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
```
**Contributor:** If you change this path to be "...IWSLT2016/temp_dataset_dir/2016-01/..." you will notice that there are a lot of test failures. The purpose of naming it this is to ensure that the folder where we create the mocked data is different from where the files/folders would be placed when the archive is extracted (i.e. when calling the dataset). Without doing this, the archive will never be extracted, since our dataset implementation checks whether a file/folder already exists and, if it does, never extracts the archive. When patching this PR locally and making the above change, I noticed we're getting

**Contributor (Author):** Oh, I understand the idea now. 👍 Let me give this a shot.
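For context, the cache-check behavior described in this thread amounts to something like the sketch below. This is a hypothetical illustration, not torchtext's actual implementation; `maybe_extract` and its arguments are made up for the example.

```python
import os
import tarfile


def maybe_extract(archive_path, expected_dir):
    # Hypothetical sketch of the cache check described above: if the target
    # directory already exists (e.g. because mocked files were written
    # straight into it), the archive is treated as cached and never opened.
    if os.path.exists(expected_dir):
        return expected_dir
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(os.path.dirname(expected_dir))
    return expected_dir
```

Writing the mocked files anywhere other than `expected_dir` forces the extraction branch, which is the code path this test needs to exercise.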
```diff
+    inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")
+
+    os.makedirs(outer_temp_dataset_dir, exist_ok=True)
+    os.makedirs(inner_temp_dataset_dir, exist_ok=True)
 
-    seed = 1
     mocked_data = defaultdict(lambda: defaultdict(list))
-    valid_set = "tst2013"
-    test_set = "tst2014"
 
-    files_for_split, _ = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
-    src_file = files_for_split[src][split]
-    tgt_file = files_for_split[tgt][split]
-    for file_name in (src_file, tgt_file):
-        txt_file = os.path.join(temp_dataset_dir, file_name)
-        with open(txt_file, "w") as f:
-            # Get file extension (i.e., the language) without the . prefix (.en -> en)
-            lang = os.path.splitext(file_name)[1][1:]
-            for i in range(5):
-                rand_string = " ".join(
-                    random.choice(string.ascii_letters) for i in range(seed)
-                )
-                dataset_line = f"{rand_string} {rand_string}\n"
-                # append line to correct dataset split
-                mocked_data[split][lang].append(dataset_line)
-                f.write(f'{rand_string} {rand_string}\n')
-                seed += 1
+    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
+    uncleaned_src_file = uncleaned_file_names[src][split]
+    uncleaned_tgt_file = uncleaned_file_names[tgt][split]
+
+    cleaned_src_file = cleaned_file_names[src][split]
+    cleaned_tgt_file = cleaned_file_names[tgt][split]
+
+    for (unclean_file_name, clean_file_name) in [
+        (uncleaned_src_file, cleaned_src_file),
+        (uncleaned_tgt_file, cleaned_tgt_file)
+    ]:
+        # Get file extension (i.e., the language) without the . prefix (.en -> en)
+        lang = os.path.splitext(unclean_file_name)[1][1:]
+        expected_clean_filename = os.path.join(inner_temp_dataset_dir, clean_file_name)
+
+        # If we've already written a clean file, read it, so we don't generate
+        # new random strings. Otherwise generate new files and clean when read.
+        if os.path.exists(expected_clean_filename):
+            with open(expected_clean_filename, encoding="utf-8") as f:
+                mocked_data[split][lang] = f.readlines()
+        else:
+            out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
+            with open(out_file, "w") as f:
+                mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
+                mocked_data[split][lang] = mocked_data_for_split
+                f.write(file_contents)
+
+    inner_compressed_dataset_path = os.path.join(
+        outer_temp_dataset_dir, f"{src}-{tgt}.tgz"
+    )
+
+    # create tar file from dataset folder
+    with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
+        tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")
+
+    outer_temp_dataset_path = os.path.join(
+        root_dir, "IWSLT2016", "2016-01.tgz"
+    )
+    with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
+        tar.add(outer_temp_dataset_dir, arcname="2016-01")
```
**Contributor:** Before adding

**Contributor (Author):** We could approach it that way -- my approach is to just re-read the cleaned files to generate the expected values. I don't know if there's a strong reason to prefer one vs. the other.

**Contributor:** Ya, we probably want to match the extraction process as if we were working with the original dataset archive. The code-base would be extracting and caching files from the inner tarball, say
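A minimal sketch of the reviewer's suggestion, assuming the inner `{src}-{tgt}.tgz` layout built above (the helper name is hypothetical): read the expected lines back out of the archive itself, so the expected values come from the same bytes the extraction step would later materialize on disk.

```python
import tarfile


def _expected_lines_from_tarball(tar_path, member_name):
    # Hypothetical helper: pull a single member out of the inner tarball
    # and return its lines as extraction would write them to disk.
    with tarfile.open(tar_path, "r:gz") as tar:
        fileobj = tar.extractfile(member_name)
        assert fileobj is not None, member_name
        return fileobj.read().decode("utf-8").splitlines(keepends=True)
```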
**Contributor:** Another alternative to what I mentioned above is to add the following lines here to delete the outer temp dataset dir, and in doing so ensure the archive will always be extracted. Doing this gives me the following results:
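The lines and results themselves are elided above; a plausible reconstruction of this alternative (hypothetical, not the reviewer's actual snippet) would simply delete the staging tree once the archives are written:

```python
import shutil


def _remove_staging_dir(outer_temp_dataset_dir):
    # Hypothetical reconstruction of the elided suggestion: after the
    # archives are written, delete the staging tree so that no pre-existing
    # files can satisfy the cache check, forcing extraction to run.
    shutil.rmtree(outer_temp_dataset_dir, ignore_errors=True)
```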
```diff
 
     return list(zip(mocked_data[split][src], mocked_data[split][tgt]))
@@ -54,7 +136,7 @@ def setUpClass(cls):
         super().setUpClass()
         cls.root_dir = cls.get_base_temp_dir()
         cls.patcher = patch(
-            "torchdata.datapipes.iter.util.cacheholder.OnDiskCacheHolderIterDataPipe._cache_check_fn", return_value=True
+            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
         )
         cls.patcher.start()
@@ -63,18 +145,27 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()
 
-    @parameterized.expand([("train", "de", "en"), ("valid", "de", "en")])
-    def test_iwslt2016(self, split, src, tgt):
-        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
+    @parameterized.expand([
```
**Contributor (Author):** TODO: can we (should we?) parameterize by each supported lang pair?

**Contributor:** For complete code coverage I think it would make sense to parameterize by each supported lang pair. I believe we could use the

**Contributor:** @parmeet also wanted to get your thoughts on this

**Contributor:** Yes, I would agree. Simulating the exact structure of the original dataset with all the possible use-cases would be the key for coverage.
```diff
+        (split, src, tgt, dev_set, test_set)
+        for split in ("train", "valid", "test")
+        for dev_set, test_set in DEV_TEST_SPLITS
+        for src, tgt in SUPPORTED_LANGPAIRS
+        if (dev_set not in SET_NOT_EXISTS[(src, tgt)] and test_set not in SET_NOT_EXISTS[(src, tgt)])
+    ])
+    def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
+
+        expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt, dev_set, test_set)
 
-        dataset = IWSLT2016(root=self.root_dir, split=split)
+        dataset = IWSLT2016(
+            root=self.root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
+        )
 
         samples = list(dataset)
 
         for sample, expected_sample in zip_equal(samples, expected_samples):
             self.assertEqual(sample, expected_sample)
 
-    @parameterized.expand(["train", "valid"])
+    @parameterized.expand(["train", "valid", "test"])
     def test_iwslt2016_split_argument(self, split):
         dataset1 = IWSLT2016(root=self.root_dir, split=split)
         (dataset2,) = IWSLT2016(root=self.root_dir, split=(split,))
```
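Regarding the parameterization thread above: `parameterized.expand` accepts any iterable of argument tuples, so a comprehension over the supported combinations fans out into one generated test method per tuple. A toy sketch with made-up names, not the PR's code:

```python
import unittest

from parameterized import parameterized


class ToyParameterization(unittest.TestCase):
    # Each tuple becomes its own test method, with the arguments baked into
    # the generated name, so a failure points at the exact combination.
    @parameterized.expand([
        (split, src, tgt)
        for split in ("train", "valid")
        for src, tgt in (("de", "en"), ("en", "de"))
    ])
    def test_combination(self, split, src, tgt):
        self.assertNotEqual(src, tgt)
```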