155 changes: 123 additions & 32 deletions test/datasets/test_iwslt2016.py
@@ -1,46 +1,128 @@
import os
import random
import string
import tarfile
import itertools
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.iwslt2016 import IWSLT2016
from torchtext.datasets.iwslt2016 import IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split

from ..common.case_utils import TempDirMixin, zip_equal
from ..common.torchtext_test_case import TorchtextTestCase


def _get_mock_dataset(root_dir, split, src, tgt):
SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
SUPPORTED_DEVTEST_SPLITS = SUPPORTED_DATASETS["valid_test"]
DEV_TEST_SPLITS = [(dev, test) for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2) if dev != test]


def _generate_uncleaned_train():
"""Generate tags files"""
file_contents = []
examples = []
xml_tags = [
'<url', '<keywords', '<talkid', '<description', '<reviewer',
'<translator', '<title', '<speaker', '<doc', '</doc'
]
for i in range(100):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(10)
)
# With a 10% chance, add one of the XML tags which is cleaned
# to ensure cleaning happens appropriately
if random.random() < 0.1:
open_tag = random.choice(xml_tags) + ">"
close_tag = "</" + open_tag[1:] + ">"
file_contents.append(open_tag + rand_string + close_tag)
else:
examples.append(rand_string + "\n")
file_contents.append(rand_string)
return examples, "\n".join(file_contents)


def _generate_uncleaned_valid():
file_contents = ["<root>"]
examples = []

for doc_id in range(5):
file_contents.append(f'<doc docid="{doc_id}" genre="lectures">')
for seg_id in range(100):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(10)
)
examples.append(rand_string + "\n")
file_contents.append(f"<seg>{rand_string} </seg>" + "\n")
file_contents.append("</doc>")
file_contents.append("</root>")
return examples, " ".join(file_contents)


def _generate_uncleaned_test():
return _generate_uncleaned_valid()


def _generate_uncleaned_contents(split):
return {
"train": _generate_uncleaned_train(),
"valid": _generate_uncleaned_valid(),
"test": _generate_uncleaned_test(),
}[split]


def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
"""
root_dir: directory to the mocked dataset
"""
temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/{src}-{tgt}/")
os.makedirs(temp_dataset_dir, exist_ok=True)
outer_temp_dataset_dir = os.path.join(root_dir, f"IWSLT2016/2016-01/texts/{src}/{tgt}/")
Contributor:
If you change this path to "...IWSLT2016/temp_dataset_dir/2016-01/..." you will notice a lot of test failures. The point of naming it this way is to ensure that the folder where we create the mocked data is different from where the files/folders would be placed when the archive is extracted (i.e., when calling the dataset). Without this, the archive will never be extracted, since our dataset implementation checks whether a file/folder exists and, if it does, skips the extraction.

When applying this PR locally and making the above change, I'm seeing 212 failed, 451 passed. Here is a sample error from pytest:

FAILED test_iwslt2016.py::TestIWSLT2016::test_iwslt2016_219_train - AssertionError: 'H T G W w U w K b d\n' != 'b H K H c L t k I s\n'

Contributor (author):
Oh, I understand the idea now. 👍 Let me give this a shot.
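A minimal sketch of the existence-check behavior this thread describes, with hypothetical names (this is not torchtext's actual implementation): extraction is skipped whenever the target path already exists, which is why the mocked files must not be written to the path the extraction step would produce.

import os
import tarfile

def _extract_if_needed(archive_path, extracted_path):
    # Hypothetical mirror of the caching check described above: if the
    # expected output path already exists, skip extraction entirely.
    if os.path.exists(extracted_path):
        return extracted_path
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(os.path.dirname(extracted_path))
    return extracted_path

If the mock data were created directly at extracted_path, the tarball built below would never be unpacked and the test would compare against whatever happened to be on disk.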

inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")

os.makedirs(outer_temp_dataset_dir, exist_ok=True)
os.makedirs(inner_temp_dataset_dir, exist_ok=True)

seed = 1
mocked_data = defaultdict(lambda: defaultdict(list))
valid_set = "tst2013"
test_set = "tst2014"

files_for_split, _ = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
src_file = files_for_split[src][split]
tgt_file = files_for_split[tgt][split]
for file_name in (src_file, tgt_file):
txt_file = os.path.join(temp_dataset_dir, file_name)
with open(txt_file, "w") as f:
# Get file extension (i.e., the language) without the . prefix (.en -> en)
lang = os.path.splitext(file_name)[1][1:]
for i in range(5):
rand_string = " ".join(
random.choice(string.ascii_letters) for i in range(seed)
)
dataset_line = f"{rand_string} {rand_string}\n"
# append line to correct dataset split
mocked_data[split][lang].append(dataset_line)
f.write(f'{rand_string} {rand_string}\n')
seed += 1

cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
uncleaned_src_file = uncleaned_file_names[src][split]
uncleaned_tgt_file = uncleaned_file_names[tgt][split]

cleaned_src_file = cleaned_file_names[src][split]
cleaned_tgt_file = cleaned_file_names[tgt][split]

for (unclean_file_name, clean_file_name) in [
(uncleaned_src_file, cleaned_src_file),
(uncleaned_tgt_file, cleaned_tgt_file)
]:
# Get file extension (i.e., the language) without the . prefix (.en -> en)
lang = os.path.splitext(unclean_file_name)[1][1:]
expected_clean_filename = os.path.join(inner_temp_dataset_dir, clean_file_name)

# If we've already written a clean file, read it, so we don't generate
# new random strings. Otherwise generate new files and clean when read.
if os.path.exists(expected_clean_filename):
with open(expected_clean_filename, encoding="utf-8") as f:
mocked_data[split][lang] = f.readlines()
else:
out_file = os.path.join(inner_temp_dataset_dir, unclean_file_name)
with open(out_file, "w") as f:
mocked_data_for_split, file_contents = _generate_uncleaned_contents(split)
mocked_data[split][lang] = mocked_data_for_split
f.write(file_contents)

inner_compressed_dataset_path = os.path.join(
outer_temp_dataset_dir, f"{src}-{tgt}.tgz"
)

# create tar file from dataset folder
with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")

outer_temp_dataset_path = os.path.join(
root_dir, "IWSLT2016", "2016-01.tgz"
)
with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
tar.add(outer_temp_dataset_dir, arcname="2016-01")
Contributor:
Before adding outer_temp_dataset_dir to the archive, we probably want to remove inner_temp_dataset_dir since the contents of this folder are already available in the inner_compressed_dataset_path, right?

Contributor (author):
We could approach it that way -- my approach is to just re-read the cleaned files to generate the expected values. I don't know if there's a strong reason to prefer one vs. the other.

Contributor:
Yeah, we probably want to match the extraction process as if we were working with the original dataset archive. The code base extracts and caches files from the inner tarball, say de-en.tgz, and if the files are already present from the top-most archive 2016-01.tgz then the inner extraction is never executed (due to the secondary caching logic introduced in issue #1589). This is certainly a complex test/dataset, let me know if I am missing anything :)


Contributor:
Another alternative to what I mentioned above is to add the following lines here to delete the outer temp dataset dir, ensuring the archive will always be extracted:

import shutil
shutil.rmtree(outer_temp_dataset_dir)

Doing this gives me the following results: 110 failed, 553 passed. This is a sample error I'm getting from pytest:

FAILED test_iwslt2016.py::TestIWSLT2016::test_iwslt2016_008_train - ValueError: Iterables have different lengths
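For reference, a minimal sketch of the first suggestion in this thread, reusing the variable names from the diff above: pack the inner tarball, remove the extracted inner directory, then pack the outer tarball. This is one possible ordering under the assumptions discussed here, not necessarily the final approach.

import shutil
import tarfile

# Create the inner tarball from the inner dataset folder.
with tarfile.open(inner_compressed_dataset_path, "w:gz") as tar:
    tar.add(inner_temp_dataset_dir, arcname=f"{src}-{tgt}")

# Drop the extracted inner directory so only the nested archive remains in
# the outer tarball; the caching check then cannot find plain files and the
# inner extraction always runs when the dataset is loaded.
shutil.rmtree(inner_temp_dataset_dir)

with tarfile.open(outer_temp_dataset_path, "w:gz") as tar:
    tar.add(outer_temp_dataset_dir, arcname="2016-01")

The trade-off, as noted above, is that on later calls the expected values can no longer be recovered by re-reading the cleaned files from that directory.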

return list(zip(mocked_data[split][src], mocked_data[split][tgt]))

@@ -54,7 +136,7 @@ def setUpClass(cls):
super().setUpClass()
cls.root_dir = cls.get_base_temp_dir()
cls.patcher = patch(
"torchdata.datapipes.iter.util.cacheholder.OnDiskCacheHolderIterDataPipe._cache_check_fn", return_value=True
"torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
)
cls.patcher.start()

@@ -63,18 +145,27 @@ def tearDownClass(cls):
cls.patcher.stop()
super().tearDownClass()

@parameterized.expand([("train", "de", "en"), ("valid", "de", "en")])
def test_iwslt2016(self, split, src, tgt):
expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt)
@parameterized.expand([
Contributor (author):
TODO: can we (should we?) parameterize by each supported lang pair?

Contributor:
For complete code coverage I think it would make sense to parameterize by each supported lang pair. I believe we could use the nested_params fn to generate the cartesian product of language pairs. The main issue I foresee with using that function is that it would generate lang pairs that map a language to itself (i.e., en -> en), which I'm not sure makes sense for this dataset.

Contributor:
@parmeet also wanted to get your thoughts on this

Contributor:
Yes, I would agree. Simulating the exact structure of the original dataset with all the possible use cases would be key for coverage. (A sketch of the pair-filtering idea appears after the test method below.)

(split, src, tgt, dev_set, test_set)
for split in ("train", "valid", "test")
for dev_set, test_set in DEV_TEST_SPLITS
for src, tgt in SUPPORTED_LANGPAIRS
if (dev_set not in SET_NOT_EXISTS[(src, tgt)] and test_set not in SET_NOT_EXISTS[(src, tgt)])
])
def test_iwslt2016(self, split, src, tgt, dev_set, test_set):

expected_samples = _get_mock_dataset(self.root_dir, split, src, tgt, dev_set, test_set)

dataset = IWSLT2016(root=self.root_dir, split=split)
dataset = IWSLT2016(
root=self.root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
)

samples = list(dataset)

for sample, expected_sample in zip_equal(samples, expected_samples):
self.assertEqual(sample, expected_sample)
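Picking up the nested_params discussion above: a minimal sketch of filtering identical pairs out of a cartesian product. The language list here is purely illustrative; the test derives its real pairs from SUPPORTED_DATASETS via SUPPORTED_LANGPAIRS.

import itertools

# Illustrative language list; the real set comes from SUPPORTED_DATASETS.
langs = ["de", "en", "fr"]

# Cartesian product of languages, dropping pairs that map a language to
# itself (e.g. en -> en), which the dataset does not support.
lang_pairs = [(src, tgt) for src, tgt in itertools.product(langs, repeat=2) if src != tgt]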

@parameterized.expand(["train", "valid"])
@parameterized.expand(["train", "valid", "test"])
def test_iwslt2016_split_argument(self, split):
dataset1 = IWSLT2016(root=self.root_dir, split=split)
(dataset2,) = IWSLT2016(root=self.root_dir, split=(split,))