This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
7 changes: 3 additions & 4 deletions test/datasets/test_cc100.py
@@ -1,18 +1,17 @@
+import lzma
 import os
 import random
 import string
-import lzma
-from parameterized import parameterized
 from collections import defaultdict
 from unittest.mock import patch
 
+from parameterized import parameterized
 from torchtext.datasets import CC100
+from torchtext.datasets.cc100 import VALID_CODES
 
 from ..common.case_utils import TempDirMixin, zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
-from torchtext.datasets.cc100 import VALID_CODES
-
 
 def _get_mock_dataset(root_dir):
     """
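Note: the collapsed body of _get_mock_dataset above writes one small compressed file per language code, which is why lzma moves to the top of the import block. A minimal, self-contained sketch of that pattern (the helper name, file name, and contents are illustrative assumptions, not taken from this diff):

    import lzma
    import os

    def write_mock_cc100_file(root_dir, language_code, lines):
        # Hypothetical helper: write `lines` as an xz-compressed mock
        # archive, one sentence per line, mirroring CC100's file layout.
        path = os.path.join(root_dir, f"{language_code}.txt.xz")
        with lzma.open(path, "wt", encoding="utf-8") as f:
            for line in lines:
                f.write(line + "\n")
        return path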
22 changes: 16 additions & 6 deletions test/datasets/test_conll2000chunking.py
@@ -1,7 +1,7 @@
+import gzip
 import os
 import random
 import string
-import gzip
 from collections import defaultdict
 from unittest.mock import patch
 
@@ -27,11 +27,19 @@ def _get_mock_dataset(root_dir):
         mocked_lines = mocked_data[os.path.splitext(file_name)[0]]
         with open(txt_file, "w") as f:
             for i in range(5):
-                rand_strings = [random.choice(string.ascii_letters) for i in range(seed)]
-                rand_label_1 = [random.choice(string.ascii_letters) for i in range(seed)]
-                rand_label_2 = [random.choice(string.ascii_letters) for i in range(seed)]
+                rand_strings = [
+                    random.choice(string.ascii_letters) for i in range(seed)
+                ]
+                rand_label_1 = [
+                    random.choice(string.ascii_letters) for i in range(seed)
+                ]
+                rand_label_2 = [
+                    random.choice(string.ascii_letters) for i in range(seed)
+                ]
                 # one token per line (each sample ends with an extra \n)
-                for rand_string, label_1, label_2 in zip(rand_strings, rand_label_1, rand_label_2):
+                for rand_string, label_1, label_2 in zip(
+                    rand_strings, rand_label_1, rand_label_2
+                ):
                     f.write(f"{rand_string} {label_1} {label_2}\n")
                 f.write("\n")
                 dataset_line = (rand_strings, rand_label_1, rand_label_2)
@@ -41,7 +49,9 @@ def _get_mock_dataset(root_dir):

         # create gz file from dataset folder
         compressed_dataset_path = os.path.join(base_dir, f"{file_name}.gz")
-        with gzip.open(compressed_dataset_path, "wb") as gz_file, open(txt_file, "rb") as file_in:
+        with gzip.open(compressed_dataset_path, "wb") as gz_file, open(
+            txt_file, "rb"
+        ) as file_in:
             gz_file.writelines(file_in)
 
     return mocked_data
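The hunk above both writes the token/label file and gzips it. The round trip can be checked standalone: write a sample the way the test does (one "token label1 label2" triple per line, a blank line closing each sample), then parse it back. A minimal sketch, with illustrative paths and tokens:

    import gzip
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "train.txt.gz")
        with gzip.open(path, "wt") as f:
            f.write("He B-NP O\n")    # one token per line, as in the test
            f.write("runs B-VP O\n")
            f.write("\n")             # blank line ends the sample
        with gzip.open(path, "rt") as f:
            samples, sample = [], []
            for line in f:
                if line.strip():
                    sample.append(tuple(line.split()))
                elif sample:          # blank line closes the sample
                    samples.append(sample)
                    sample = []
        assert samples == [[("He", "B-NP", "O"), ("runs", "B-VP", "O")]]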
10 changes: 6 additions & 4 deletions test/datasets/test_enwik9.py
@@ -24,10 +24,12 @@ def _get_mock_dataset(root_dir):
     mocked_data = []
     with open(txt_file, "w") as f:
         for i in range(5):
-            rand_string = "<" + " ".join(
-                random.choice(string.ascii_letters) for i in range(seed)
-            ) + ">"
-            dataset_line = (f"'{rand_string}'")
+            rand_string = (
+                "<"
+                + " ".join(random.choice(string.ascii_letters) for i in range(seed))
+                + ">"
+            )
+            dataset_line = f"'{rand_string}'"
             f.write(f"'{rand_string}'\n")
 
             # append line to correct dataset split
92 changes: 68 additions & 24 deletions test/datasets/test_iwslt2016.py
@@ -1,37 +1,54 @@
+import itertools
 import os
 import random
 import shutil
 import string
 import tarfile
-import itertools
 import tempfile
 from collections import defaultdict
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2016 import DATASET_NAME, IWSLT2016, SUPPORTED_DATASETS, SET_NOT_EXISTS
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
+from torchtext.datasets.iwslt2016 import (
+    DATASET_NAME,
+    IWSLT2016,
+    SUPPORTED_DATASETS,
+    SET_NOT_EXISTS,
+)
 
 from ..common.case_utils import zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
-SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
+SUPPORTED_LANGPAIRS = [
+    (k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v
+]
 SUPPORTED_DEVTEST_SPLITS = SUPPORTED_DATASETS["valid_test"]
-DEV_TEST_SPLITS = [(dev, test) for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2) if dev != test]
+DEV_TEST_SPLITS = [
+    (dev, test)
+    for dev, test in itertools.product(SUPPORTED_DEVTEST_SPLITS, repeat=2)
+    if dev != test
+]
 
 
 def _generate_uncleaned_train():
     """Generate tags files"""
     file_contents = []
     examples = []
     xml_tags = [
-        '<url', '<keywords', '<talkid', '<description', '<reviewer',
-        '<translator', '<title', '<speaker', '<doc', '</doc'
+        "<url",
+        "<keywords",
+        "<talkid",
+        "<description",
+        "<reviewer",
+        "<translator",
+        "<title",
+        "<speaker",
+        "<doc",
+        "</doc",
     ]
     for i in range(100):
-        rand_string = " ".join(
-            random.choice(string.ascii_letters) for i in range(10)
-        )
+        rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
         # With a 10% chance, add one of the XML tags which is cleaned
         # to ensure cleaning happens appropriately
         if random.random() < 0.1:
@@ -79,7 +96,7 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
"""

base_dir = os.path.join(root_dir, DATASET_NAME)
temp_dataset_dir = os.path.join(base_dir, 'temp_dataset_dir')
temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir")
outer_temp_dataset_dir = os.path.join(temp_dataset_dir, f"texts/{src}/{tgt}/")
inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, f"{src}-{tgt}")

@@ -88,7 +105,9 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):

     mocked_data = defaultdict(lambda: defaultdict(list))
 
-    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(16, src, tgt, valid_set, test_set)
+    cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(
+        16, src, tgt, valid_set, test_set
+    )
     uncleaned_src_file = uncleaned_file_names[src][split]
     uncleaned_tgt_file = uncleaned_file_names[tgt][split]
 
@@ -97,7 +116,7 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):

     for (unclean_file_name, clean_file_name) in [
         (uncleaned_src_file, cleaned_src_file),
-        (uncleaned_tgt_file, cleaned_tgt_file)
+        (uncleaned_tgt_file, cleaned_tgt_file),
     ]:
         # Get file extension (i.e., the language) without the . prefix (.en -> en)
         lang = os.path.splitext(unclean_file_name)[1][1:]
@@ -144,20 +163,31 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()
 
-    @parameterized.expand([
-        (split, src, tgt, dev_set, test_set)
-        for split in ("train", "valid", "test")
-        for dev_set, test_set in DEV_TEST_SPLITS
-        for src, tgt in SUPPORTED_LANGPAIRS
-        if (dev_set not in SET_NOT_EXISTS[(src, tgt)] and test_set not in SET_NOT_EXISTS[(src, tgt)])
-    ])
+    @parameterized.expand(
+        [
+            (split, src, tgt, dev_set, test_set)
+            for split in ("train", "valid", "test")
+            for dev_set, test_set in DEV_TEST_SPLITS
+            for src, tgt in SUPPORTED_LANGPAIRS
+            if (
+                dev_set not in SET_NOT_EXISTS[(src, tgt)]
+                and test_set not in SET_NOT_EXISTS[(src, tgt)]
+            )
+        ]
+    )
     def test_iwslt2016(self, split, src, tgt, dev_set, test_set):
 
         with tempfile.TemporaryDirectory() as root_dir:
-            expected_samples = _get_mock_dataset(root_dir, split, src, tgt, dev_set, test_set)
+            expected_samples = _get_mock_dataset(
+                root_dir, split, src, tgt, dev_set, test_set
+            )
 
             dataset = IWSLT2016(
-                root=root_dir, split=split, language_pair=(src, tgt), valid_set=dev_set, test_set=test_set
+                root=root_dir,
+                split=split,
+                language_pair=(src, tgt),
+                valid_set=dev_set,
+                test_set=test_set,
             )
 
             samples = list(dataset)
@@ -171,9 +201,23 @@ def test_iwslt2016_split_argument(self, split):
language_pair = ("de", "en")
valid_set = "tst2013"
test_set = "tst2014"
_ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
dataset1 = IWSLT2016(root=root_dir, split=split, language_pair=language_pair, valid_set=valid_set, test_set=test_set)
(dataset2,) = IWSLT2016(root=root_dir, split=(split,), language_pair=language_pair, valid_set=valid_set, test_set=test_set)
_ = _get_mock_dataset(
root_dir, split, language_pair[0], language_pair[1], valid_set, test_set
)
dataset1 = IWSLT2016(
root=root_dir,
split=split,
language_pair=language_pair,
valid_set=valid_set,
test_set=test_set,
)
(dataset2,) = IWSLT2016(
root=root_dir,
split=(split,),
language_pair=language_pair,
valid_set=valid_set,
test_set=test_set,
)

for d1, d2 in zip_equal(dataset1, dataset2):
self.assertEqual(d1, d2)
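The DEV_TEST_SPLITS comprehension reformatted near the top of this file enumerates every ordered dev/test combination and drops pairs that would reuse one split for both roles. The same filtering in isolation, with a small illustrative subset of split names:

    import itertools

    supported = ["dev2010", "tst2013", "tst2014"]
    pairs = [
        (dev, test)
        for dev, test in itertools.product(supported, repeat=2)
        if dev != test
    ]
    assert ("tst2013", "tst2013") not in pairs  # same split twice is excluded
    assert ("dev2010", "tst2014") in pairs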
71 changes: 50 additions & 21 deletions test/datasets/test_iwslt2017.py
@@ -8,27 +8,40 @@
 from unittest.mock import patch
 
 from parameterized import parameterized
-from torchtext.datasets.iwslt2017 import DATASET_NAME, IWSLT2017, SUPPORTED_DATASETS, _PATH
 from torchtext.data.datasets_utils import _generate_iwslt_files_for_lang_and_split
+from torchtext.datasets.iwslt2017 import (
+    DATASET_NAME,
+    IWSLT2017,
+    SUPPORTED_DATASETS,
+    _PATH,
+)
 
 from ..common.case_utils import zip_equal
 from ..common.torchtext_test_case import TorchtextTestCase
 
-SUPPORTED_LANGPAIRS = [(k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v]
+SUPPORTED_LANGPAIRS = [
+    (k, e) for k, v in SUPPORTED_DATASETS["language_pair"].items() for e in v
+]
 
 
 def _generate_uncleaned_train():
     """Generate tags files"""
     file_contents = []
     examples = []
     xml_tags = [
-        '<url', '<keywords', '<talkid', '<description', '<reviewer',
-        '<translator', '<title', '<speaker', '<doc', '</doc'
+        "<url",
+        "<keywords",
+        "<talkid",
+        "<description",
+        "<reviewer",
+        "<translator",
+        "<title",
+        "<speaker",
+        "<doc",
+        "</doc",
     ]
     for i in range(100):
-        rand_string = " ".join(
-            random.choice(string.ascii_letters) for i in range(10)
-        )
+        rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
         # With a 10% chance, add one of the XML tags which is cleaned
         # to ensure cleaning happens appropriately
         if random.random() < 0.1:
@@ -76,16 +89,22 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):
"""

base_dir = os.path.join(root_dir, DATASET_NAME)
temp_dataset_dir = os.path.join(base_dir, 'temp_dataset_dir')
outer_temp_dataset_dir = os.path.join(temp_dataset_dir, "texts/DeEnItNlRo/DeEnItNlRo")
inner_temp_dataset_dir = os.path.join(outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo")
temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir")
outer_temp_dataset_dir = os.path.join(
temp_dataset_dir, "texts/DeEnItNlRo/DeEnItNlRo"
)
inner_temp_dataset_dir = os.path.join(
outer_temp_dataset_dir, "DeEnItNlRo-DeEnItNlRo"
)

os.makedirs(outer_temp_dataset_dir, exist_ok=True)
os.makedirs(inner_temp_dataset_dir, exist_ok=True)

mocked_data = defaultdict(lambda: defaultdict(list))

cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(17, src, tgt, valid_set, test_set)
cleaned_file_names, uncleaned_file_names = _generate_iwslt_files_for_lang_and_split(
17, src, tgt, valid_set, test_set
)
uncleaned_src_file = uncleaned_file_names[src][split]
uncleaned_tgt_file = uncleaned_file_names[tgt][split]

@@ -94,7 +113,7 @@ def _get_mock_dataset(root_dir, split, src, tgt, valid_set, test_set):

     for (unclean_file_name, clean_file_name) in [
         (uncleaned_src_file, cleaned_src_file),
-        (uncleaned_tgt_file, cleaned_tgt_file)
+        (uncleaned_tgt_file, cleaned_tgt_file),
     ]:
         # Get file extension (i.e., the language) without the . prefix (.en -> en)
         lang = os.path.splitext(unclean_file_name)[1][1:]
@@ -141,15 +160,19 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()
 
-    @parameterized.expand([
-        (split, src, tgt)
-        for split in ("train", "valid", "test")
-        for src, tgt in SUPPORTED_LANGPAIRS
-    ])
+    @parameterized.expand(
+        [
+            (split, src, tgt)
+            for split in ("train", "valid", "test")
+            for src, tgt in SUPPORTED_LANGPAIRS
+        ]
+    )
     def test_iwslt2017(self, split, src, tgt):
 
         with tempfile.TemporaryDirectory() as root_dir:
-            expected_samples = _get_mock_dataset(root_dir, split, src, tgt, "dev2010", "tst2010")
+            expected_samples = _get_mock_dataset(
+                root_dir, split, src, tgt, "dev2010", "tst2010"
+            )
 
             dataset = IWSLT2017(root=root_dir, split=split, language_pair=(src, tgt))
 
@@ -164,9 +187,15 @@ def test_iwslt2017_split_argument(self, split):
language_pair = ("de", "en")
valid_set = "dev2010"
test_set = "tst2010"
_ = _get_mock_dataset(root_dir, split, language_pair[0], language_pair[1], valid_set, test_set)
dataset1 = IWSLT2017(root=root_dir, split=split, language_pair=language_pair)
(dataset2,) = IWSLT2017(root=root_dir, split=(split,), language_pair=language_pair)
_ = _get_mock_dataset(
root_dir, split, language_pair[0], language_pair[1], valid_set, test_set
)
dataset1 = IWSLT2017(
root=root_dir, split=split, language_pair=language_pair
)
(dataset2,) = IWSLT2017(
root=root_dir, split=(split,), language_pair=language_pair
)

for d1, d2 in zip_equal(dataset1, dataset2):
self.assertEqual(d1, d2)
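Both split-argument tests compare a dataset built with split=split against one built with split=(split,) using zip_equal from the shared test utilities (test/common/case_utils.py). Judging from its use here, it behaves like zip but fails loudly on a length mismatch instead of truncating; a stand-in with that assumed behavior:

    import itertools

    def zip_equal(*iterables):
        # Assumed behavior of the real helper: yield zipped tuples,
        # raising if the iterables do not all have the same length.
        sentinel = object()
        for combo in itertools.zip_longest(*iterables, fillvalue=sentinel):
            if sentinel in combo:
                raise ValueError("iterables have different lengths")
            yield combo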