Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
67 changes: 27 additions & 40 deletions test/experimental/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,42 @@
import hashlib
import json
import os
import shutil
import tempfile

from torchtext.experimental.datasets import sst2

from ..common.assets import get_asset_path
from ..common.assets import _ASSET_DIR
from ..common.case_utils import skipIfNoModule
from ..common.torchtext_test_case import TorchtextTestCase


class TestDataset(TorchtextTestCase):
    @skipIfNoModule("torchdata")
    def test_sst2__dataset(self):
        """Smoke-test the experimental SST2 dataset builder.

        The asset directory already contains a zip file holding the first 10
        lines of the real SST2 dataset, so the datasets are built straight
        from ``_ASSET_DIR`` with hash validation disabled.
        """
        split = ("train", "dev", "test")
        train_dataset, dev_dataset, test_dataset = sst2.SST2(
            split=split, root=_ASSET_DIR, validate_hash=False
        )

        # verify datasets objects are instances of SST2Dataset
        for dataset in (train_dataset, dev_dataset, test_dataset):
            self.assertTrue(isinstance(dataset, sst2.SST2Dataset))

        # verify the md5 of the first sample in each split matches the
        # reference digests recorded in sst2._FIRST_LINE_MD5
        datasets = {
            "train": train_dataset,
            "dev": dev_dataset,
            "test": test_dataset,
        }
        for split_name, dataset in datasets.items():
            # sort_keys makes the JSON serialization deterministic so the
            # digest is stable across runs
            digest = hashlib.md5(
                json.dumps(next(iter(dataset)), sort_keys=True).encode("utf-8")
            ).hexdigest()
            self.assertEqual(digest, sst2._FIRST_LINE_MD5[split_name])
4 changes: 3 additions & 1 deletion torchtext/experimental/datasets/sst2.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,9 @@ def _get_datapipe(self, root, split, validate_hash):
)

# extract data from zip
extracted_files = check_cache_dp.read_from_zip().filter(lambda x: split in x[0])
extracted_files = check_cache_dp.read_from_zip().filter(
lambda x: f"{split}.tsv" in x[0]
)

# Parse CSV file and yield data samples
return extracted_files.parse_csv(skip_lines=1, delimiter="\t").map(
Expand Down