Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions test/common/case_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,40 @@
import os.path
import tempfile
import unittest

from torchtext._internal.module_utils import is_module_available


class TempDirMixin:
    """Mixin that gives test cases easy access to a per-class temporary directory."""

    # Lazily-created TemporaryDirectory, shared by all tests of the class.
    temp_dir_ = None

    @classmethod
    def get_base_temp_dir(cls):
        """Return the base temp directory, creating it on first use.

        If the ``TORCHTEXT_TEST_TEMP_DIR`` environment variable is set, its
        value is used instead of an auto-created temporary directory — handy
        for inspecting test artifacts while debugging.
        """
        override = os.environ.get("TORCHTEXT_TEST_TEMP_DIR")
        if override is not None:
            return override
        if cls.temp_dir_ is None:
            cls.temp_dir_ = tempfile.TemporaryDirectory()
        return cls.temp_dir_.name

    @classmethod
    def tearDownClass(cls):
        """Clean up the auto-created temp directory (no-op when the env override was used)."""
        super().tearDownClass()
        if cls.temp_dir_ is not None:
            cls.temp_dir_.cleanup()
            cls.temp_dir_ = None

    def get_temp_path(self, *paths):
        """Return a path under a test-specific subdirectory, creating parent dirs."""
        path = os.path.join(self.get_base_temp_dir(), self.id(), *paths)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path


def skipIfNoModule(module, display_name=None):
    """Return a decorator that skips the test when *module* is not importable.

    Args:
        module: importable module name checked via ``is_module_available``.
        display_name: optional human-readable name used in the skip message;
            defaults to the module name itself.
    """
    name = module if display_name is None else display_name
    return unittest.skipIf(not is_module_available(module), f'"{name}" is not available')
Empty file added test/datasets/__init__.py
Empty file.
82 changes: 82 additions & 0 deletions test/datasets/amazonreviewpolarity_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import random
import string
import tarfile
from collections import defaultdict
from unittest.mock import patch

from parameterized import parameterized
from torchtext.datasets.amazonreviewpolarity import AmazonReviewPolarity

from ..common.case_utils import TempDirMixin
from ..common.torchtext_test_case import TorchtextTestCase


def get_mock_dataset(root_dir):
    """Create a mocked AmazonReviewPolarity dataset archive on disk.

    Writes small ``train.csv`` / ``test.csv`` files filled with random
    letters, packs them into ``amazon_review_polarity_csv.tar.gz`` under
    ``root_dir/AmazonReviewPolarity``, and returns the samples the dataset
    is expected to yield for each split.

    Args:
        root_dir: directory in which the mocked dataset is laid out.

    Returns:
        dict mapping split name ("train"/"test") to a list of
        ``(label, text)`` tuples.
    """
    base_dir = os.path.join(root_dir, "AmazonReviewPolarity")
    temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir")
    os.makedirs(temp_dataset_dir, exist_ok=True)

    seed = 1
    mocked_data = defaultdict(list)
    for file_name in ("train.csv", "test.csv"):
        split = os.path.splitext(file_name)[0]
        with open(os.path.join(temp_dataset_dir, file_name), "w") as f:
            for _ in range(5):
                label = seed % 2 + 1
                rand_string = " ".join(
                    random.choice(string.ascii_letters) for _ in range(seed)
                )
                # The CSV row carries the two text columns separately; the
                # expected sample stores them joined by a space — presumably
                # mirroring how the dataset loader combines them (verified by
                # the assertions in the test class below).
                mocked_data[split].append((label, f"{rand_string} {rand_string}"))
                f.write(f'"{label}","{rand_string}","{rand_string}"\n')
                seed += 1

    archive_path = os.path.join(base_dir, "amazon_review_polarity_csv.tar.gz")
    # Pack the split files under the same top-level name as the real archive.
    with tarfile.open(archive_path, "w:gz") as tar:
        tar.add(temp_dataset_dir, arcname="amazon_review_polarity_csv")

    return mocked_data


class TestAmazonReviewPolarity(TempDirMixin, TorchtextTestCase):
    """Tests the AmazonReviewPolarity dataset against a mocked on-disk archive."""

    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @parameterized.expand(["train", "test"])
    def test_amazon_review_polarity(self, split):
        # Bypass MD5 verification so the mocked archive is accepted as a valid
        # cache and no download is attempted.
        with patch(
            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
        ):
            dataset = AmazonReviewPolarity(root=self.root_dir, split=split)
            num_samples = 0
            for idx, (label, text) in enumerate(dataset):
                assert (label, text) == self.samples[split][idx]
                num_samples += 1
            # Every expected sample must have been yielded exactly once.
            assert num_samples == len(self.samples[split])

    @parameterized.expand([("train", ("train",)), ("test", ("test",))])
    def test_amazon_review_polarity_split_argument(self, split1, split2):
        # A plain string split and the equivalent 1-tuple split must yield
        # identical samples.
        with patch(
            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
        ):
            dataset1 = AmazonReviewPolarity(root=self.root_dir, split=split1)
            (dataset2,) = AmazonReviewPolarity(root=self.root_dir, split=split2)

            for sample1, sample2 in zip(dataset1, dataset2):
                self.assertEqual(sample1, sample2)