This repository was archived by the owner on Sep 10, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 814
Add AmazonReviewPolarity Mocked Unit Test #1532
Merged
Nayef211
merged 7 commits into
pytorch:main
from
Nayef211:test/mock_amazon_review_polarity
Jan 24, 2022
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
8abc6fc
First attempt at adding test for amazon review polarity
df53fd3
Merge branch 'main' into test/mock_amazon_review_polarity
92e90f8
Updated dataset to take validate_hash param. Finalized tests
249de07
Created non empty tar file
09ae6db
Remove formatting. Patch _hash_check method from torchdata during tes…
263a257
Added super().setUpClass()
e4eca13
Remove commented import
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,40 @@ | ||
| import os.path | ||
| import tempfile | ||
| import unittest | ||
|
|
||
| from torchtext._internal.module_utils import is_module_available | ||
|
|
||
|
|
||
class TempDirMixin:
    """Mixin to provide easy access to temp dir"""

    # Lazily-created TemporaryDirectory shared by the whole test class.
    temp_dir_ = None

    @classmethod
    def get_base_temp_dir(cls):
        # Honor TORCHTEXT_TEST_TEMP_DIR when set instead of a temporary
        # directory — handy for debugging, since the directory then
        # survives the test run.
        override = os.environ.get("TORCHTEXT_TEST_TEMP_DIR")
        if override is not None:
            return override
        if cls.temp_dir_ is None:
            cls.temp_dir_ = tempfile.TemporaryDirectory()
        return cls.temp_dir_.name

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        # Detach first so a failure during cleanup can't leave a stale handle.
        temp_dir, cls.temp_dir_ = cls.temp_dir_, None
        if temp_dir is not None:
            temp_dir.cleanup()

    def get_temp_path(self, *paths):
        """Return a path under a per-test temp dir, creating parent dirs."""
        # Namespace by the test id so tests never collide on file names.
        path = os.path.join(self.get_base_temp_dir(), self.id(), *paths)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path
|
|
||
|
|
||
def skipIfNoModule(module, display_name=None):
    """Skip decorator for tests that need an optional *module*.

    ``display_name`` overrides the name shown in the skip message.
    """
    name = display_name or module
    missing = not is_module_available(module)
    return unittest.skipIf(missing, f'"{name}" is not available')
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| import os | ||
| import random | ||
| import string | ||
| import tarfile | ||
| from collections import defaultdict | ||
| from unittest.mock import patch | ||
|
|
||
| from parameterized import parameterized | ||
| from torchtext.datasets.amazonreviewpolarity import AmazonReviewPolarity | ||
|
|
||
| from ..common.case_utils import TempDirMixin | ||
| from ..common.torchtext_test_case import TorchtextTestCase | ||
|
|
||
|
|
||
def get_mock_dataset(root_dir):
    """Create a mocked AmazonReviewPolarity archive under *root_dir*.

    Writes 5 random samples per split into ``train.csv`` / ``test.csv``,
    packs them into ``amazon_review_polarity_csv.tar.gz``, and returns the
    expected ``(label, text)`` pairs keyed by split name.
    """
    base_dir = os.path.join(root_dir, "AmazonReviewPolarity")
    temp_dataset_dir = os.path.join(base_dir, "temp_dataset_dir")
    os.makedirs(temp_dataset_dir, exist_ok=True)

    seed = 1
    mocked_data = defaultdict(list)
    for file_name in ("train.csv", "test.csv"):
        split = os.path.splitext(file_name)[0]
        csv_path = os.path.join(temp_dataset_dir, file_name)
        with open(csv_path, "w") as f:
            for _ in range(5):
                label = seed % 2 + 1
                letters = [random.choice(string.ascii_letters) for _ in range(seed)]
                rand_string = " ".join(letters)
                # The dataset joins the title and content columns with a
                # space when read back, hence the doubled string here.
                mocked_data[split].append((label, f"{rand_string} {rand_string}"))
                f.write(f'"{label}","{rand_string}","{rand_string}"\n')
                seed += 1

    compressed_dataset_path = os.path.join(
        base_dir, "amazon_review_polarity_csv.tar.gz"
    )
    # Pack the csv files with the same top-level folder name as the real
    # archive so the dataset's extraction logic finds them.
    with tarfile.open(compressed_dataset_path, "w:gz") as tar:
        tar.add(temp_dataset_dir, arcname="amazon_review_polarity_csv")

    return mocked_data
|
|
||
|
|
||
class TestAmazonReviewPolarity(TempDirMixin, TorchtextTestCase):
    """Tests AmazonReviewPolarity against a locally generated mock archive."""

    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @parameterized.expand(["train", "test"])
    def test_amazon_review_polarity(self, split):
        # Patch the hash check so the locally generated archive is accepted
        # in place of the real download.
        with patch(
            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
        ):
            dataset = AmazonReviewPolarity(root=self.root_dir, split=split)
            samples = list(dataset)
            expected_samples = self.samples[split]
            self.assertEqual(len(samples), len(expected_samples))
            for (label, text), (exp_label, exp_text) in zip(samples, expected_samples):
                self.assertEqual(label, exp_label)
                self.assertEqual(text, exp_text)

    @parameterized.expand([("train", ("train",)), ("test", ("test",))])
    def test_amazon_review_polarity_split_argument(self, split1, split2):
        # A bare split string and a one-element split tuple must yield the
        # same dataset.
        with patch(
            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
        ):
            dataset1 = AmazonReviewPolarity(root=self.root_dir, split=split1)
            (dataset2,) = AmazonReviewPolarity(root=self.root_dir, split=split2)

            for d1, d2 in zip(dataset1, dataset2):
                self.assertEqual(d1, d2)
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it is technically feasible to Mock the HTTP link as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can look into this. I think it would be a matter of mocking the `GDriveReader` to return the tar file.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be doable, but you want to be careful how much you want to monkey patch things. When a test fixture gets complicated, it becomes error-prone, especially because monkey patching is not what we do in library code, thus we are not as familiar with the technique. And test fixtures are not tested; it's much harder to find bugs in fixtures.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to be clear, my query was to patch the URL (like the proposal to patch the MD5) and not the code (GDriveReader). Meaning that the patched URL would work like a normal URL but instead would fetch the mocked dataset. Is this the same understanding you had @Nayef211 , @mthrok ? I am not sure though if this is doable or if there are still concerns doing this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me make sure I understand your suggestion correctly @parmeet. The reason we are patching the
MD5 hash is because the tests would actually fail without it. Patching the URL in this case doesn't make sense to me since the `on_disk_cache` datapipe just expects to verify whether the files that were downloaded by the `GDriveReader` datapipe exist locally. What would the benefit of patching the URL to return the `tar.gz` archive accomplish that wouldn't be done by the AmazonReviewPolarity dataset implementation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, we probably do not want to write tests that explicitly tests external library logic/workflow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`OnDiskCache` would check the hash; if it doesn't match, it will trigger the following ops (Downloader or Extract) in this case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, as a very stupid example, if by mistake I end up changing GDriveReader to HttpReader, or change the url_dp to point to something else (instead of URL), I could completely bypass the current unit testing. That said, I don't want to block this PR for this. I don't mind merging it as such, as it is already a huge improvement over what we had. Feel free to merge it without further ado :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regarding the @erip's points I think they are valid. As I do not know which component implements which logic, if 1 and 2 are what torchdata is responsible for and tested there, I do not think we need to test it here.
In the context of mocking and patching, I prefer to avoid patching the code as much as possible, because in my experience, patching requires more knowledge about the implementation detail, and as the library code changes, these implementation detail changes, thus it increases the maintenance cost. Which is why I said "be careful how much you want to monkey patch stuff". (Of course, the notable exception to this is network call and such, which otherwise would not be possible.)
The reason why I chose to mock the dataset is because it won't change. The datasets by nature are not supposed to change. So even when the library code changes dramatically, the implementation should be centered around the data. Thus we would not need to change the test fixture that much. (And as a bonus, we found an unexpected bug when the mock data was introduced: the order of files returned by glob was not consistent across OSes. This would not be caught by patching.)
Note that in torchaudio repo, we do not test the download part and the dataset implementation does not check cache if not downloading a new archive. so the code coverage is not 100%. I personally consider this a reasonable trade-off.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the detailed feedback and for describing how torchaudio approaches testing datasets @mthrok. Moving forward, I am going to stick to testing the core functionality of the dataset functions in this test suite.
I still think @parmeet makes a valid point about getting better test coverage by patching the
`GDriveReader` datapipe to return the tar archive, even if it adds a bit of maintenance overhead (like you said, it's a tradeoff between maintenance and test coverage). I am going to merge this PR as is, but let me add this patch in a follow-up PR so we can discuss further whether it makes sense from a maintenance vs. test coverage perspective.