From ecb2cf83f56a282fec85a31418163a0714b8e192 Mon Sep 17 00:00:00 2001
From: nayef211
Date: Thu, 3 Feb 2022 18:10:23 -0800
Subject: [PATCH 1/2] Added squad2 tests

---
 test/datasets/test_squad2.py | 104 +++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 test/datasets/test_squad2.py

diff --git a/test/datasets/test_squad2.py b/test/datasets/test_squad2.py
new file mode 100644
index 0000000000..e34b31be60
--- /dev/null
+++ b/test/datasets/test_squad2.py
@@ -0,0 +1,104 @@
+import json
+import os
+import random
+import string
+import uuid
+from collections import defaultdict
+from random import randint
+from unittest.mock import patch
+
+from parameterized import parameterized
+from torchtext.data.datasets_utils import _ParseSQuADQAData
+from torchtext.datasets.squad2 import SQuAD2
+
+from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.torchtext_test_case import TorchtextTestCase
+
+
+def _get_mock_json_data():
+    rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
+    mock_json_data = {
+        "data": [
+            {
+                "title": rand_string,
+                "paragraphs": [
+                    {
+                        "context": rand_string,
+                        "qas": [
+                            {
+                                "answers": [
+                                    {
+                                        "answer_start": randint(1, 1000),
+                                        "text": rand_string,
+                                    }
+                                ],
+                                "question": rand_string,
+                                "id": uuid.uuid1().hex,
+                            },
+                        ],
+                    }
+                ],
+            }
+        ]
+    }
+    return mock_json_data
+
+
+def _get_mock_dataset(root_dir):
+    """
+    root_dir: directory to the mocked dataset
+    """
+    base_dir = os.path.join(root_dir, "SQuAD2")
+    os.makedirs(base_dir, exist_ok=True)
+
+    mocked_data = defaultdict(list)
+    for file_name in ("train-v2.0.json", "dev-v2.0.json"):
+        txt_file = os.path.join(base_dir, file_name)
+        with open(txt_file, "w") as f:
+            mock_json_data = _get_mock_json_data()
+            f.write(json.dumps(mock_json_data))
+
+            split = "train" if "train" in file_name else "dev"
+            dataset_line = next(
+                iter(_ParseSQuADQAData([("file_handle", mock_json_data)]))
+            )
+            mocked_data[split].append(dataset_line)
+
+    return mocked_data
+
+
+class TestSQuAD1(TempDirMixin, TorchtextTestCase):
+    root_dir = None
+    samples = []
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.root_dir = cls.get_base_temp_dir()
+        cls.samples = _get_mock_dataset(cls.root_dir)
+        cls.patcher = patch(
+            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
+        )
+        cls.patcher.start()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.patcher.stop()
+        super().tearDownClass()
+
+    @parameterized.expand(["train", "dev"])
+    def test_squad2(self, split):
+        dataset = SQuAD2(root=self.root_dir, split=split)
+
+        samples = list(dataset)
+        expected_samples = self.samples[split]
+        for sample, expected_sample in zip_equal(samples, expected_samples):
+            self.assertEqual(sample, expected_sample)
+
+    @parameterized.expand(["train", "dev"])
+    def test_squad2_split_argument(self, split):
+        dataset1 = SQuAD2(root=self.root_dir, split=split)
+        (dataset2,) = SQuAD2(root=self.root_dir, split=(split,))
+
+        for d1, d2 in zip_equal(dataset1, dataset2):
+            self.assertEqual(d1, d2)

From 5a675782dfaecd4a15d3ea7efdc21409fb8ec6ba Mon Sep 17 00:00:00 2001
From: nayef211
Date: Mon, 7 Feb 2022 08:27:11 -0800
Subject: [PATCH 2/2] Parameterized squad1 and squad2 dataset tests

---
 .../{test_squad1.py => test_squad.py}  |  40 ++++---
 test/datasets/test_squad2.py           | 104 ------------------
 2 files changed, 25 insertions(+), 119 deletions(-)
 rename test/datasets/{test_squad1.py => test_squad.py} (69%)
 delete mode 100644 test/datasets/test_squad2.py

diff --git a/test/datasets/test_squad1.py b/test/datasets/test_squad.py
similarity index 69%
rename from test/datasets/test_squad1.py
rename to test/datasets/test_squad.py
index 75f1f61639..d44b6637f1 100644
--- a/test/datasets/test_squad1.py
+++ b/test/datasets/test_squad.py
@@ -7,11 +7,12 @@
 from random import randint
 from unittest.mock import patch
 
-from parameterized import parameterized
 from torchtext.data.datasets_utils import _ParseSQuADQAData
 from torchtext.datasets.squad1 import SQuAD1
+from torchtext.datasets.squad2 import SQuAD2
 
 from ..common.case_utils import TempDirMixin, zip_equal
+from ..common.parameterized_utils import nested_params
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
@@ -44,15 +45,20 @@ def _get_mock_json_data():
     return mock_json_data
 
 
-def _get_mock_dataset(root_dir):
+def _get_mock_dataset(root_dir, base_dir_name):
     """
     root_dir: directory to the mocked dataset
     """
-    base_dir = os.path.join(root_dir, "SQuAD1")
+    base_dir = os.path.join(root_dir, base_dir_name)
     os.makedirs(base_dir, exist_ok=True)
 
+    if base_dir_name == SQuAD1.__name__:
+        file_names = ("train-v1.1.json", "dev-v1.1.json")
+    else:
+        file_names = ("train-v2.0.json", "dev-v2.0.json")
+
     mocked_data = defaultdict(list)
-    for file_name in ("train-v1.1.json", "dev-v1.1.json"):
+    for file_name in file_names:
         txt_file = os.path.join(base_dir, file_name)
         with open(txt_file, "w") as f:
             mock_json_data = _get_mock_json_data()
@@ -67,7 +73,7 @@ def _get_mock_dataset(root_dir):
     return mocked_data
 
 
-class TestSQuAD1(TempDirMixin, TorchtextTestCase):
+class TestSQuAD(TempDirMixin, TorchtextTestCase):
     root_dir = None
     samples = []
 
@@ -75,7 +81,6 @@ class TestSQuAD1(TempDirMixin, TorchtextTestCase):
     def setUpClass(cls):
         super().setUpClass()
         cls.root_dir = cls.get_base_temp_dir()
-        cls.samples = _get_mock_dataset(cls.root_dir)
         cls.patcher = patch(
            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
         )
@@ -86,19 +91,24 @@ def tearDownClass(cls):
         cls.patcher.stop()
         super().tearDownClass()
 
-    @parameterized.expand(["train", "dev"])
-    def test_squad1(self, split):
-        dataset = SQuAD1(root=self.root_dir, split=split)
-
+    @nested_params([SQuAD1, SQuAD2], ["train", "dev"])
+    def test_squad(self, squad_dataset, split):
+        expected_samples = _get_mock_dataset(self.root_dir, squad_dataset.__name__)[
+            split
+        ]
+        dataset = squad_dataset(root=self.root_dir, split=split)
         samples = list(dataset)
-        expected_samples = self.samples[split]
+
         for sample, expected_sample in zip_equal(samples, expected_samples):
             self.assertEqual(sample, expected_sample)
 
-    @parameterized.expand(["train", "dev"])
-    def test_squad1_split_argument(self, split):
-        dataset1 = SQuAD1(root=self.root_dir, split=split)
-        (dataset2,) = SQuAD1(root=self.root_dir, split=(split,))
+    @nested_params([SQuAD1, SQuAD2], ["train", "dev"])
+    def test_squad_split_argument(self, squad_dataset, split):
+        # call `_get_mock_dataset` to create mock dataset files
+        _ = _get_mock_dataset(self.root_dir, squad_dataset.__name__)
+
+        dataset1 = squad_dataset(root=self.root_dir, split=split)
+        (dataset2,) = squad_dataset(root=self.root_dir, split=(split,))
 
         for d1, d2 in zip_equal(dataset1, dataset2):
             self.assertEqual(d1, d2)
diff --git a/test/datasets/test_squad2.py b/test/datasets/test_squad2.py
deleted file mode 100644
index e34b31be60..0000000000
--- a/test/datasets/test_squad2.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import json
-import os
-import random
-import string
-import uuid
-from collections import defaultdict
-from random import randint
-from unittest.mock import patch
-
-from parameterized import parameterized
-from torchtext.data.datasets_utils import _ParseSQuADQAData
-from torchtext.datasets.squad2 import SQuAD2
-
-from ..common.case_utils import TempDirMixin, zip_equal
-from ..common.torchtext_test_case import TorchtextTestCase
-
-
-def _get_mock_json_data():
-    rand_string = " ".join(random.choice(string.ascii_letters) for i in range(10))
-    mock_json_data = {
-        "data": [
-            {
-                "title": rand_string,
-                "paragraphs": [
-                    {
-                        "context": rand_string,
-                        "qas": [
-                            {
-                                "answers": [
-                                    {
-                                        "answer_start": randint(1, 1000),
-                                        "text": rand_string,
-                                    }
-                                ],
-                                "question": rand_string,
-                                "id": uuid.uuid1().hex,
-                            },
-                        ],
-                    }
-                ],
-            }
-        ]
-    }
-    return mock_json_data
-
-
-def _get_mock_dataset(root_dir):
-    """
-    root_dir: directory to the mocked dataset
-    """
-    base_dir = os.path.join(root_dir, "SQuAD2")
-    os.makedirs(base_dir, exist_ok=True)
-
-    mocked_data = defaultdict(list)
-    for file_name in ("train-v2.0.json", "dev-v2.0.json"):
-        txt_file = os.path.join(base_dir, file_name)
-        with open(txt_file, "w") as f:
-            mock_json_data = _get_mock_json_data()
-            f.write(json.dumps(mock_json_data))
-
-            split = "train" if "train" in file_name else "dev"
-            dataset_line = next(
-                iter(_ParseSQuADQAData([("file_handle", mock_json_data)]))
-            )
-            mocked_data[split].append(dataset_line)
-
-    return mocked_data
-
-
-class TestSQuAD1(TempDirMixin, TorchtextTestCase):
-    root_dir = None
-    samples = []
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.root_dir = cls.get_base_temp_dir()
-        cls.samples = _get_mock_dataset(cls.root_dir)
-        cls.patcher = patch(
-            "torchdata.datapipes.iter.util.cacheholder._hash_check", return_value=True
-        )
-        cls.patcher.start()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.patcher.stop()
-        super().tearDownClass()
-
-    @parameterized.expand(["train", "dev"])
-    def test_squad2(self, split):
-        dataset = SQuAD2(root=self.root_dir, split=split)
-
-        samples = list(dataset)
-        expected_samples = self.samples[split]
-        for sample, expected_sample in zip_equal(samples, expected_samples):
-            self.assertEqual(sample, expected_sample)
-
-    @parameterized.expand(["train", "dev"])
-    def test_squad2_split_argument(self, split):
-        dataset1 = SQuAD2(root=self.root_dir, split=split)
-        (dataset2,) = SQuAD2(root=self.root_dir, split=(split,))
-
-        for d1, d2 in zip_equal(dataset1, dataset2):
-            self.assertEqual(d1, d2)
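
Note on the `@nested_params([SQuAD1, SQuAD2], ["train", "dev"])` decorator used in [PATCH 2/2]: it is imported from the test suite's `..common.parameterized_utils` helper, whose implementation is not shown in this series. The sketch below is a minimal, self-contained illustration of the assumed behavior, namely expanding a single test method over the cross product of dataset variants and splits, built only on `itertools.product` and `parameterized.expand`. The `_fake_dataset` generator and the dataset names are hypothetical stand-ins for the mocked SQuAD datasets above, not torchtext APIs.

# Minimal sketch of cross-product test parameterization, assuming `nested_params`
# behaves like `parameterized.expand` applied to the product of its argument lists.
import itertools
import unittest

from parameterized import parameterized


def _fake_dataset(name, split):
    # Hypothetical stand-in for a mocked SQuAD-style iterable dataset:
    # yields a single mock sample for the requested split.
    yield ("{} mock sample for split '{}'".format(name, split),)


class TestCrossProductParams(unittest.TestCase):
    # Expands into four cases: (SQuADA, train), (SQuADA, dev), (SQuADB, train),
    # (SQuADB, dev), mirroring @nested_params([SQuAD1, SQuAD2], ["train", "dev"]).
    @parameterized.expand(itertools.product(["SQuADA", "SQuADB"], ["train", "dev"]))
    def test_dataset(self, dataset_name, split):
        samples = list(_fake_dataset(dataset_name, split))
        self.assertEqual(len(samples), 1)


if __name__ == "__main__":
    unittest.main()

Each expanded combination is reported as its own named test case, so a failure in, say, the dev split of one dataset variant is localized without rerunning the other combinations.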