From 94a334945e7f0c6a13d6531abe3e097ac425eafd Mon Sep 17 00:00:00 2001 From: nayef211 Date: Fri, 7 Jan 2022 14:24:13 -0800 Subject: [PATCH 1/6] Updated max seq length for truncate in xlmr base. Updated xlmr docs. Moved xlmr tests to integration tests --- test/integration_tests/__init__.py | 0 test/integration_tests/test_models.py | 67 +++++++++++++++++++++++++++ test/models/test_models.py | 64 ------------------------- torchtext/models/roberta/bundler.py | 20 +++++++- 4 files changed, 86 insertions(+), 65 deletions(-) create mode 100644 test/integration_tests/__init__.py create mode 100644 test/integration_tests/test_models.py diff --git a/test/integration_tests/__init__.py b/test/integration_tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/integration_tests/test_models.py b/test/integration_tests/test_models.py new file mode 100644 index 0000000000..3240d43cc0 --- /dev/null +++ b/test/integration_tests/test_models.py @@ -0,0 +1,67 @@ +import torch +import torchtext + +from ..common.assets import get_asset_path +from ..common.torchtext_test_case import TorchtextTestCase + + +class TestModels(TorchtextTestCase): + def test_xlmr_base(self): + asset_path = get_asset_path("xlmr.base.output.pt") + test_text = "XLMR base Model Comparison" + + xlmr_base = torchtext.models.XLMR_BASE_ENCODER + transform = xlmr_base.transform() + model = xlmr_base.get_model() + model = model.eval() + + model_input = torch.tensor(transform([test_text])) + actual = model(model_input) + expected = torch.load(asset_path) + torch.testing.assert_close(actual, expected) + + def test_xlmr_base_jit(self): + asset_path = get_asset_path("xlmr.base.output.pt") + test_text = "XLMR base Model Comparison" + + xlmr_base = torchtext.models.XLMR_BASE_ENCODER + transform = xlmr_base.transform() + transform_jit = torch.jit.script(transform) + model = xlmr_base.get_model() + model = model.eval() + model_jit = torch.jit.script(model) + + model_input = torch.tensor(transform_jit([test_text])) + actual = model_jit(model_input) + expected = torch.load(asset_path) + torch.testing.assert_close(actual, expected) + + def test_xlmr_large(self): + asset_path = get_asset_path("xlmr.large.output.pt") + test_text = "XLMR base Model Comparison" + + xlmr_large = torchtext.models.XLMR_LARGE_ENCODER + transform = xlmr_large.transform() + model = xlmr_large.get_model() + model = model.eval() + + model_input = torch.tensor(transform([test_text])) + actual = model(model_input) + expected = torch.load(asset_path) + torch.testing.assert_close(actual, expected) + + def test_xlmr_large_jit(self): + asset_path = get_asset_path("xlmr.large.output.pt") + test_text = "XLMR base Model Comparison" + + xlmr_large = torchtext.models.XLMR_LARGE_ENCODER + transform = xlmr_large.transform() + transform_jit = torch.jit.script(transform) + model = xlmr_large.get_model() + model = model.eval() + model_jit = torch.jit.script(model) + + model_input = torch.tensor(transform_jit([test_text])) + actual = model_jit(model_input) + expected = torch.load(asset_path) + torch.testing.assert_close(actual, expected) diff --git a/test/models/test_models.py b/test/models/test_models.py index 58942acb62..cf14984917 100644 --- a/test/models/test_models.py +++ b/test/models/test_models.py @@ -3,7 +3,6 @@ from torch.nn import functional as torch_F import copy from ..common.torchtext_test_case import TorchtextTestCase -from ..common.assets import get_asset_path class TestModules(TorchtextTestCase): @@ -37,69 +36,6 @@ def test_self_attn_mask(self): class 
TestModels(TorchtextTestCase): - def test_xlmr_base_output(self): - asset_name = "xlmr.base.output.pt" - asset_path = get_asset_path(asset_name) - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - model = xlmr_base.get_model() - model = model.eval() - model_input = torch.tensor([[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]) - actual = model(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_base_jit_output(self): - asset_name = "xlmr.base.output.pt" - asset_path = get_asset_path(asset_name) - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - model = xlmr_base.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - model_input = torch.tensor([[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]) - actual = model_jit(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_large_output(self): - asset_name = "xlmr.large.output.pt" - asset_path = get_asset_path(asset_name) - xlmr_base = torchtext.models.XLMR_LARGE_ENCODER - model = xlmr_base.get_model() - model = model.eval() - model_input = torch.tensor([[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]) - actual = model(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_large_jit_output(self): - asset_name = "xlmr.large.output.pt" - asset_path = get_asset_path(asset_name) - xlmr_base = torchtext.models.XLMR_LARGE_ENCODER - model = xlmr_base.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - model_input = torch.tensor([[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]) - actual = model_jit(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_transform(self): - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - transform = xlmr_base.transform() - test_text = "XLMR base Model Comparison" - actual = transform([test_text]) - expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]] - torch.testing.assert_close(actual, expected) - - def test_xlmr_transform_jit(self): - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - transform = xlmr_base.transform() - transform_jit = torch.jit.script(transform) - test_text = "XLMR base Model Comparison" - actual = transform_jit([test_text]) - expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]] - torch.testing.assert_close(actual, expected) - def test_roberta_bundler_build_model(self): from torchtext.models import RobertaEncoderConf, RobertaClassificationHead, RobertaModel, RobertaModelBundle dummy_encoder_conf = RobertaEncoderConf(vocab_size=10, embedding_dim=16, ffn_dimension=64, num_attention_heads=2, num_encoder_layers=2) diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py index 7cd9e2c833..16d128fb4d 100644 --- a/torchtext/models/roberta/bundler.py +++ b/torchtext/models/roberta/bundler.py @@ -158,7 +158,7 @@ def encoderConf(self) -> RobertaEncoderConf: transform=lambda: T.Sequential( T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")), T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))), - T.Truncate(510), + T.Truncate(254), T.AddToken(token=0, begin=True), T.AddToken(token=2, begin=False), ) @@ -168,6 +168,15 @@ def encoderConf(self) -> RobertaEncoderConf: ''' XLM-R Encoder with Base configuration + The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning + at Scale `. 
It is a large multi-lingual language model,
+    trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
+
+    Originally published by the authors of XLM-RoBERTa under MIT License
+    and redistributed with the same license.
+    [`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
+    `Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
+
     Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
     '''
 )
@@ -189,6 +198,15 @@ def encoderConf(self) -> RobertaEncoderConf:
     '''
     XLM-R Encoder with Large configuration
 
+    The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning
+    at Scale <https://arxiv.org/abs/1911.02116>`. It is a large multi-lingual language model,
+    trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
+
+    Originally published by the authors of XLM-RoBERTa under MIT License
+    and redistributed with the same license.
+    [`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
+    `Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
+
     Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
     '''
 )

From 60bb626923e4c4144ddb9ac4d035eddeb13b4a5f Mon Sep 17 00:00:00 2001
From: nayef211
Date: Fri, 7 Jan 2022 14:36:56 -0800
Subject: [PATCH 2/6] Removing changes to truncate transform

---
 torchtext/models/roberta/bundler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py
index 16d128fb4d..6c3fc2ba09 100644
--- a/torchtext/models/roberta/bundler.py
+++ b/torchtext/models/roberta/bundler.py
@@ -158,7 +158,7 @@ def encoderConf(self) -> RobertaEncoderConf:
         transform=lambda: T.Sequential(
             T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")),
             T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))),
-            T.Truncate(254),
+            T.Truncate(510),
             T.AddToken(token=0, begin=True),
             T.AddToken(token=2, begin=False),
         )

From b6f08edaed36348c0d1d60571ec65354697414d1 Mon Sep 17 00:00:00 2001
From: nayef211
Date: Fri, 7 Jan 2022 18:35:57 -0800
Subject: [PATCH 3/6] Remove documentation changes from PR

---
 torchtext/models/roberta/bundler.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py
index 034a7f8646..fcfb82dbd6 100644
--- a/torchtext/models/roberta/bundler.py
+++ b/torchtext/models/roberta/bundler.py
@@ -168,15 +168,6 @@ def encoderConf(self) -> RobertaEncoderConf:
     '''
     XLM-R Encoder with Base configuration
 
-    The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning
-    at Scale <https://arxiv.org/abs/1911.02116>`. It is a large multi-lingual language model,
-    trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
-
-    Originally published by the authors of XLM-RoBERTa under MIT License
-    and redistributed with the same license.
-    [`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
-    `Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
-
     Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
     '''
 )
@@ -198,15 +189,6 @@ def encoderConf(self) -> RobertaEncoderConf:
     '''
     XLM-R Encoder with Large configuration
 
-    The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning
-    at Scale <https://arxiv.org/abs/1911.02116>`. It is a large multi-lingual language model,
-    trained on 2.5TB of filtered CommonCrawl data and based on the RoBERTa model architecture.
-
-    Originally published by the authors of XLM-RoBERTa under MIT License
-    and redistributed with the same license.
-    [`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
-    `Source <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`__]
-
     Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
''' ) From e2c79e7100e0e25f35050fe4d3546af43d862ec5 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Fri, 7 Jan 2022 19:01:28 -0800 Subject: [PATCH 4/6] Parameterized model tests --- test/integration_tests/test_models.py | 139 +++++++------------------- 1 file changed, 34 insertions(+), 105 deletions(-) diff --git a/test/integration_tests/test_models.py b/test/integration_tests/test_models.py index ef8dd876c9..f4c9eba687 100644 --- a/test/integration_tests/test_models.py +++ b/test/integration_tests/test_models.py @@ -1,127 +1,56 @@ import torch -import torchtext +from parameterized import parameterized +from torchtext.models import ( + XLMR_BASE_ENCODER, + XLMR_LARGE_ENCODER, + ROBERTA_BASE_ENCODER, + ROBERTA_LARGE_ENCODER, +) from ..common.assets import get_asset_path from ..common.torchtext_test_case import TorchtextTestCase +TEST_MODELS_PARAMETERIZED_ARGS = [ + ("xlmr.base.output.pt", "XLMR base Model Comparison", XLMR_BASE_ENCODER), + ("xlmr.large.output.pt", "XLMR base Model Comparison", XLMR_LARGE_ENCODER), + ( + "roberta.base.output.pt", + "Roberta base Model Comparison", + ROBERTA_BASE_ENCODER, + ), + ( + "roberta.large.output.pt", + "Roberta base Model Comparison", + ROBERTA_LARGE_ENCODER, + ), +] -class TestModels(TorchtextTestCase): - def test_xlmr_base(self): - asset_path = get_asset_path("xlmr.base.output.pt") - test_text = "XLMR base Model Comparison" - - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - transform = xlmr_base.transform() - model = xlmr_base.get_model() - model = model.eval() - - model_input = torch.tensor(transform([test_text])) - actual = model(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - def test_xlmr_base_jit(self): - asset_path = get_asset_path("xlmr.base.output.pt") - test_text = "XLMR base Model Comparison" - - xlmr_base = torchtext.models.XLMR_BASE_ENCODER - transform = xlmr_base.transform() - transform_jit = torch.jit.script(transform) - model = xlmr_base.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - - model_input = torch.tensor(transform_jit([test_text])) - actual = model_jit(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_large(self): - asset_path = get_asset_path("xlmr.large.output.pt") - test_text = "XLMR base Model Comparison" - - xlmr_large = torchtext.models.XLMR_LARGE_ENCODER - transform = xlmr_large.transform() - model = xlmr_large.get_model() - model = model.eval() - - model_input = torch.tensor(transform([test_text])) - actual = model(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_xlmr_large_jit(self): - asset_path = get_asset_path("xlmr.large.output.pt") - test_text = "XLMR base Model Comparison" - - xlmr_large = torchtext.models.XLMR_LARGE_ENCODER - transform = xlmr_large.transform() - transform_jit = torch.jit.script(transform) - model = xlmr_large.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - - model_input = torch.tensor(transform_jit([test_text])) - actual = model_jit(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_roberta_base(self): - asset_path = get_asset_path("roberta.base.output.pt") - test_text = "Roberta base Model Comparison" - - roberta_base = torchtext.models.ROBERTA_BASE_ENCODER - transform = roberta_base.transform() - model = roberta_base.get_model() - model = model.eval() - - model_input = torch.tensor(transform([test_text])) - 
actual = model(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_roberta_base_jit(self): - asset_path = get_asset_path("roberta.base.output.pt") - test_text = "Roberta base Model Comparison" - - roberta_base = torchtext.models.ROBERTA_BASE_ENCODER - transform = roberta_base.transform() - transform_jit = torch.jit.script(transform) - model = roberta_base.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - - model_input = torch.tensor(transform_jit([test_text])) - actual = model_jit(model_input) - expected = torch.load(asset_path) - torch.testing.assert_close(actual, expected) - - def test_roberta_large(self): - asset_path = get_asset_path("roberta.large.output.pt") - test_text = "Roberta base Model Comparison" +class TestModels(TorchtextTestCase): + @parameterized.expand(TEST_MODELS_PARAMETERIZED_ARGS) + def test_model(self, expected_asset_name, test_text, model_bundler): + expected_asset_path = get_asset_path(expected_asset_name) - roberta_large = torchtext.models.ROBERTA_LARGE_ENCODER - transform = roberta_large.transform() - model = roberta_large.get_model() + transform = model_bundler.transform() + model = model_bundler.get_model() model = model.eval() model_input = torch.tensor(transform([test_text])) actual = model(model_input) - expected = torch.load(asset_path) + expected = torch.load(expected_asset_path) torch.testing.assert_close(actual, expected) - def test_roberta_large_jit(self): - asset_path = get_asset_path("roberta.large.output.pt") - test_text = "Roberta base Model Comparison" + @parameterized.expand(TEST_MODELS_PARAMETERIZED_ARGS) + def test_model_jit(self, expected_asset_name, test_text, model_bundler): + expected_asset_path = get_asset_path(expected_asset_name) - roberta_large = torchtext.models.ROBERTA_LARGE_ENCODER - transform = roberta_large.transform() + transform = model_bundler.transform() transform_jit = torch.jit.script(transform) - model = roberta_large.get_model() + model = model_bundler.get_model() model = model.eval() model_jit = torch.jit.script(model) model_input = torch.tensor(transform_jit([test_text])) actual = model_jit(model_input) - expected = torch.load(asset_path) + expected = torch.load(expected_asset_path) torch.testing.assert_close(actual, expected) From 226d5ca64cdd8e2c511ada9ccae2c9d02a70ecf1 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Mon, 10 Jan 2022 13:52:15 -0800 Subject: [PATCH 5/6] Added nested_params helper method. 
Updated model integration test to parameterize a single method covering jit and non-jit tests --- test/common/parameterized_utils.py | 53 +++++++++++++++++++++--- test/integration_tests/test_models.py | 58 ++++++++++++--------------- 2 files changed, 73 insertions(+), 38 deletions(-) diff --git a/test/common/parameterized_utils.py b/test/common/parameterized_utils.py index 85d5bcb0f5..9c8aad346f 100644 --- a/test/common/parameterized_utils.py +++ b/test/common/parameterized_utils.py @@ -1,17 +1,60 @@ import json -from parameterized import param import os.path +from itertools import product +from parameterized import param, parameterized -_TEST_DIR_PATH = os.path.realpath( - os.path.join(os.path.dirname(__file__), '..')) + +_TEST_DIR_PATH = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) def get_asset_path(*paths): """Return full path of a test asset""" - return os.path.join(_TEST_DIR_PATH, 'asset', *paths) + return os.path.join(_TEST_DIR_PATH, "asset", *paths) def load_params(*paths): - with open(get_asset_path(*paths), 'r') as file: + with open(get_asset_path(*paths), "r") as file: return [param(json.loads(line)) for line in file] + + +def _name_func(func, _, params): + strs = [] + for arg in params.args: + if isinstance(arg, tuple): + strs.append("_".join(str(a) for a in arg)) + else: + strs.append(str(arg)) + # sanitize the test name + name = "_".join(strs).replace(".", "_") + return f"{func.__name__}_{name}" + + +def nested_params(*params_set): + """Generate the cartesian product of the given list of parameters. + Args: + params_set (list of parameters): Parameters. When using ``parameterized.param`` class, + all the parameters have to be specified with the class, only using kwargs. + """ + flatten = [p for params in params_set for p in params] + + # Parameters to be nested are given as list of plain objects + if all(not isinstance(p, param) for p in flatten): + args = list(product(*params_set)) + return parameterized.expand(args, name_func=_name_func) + + # Parameters to be nested are given as list of `parameterized.param` + if not all(isinstance(p, param) for p in flatten): + raise TypeError( + "When using ``parameterized.param``, " + "all the parameters have to be of the ``param`` type." + ) + if any(p.args for p in flatten): + raise ValueError( + "When using ``parameterized.param``, " + "all the parameters have to be provided as keyword argument." 
+ ) + args = [param()] + for params in params_set: + args = [param(**x.kwargs, **y.kwargs) for x in args for y in params] + return parameterized.expand(args) diff --git a/test/integration_tests/test_models.py b/test/integration_tests/test_models.py index f4c9eba687..4483680774 100644 --- a/test/integration_tests/test_models.py +++ b/test/integration_tests/test_models.py @@ -1,5 +1,4 @@ import torch -from parameterized import parameterized from torchtext.models import ( XLMR_BASE_ENCODER, XLMR_LARGE_ENCODER, @@ -8,49 +7,42 @@ ) from ..common.assets import get_asset_path +from ..common.parameterized_utils import nested_params from ..common.torchtext_test_case import TorchtextTestCase -TEST_MODELS_PARAMETERIZED_ARGS = [ - ("xlmr.base.output.pt", "XLMR base Model Comparison", XLMR_BASE_ENCODER), - ("xlmr.large.output.pt", "XLMR base Model Comparison", XLMR_LARGE_ENCODER), - ( - "roberta.base.output.pt", - "Roberta base Model Comparison", - ROBERTA_BASE_ENCODER, - ), - ( - "roberta.large.output.pt", - "Roberta base Model Comparison", - ROBERTA_LARGE_ENCODER, - ), -] - class TestModels(TorchtextTestCase): - @parameterized.expand(TEST_MODELS_PARAMETERIZED_ARGS) - def test_model(self, expected_asset_name, test_text, model_bundler): + @nested_params( + [ + ("xlmr.base.output.pt", "XLMR base Model Comparison", XLMR_BASE_ENCODER), + ("xlmr.large.output.pt", "XLMR base Model Comparison", XLMR_LARGE_ENCODER), + ( + "roberta.base.output.pt", + "Roberta base Model Comparison", + ROBERTA_BASE_ENCODER, + ), + ( + "roberta.large.output.pt", + "Roberta base Model Comparison", + ROBERTA_LARGE_ENCODER, + ), + ], + [True, False], + ) + def test_model(self, model_args, is_jit): + expected_asset_name, test_text, model_bundler = model_args + expected_asset_path = get_asset_path(expected_asset_name) transform = model_bundler.transform() model = model_bundler.get_model() model = model.eval() + if is_jit: + transform = torch.jit.script(transform) + model = torch.jit.script(model) + model_input = torch.tensor(transform([test_text])) actual = model(model_input) expected = torch.load(expected_asset_path) torch.testing.assert_close(actual, expected) - - @parameterized.expand(TEST_MODELS_PARAMETERIZED_ARGS) - def test_model_jit(self, expected_asset_name, test_text, model_bundler): - expected_asset_path = get_asset_path(expected_asset_name) - - transform = model_bundler.transform() - transform_jit = torch.jit.script(transform) - model = model_bundler.get_model() - model = model.eval() - model_jit = torch.jit.script(model) - - model_input = torch.tensor(transform_jit([test_text])) - actual = model_jit(model_input) - expected = torch.load(expected_asset_path) - torch.testing.assert_close(actual, expected) From 77278386685abddbf2c754df777576a7d56bbd97 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Mon, 10 Jan 2022 15:05:55 -0800 Subject: [PATCH 6/6] Added docstring for unit tests --- test/integration_tests/test_models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/integration_tests/test_models.py b/test/integration_tests/test_models.py index 64e95c956d..67e10b0ee8 100644 --- a/test/integration_tests/test_models.py +++ b/test/integration_tests/test_models.py @@ -45,6 +45,9 @@ class TestModels(TorchtextTestCase): [True, False], ) def test_model(self, model_args, is_jit): + """Verify pre-trained XLM-R and Roberta models in torchtext produce + the same output as the reference implementation within fairseq + """ expected_asset_name, test_text, model_bundler = model_args expected_asset_path = 
get_asset_path(expected_asset_name)
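
For context on what the parameterized test above exercises, here is a minimal eager-mode
sketch of the bundle pipeline. It is illustrative rather than part of the patch series: the
bundle, transform(), and get_model() names are the real torchtext entry points used in the
tests, the pinned token ids come from the removed unit test, and the 768-dimensional output
width is the standard XLM-R base size rather than something stated in the diffs. Running it
downloads the pretrained weights.

import torch
import torchtext

# The bundle packages the text transform and the pretrained encoder together.
bundle = torchtext.models.XLMR_BASE_ENCODER
transform = bundle.transform()     # SentencePiece -> vocab lookup -> truncate -> add BOS/EOS
model = bundle.get_model().eval()  # pretrained encoder, switched to eval mode

# The removed unit test pinned this exact tokenization:
# [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]
tokens = transform(["XLMR base Model Comparison"])

with torch.no_grad():
    output = model(torch.tensor(tokens))

# (batch, sequence length, embedding dim) -> torch.Size([1, 8, 768]) for the base encoder
print(output.shape)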
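For plain (non-``param``) inputs, the nested_params helper added in PATCH 5/6 reduces to a
cartesian product handed to parameterized.expand. The self-contained toy below makes that
expansion concrete; the class, method, and "asset_a"/"asset_b" names are illustrative, not
from the patch, and the custom _name_func sanitization is omitted for brevity.

import unittest
from itertools import product

from parameterized import parameterized


class ToyNestedParamsTest(unittest.TestCase):
    # Roughly what @nested_params(["asset_a", "asset_b"], [True, False]) expands to:
    # the product yields four (asset, is_jit) combinations, so parameterized
    # generates four independent test cases from this one method.
    @parameterized.expand(list(product(["asset_a", "asset_b"], [True, False])))
    def test_combination(self, asset, is_jit):
        self.assertIn(asset, {"asset_a", "asset_b"})
        self.assertIsInstance(is_jit, bool)


if __name__ == "__main__":
    unittest.main()

This is the same trade-off the PR makes in test_models.py: one test body covering the jit and
non-jit paths, with the combinatorial bookkeeping pushed into the decorator.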