From 6daa7bdebe28a002ae45a28dca31935200b2e2aa Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Mon, 3 Jan 2022 19:34:40 -0500
Subject: [PATCH 1/7] add scriptable sequential transform

---
 docs/source/transforms.rst           |  9 ++++++++-
 test/test_transforms.py              | 25 +++++++++++++++++++++++++
 torchtext/experimental/transforms.py | 18 ------------------
 torchtext/transforms.py              | 14 ++++++++++++++
 4 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 6393dd11d2..41f7d0e92c 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -7,7 +7,7 @@ torchtext.transforms
 .. automodule:: torchtext.transforms
 .. currentmodule:: torchtext.transforms
 
-Transforms are common text transforms. They can be chained together using :class:`torch.nn.Sequential`
+Transforms are common text transforms. They can be chained together using :class:`torch.nn.Sequential` or using :class:`torchtext.transforms.Sequential` to support torch-scriptability.
 
 SentencePieceTokenizer
 ----------------------
@@ -51,3 +51,10 @@ AddToken
 .. autoclass:: AddToken
 
     .. automethod:: forward
+
+Sequential
+----------
+
+.. autoclass:: Sequential
+
+    .. automethod:: forward
diff --git a/test/test_transforms.py b/test/test_transforms.py
index 3aebb90b47..aad46f1317 100644
--- a/test/test_transforms.py
+++ b/test/test_transforms.py
@@ -203,6 +203,31 @@ def test_add_token(self):
     def test_add_token_jit(self):
         self._add_token(test_scripting=True)
 
+    def _sequential(self, test_scripting):
+        max_seq_len = 3
+        padding_val = 0
+        transform = transforms.Sequential(
+            transforms.Truncate(max_seq_len=max_seq_len),
+            transforms.ToTensor(padding_value=padding_val, dtype=torch.long)
+        )
+
+        if test_scripting:
+            transform = torch.jit.script(transform)
+
+        input = [[1, 2, 3], [1, 2, 3]]
+
+        actual = transform(input)
+        expected = torch.tensor(input)
+        torch.testing.assert_close(actual, expected)
+
+    def test_sequential(self):
+        """test pipelining transforms using Sequential transform"""
+        self._sequential(test_scripting=False)
+
+    def test_sequential_jit(self):
+        """test pipelining transforms using Sequential transform, ensuring the composite transform is scriptable"""
+        self._sequential(test_scripting=True)
+
 
 class TestGPT2BPETokenizer(TorchtextTestCase):
     def _gpt2_bpe_tokenizer(self, test_scripting):
diff --git a/torchtext/experimental/transforms.py b/torchtext/experimental/transforms.py
index aa37044d22..e29c50bbdd 100644
--- a/torchtext/experimental/transforms.py
+++ b/torchtext/experimental/transforms.py
@@ -164,24 +164,6 @@ def __prepare_scriptable__(self):
         return RegexTokenizer(regex_tokenizer)
 
 
-class TextSequentialTransforms(nn.Sequential):
-    r"""A container to host a sequential text transforms.
-
-        Example:
-            >>> import torch
-            >>> from torchtext.experimental.transforms import basic_english_normalize, TextSequentialTransforms
-            >>> tokenizer = basic_english_normalize()
-            >>> txt_pipeline = TextSequentialTransforms(tokenizer)
-            >>> txt_pipeline('here is an example')
-            ['here', 'is', 'an', 'example']
-            >>> jit_txt_pipeline = torch.jit.script(txt_pipeline)
-    """
-
-    def forward(self, input: str):
-        for module in self:
-            input = module(input)
-        return input
-
 
 PRETRAINED_SP_MODEL = {
     'text_unigram_15000': 'https://pytorch.s3.amazonaws.com/models/text/pretrained_spm/text_unigram_15000.model',
diff --git a/torchtext/transforms.py b/torchtext/transforms.py
index 3d8fe5bc22..81939d428d 100644
--- a/torchtext/transforms.py
+++ b/torchtext/transforms.py
@@ -485,3 +485,17 @@ def bytes_to_unicode():
             n += 1
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
+
+
+class Sequential(torch.nn.Sequential):
+    r"""A container to host a sequence of text transforms.
+    """
+
+    def forward(self, input: Any) -> Any:
+        """
+        :param input: Input sequence or batch. The input type must be supported by the first transform in the sequence.
+        :type input: `Any`
+        """
+        for module in self:
+            input = module(input)
+        return input

From dd0fcca52deec9500bf48d95b0f896a8d961430b Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Mon, 3 Jan 2022 19:40:41 -0500
Subject: [PATCH 2/7] fix flake

---
 torchtext/experimental/transforms.py | 2 --
 torchtext/transforms.py              | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/torchtext/experimental/transforms.py b/torchtext/experimental/transforms.py
index e29c50bbdd..5b0b2e8aed 100644
--- a/torchtext/experimental/transforms.py
+++ b/torchtext/experimental/transforms.py
@@ -12,7 +12,6 @@
     'regex_tokenizer',
     'BasicEnglishNormalize',
     'RegexTokenizer',
-    'TextSequentialTransforms',
     'PRETRAINED_SP_MODEL',
     'load_sp_model',
     'sentencepiece_tokenizer',
@@ -164,7 +163,6 @@ def __prepare_scriptable__(self):
         return RegexTokenizer(regex_tokenizer)
 
 
-
 PRETRAINED_SP_MODEL = {
     'text_unigram_15000': 'https://pytorch.s3.amazonaws.com/models/text/pretrained_spm/text_unigram_15000.model',
     'text_unigram_25000': 'https://pytorch.s3.amazonaws.com/models/text/pretrained_spm/text_unigram_25000.model',
diff --git a/torchtext/transforms.py b/torchtext/transforms.py
index 81939d428d..eaeaf4c372 100644
--- a/torchtext/transforms.py
+++ b/torchtext/transforms.py
@@ -18,6 +18,7 @@
     'Truncate',
     'AddToken',
     'GPT2BPETokenizer',
+    'Sequential',
 ]
 
 

From 3afc3631de55c56f88af6c4c34b8120c561294bb Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Mon, 3 Jan 2022 21:02:05 -0500
Subject: [PATCH 3/7] remove xlmr transform and use sequential transform for
 composition

---
 test/models/test_models.py             |  8 +--
 torchtext/models/roberta/bundler.py    | 26 +++++---
 torchtext/models/roberta/transforms.py | 82 --------------------------
 3 files changed, 21 insertions(+), 95 deletions(-)
 delete mode 100644 torchtext/models/roberta/transforms.py

diff --git a/test/models/test_models.py b/test/models/test_models.py
index 58942acb62..6a03a7cb22 100644
--- a/test/models/test_models.py
+++ b/test/models/test_models.py
@@ -87,17 +87,17 @@ def test_xlmr_transform(self):
         xlmr_base = torchtext.models.XLMR_BASE_ENCODER
         transform = xlmr_base.transform()
         test_text = "XLMR base Model Comparison"
-        actual = transform([test_text])
-        expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]
+        actual = transform(test_text)
+        expected = torch.tensor([0, 43523, 52005, 3647, 13293, 113307, 40514, 2])
         torch.testing.assert_close(actual, expected)
 
     def test_xlmr_transform_jit(self):
         xlmr_base = torchtext.models.XLMR_BASE_ENCODER
         transform = xlmr_base.transform()
         transform_jit = torch.jit.script(transform)
         test_text = "XLMR base Model Comparison"
-        actual = transform_jit([test_text])
-        expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]
+        actual = transform_jit(test_text)
+        expected = torch.tensor([0, 43523, 52005, 3647, 13293, 113307, 40514, 2])
         torch.testing.assert_close(actual, expected)
 
     def test_roberta_bundler_build_model(self):
diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py
index 58774aacd5..07a4af9e65 100644
--- a/torchtext/models/roberta/bundler.py
+++ b/torchtext/models/roberta/bundler.py
@@ -15,7 +15,7 @@
     RobertaModel,
 )
 
-from .transforms import get_xlmr_transform
+import torchtext.transforms as T
 
 from torchtext import _TEXT_BUCKET
 
@@ -156,10 +156,14 @@ def encoderConf(self) -> RobertaEncoderConf:
 XLMR_BASE_ENCODER = RobertaModelBundle(
     _path=urljoin(_TEXT_BUCKET, "xlmr.base.encoder.pt"),
     _encoder_conf=RobertaEncoderConf(vocab_size=250002),
-    transform=partial(get_xlmr_transform,
-                      vocab_path=urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"),
-                      spm_model_path=urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model"),
-                      )
+    transform=lambda: T.Sequential(
+        T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")),
+        T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))),
+        T.Truncate(510),
+        T.AddToken(token=0, begin=True),
+        T.AddToken(token=2, begin=False),
+        T.ToTensor(padding_value=1)
+    )
 )
 
 XLMR_BASE_ENCODER.__doc__ = (
@@ -174,10 +178,14 @@ def encoderConf(self) -> RobertaEncoderConf:
 XLMR_LARGE_ENCODER = RobertaModelBundle(
     _path=urljoin(_TEXT_BUCKET, "xlmr.large.encoder.pt"),
     _encoder_conf=RobertaEncoderConf(vocab_size=250002, embedding_dim=1024, ffn_dimension=4096, num_attention_heads=16, num_encoder_layers=24),
-    transform=partial(get_xlmr_transform,
-                      vocab_path=urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"),
-                      spm_model_path=urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model"),
-                      )
+    transform=lambda: T.Sequential(
+        T.SentencePieceTokenizer(urljoin(_TEXT_BUCKET, "xlmr.sentencepiece.bpe.model")),
+        T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "xlmr.vocab.pt"))),
+        T.Truncate(510),
+        T.AddToken(token=0, begin=True),
+        T.AddToken(token=2, begin=False),
+        T.ToTensor(padding_value=1)
+    )
 )
 
 XLMR_LARGE_ENCODER.__doc__ = (
diff --git a/torchtext/models/roberta/transforms.py b/torchtext/models/roberta/transforms.py
deleted file mode 100644
index 683b6406be..0000000000
--- a/torchtext/models/roberta/transforms.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import os
-import torch
-from torch.nn import Module
-from torchtext._download_hooks import load_state_dict_from_url
-from torchtext import transforms
-from torchtext import functional
-
-from typing import List, Any
-
-
-class XLMRobertaModelTransform(Module):
-    def __init__(
-        self,
-        vocab_path: str,
-        spm_model_path: str,
-        bos_token: str = "<s>",
-        cls_token: str = "<s>",
-        pad_token: str = "<pad>",
-        eos_token: str = "</s>",
-        sep_token: str = "</s>",
-        unk_token: str = "<unk>",
-        mask_token: str = "<mask>",
-        max_seq_len: int = 514,
-    ):
-        super().__init__()
-        self.bos_token = bos_token
-        self.eos_token = eos_token
-        self.pad_token = pad_token
-        self.unk_token = unk_token
-        self.mask_token = mask_token
-        self.cls_token = cls_token
-        self.sep_token = sep_token
-        self.max_seq_len = max_seq_len
-
-        self.token_transform = transforms.SentencePieceTokenizer(spm_model_path)
-
-        if os.path.exists(vocab_path):
-            self.vocab = torch.load(vocab_path)
-        else:
-            self.vocab = load_state_dict_from_url(vocab_path)
-
-        self.vocab_transform = transforms.VocabTransform(self.vocab)
-        self.pad_idx = self.vocab[self.pad_token]
-        self.bos_idx = self.vocab[self.bos_token]
-        self.eos_idx = self.vocab[self.eos_token]
-
-    def forward(self, input: Any,
-                add_bos: bool = True,
-                add_eos: bool = True,
-                truncate: bool = True) -> Any:
-        if torch.jit.isinstance(input, str):
-            tokens = self.vocab_transform(self.token_transform(input))
-
-            if truncate:
-                tokens = functional.truncate(tokens, self.max_seq_len - 2)
-
-            if add_bos:
-                tokens = functional.add_token(tokens, self.bos_idx)
-
-            if add_eos:
-                tokens = functional.add_token(tokens, self.eos_idx, begin=False)
-
-            return tokens
-        elif torch.jit.isinstance(input, List[str]):
-            tokens = self.vocab_transform(self.token_transform(input))
-
-            if truncate:
-                tokens = functional.truncate(tokens, self.max_seq_len - 2)
-
-            if add_bos:
-                tokens = functional.add_token(tokens, self.bos_idx)
-
-            if add_eos:
-                tokens = functional.add_token(tokens, self.eos_idx, begin=False)
-
-            return tokens
-        else:
-            raise TypeError("Input type not supported")
-
-
-def get_xlmr_transform(vocab_path, spm_model_path, **kwargs) -> XLMRobertaModelTransform:
-    return XLMRobertaModelTransform(vocab_path, spm_model_path, **kwargs)

From ab1d8984b70daf1296ec98858a85351b0b4d183f Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 5 Jan 2022 18:59:57 -0500
Subject: [PATCH 4/7] remove totensor from transform

---
 torchtext/models/roberta/bundler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py
index 07a4af9e65..0f79450c86 100644
--- a/torchtext/models/roberta/bundler.py
+++ b/torchtext/models/roberta/bundler.py
@@ -162,7 +162,6 @@ def encoderConf(self) -> RobertaEncoderConf:
         T.Truncate(510),
         T.AddToken(token=0, begin=True),
         T.AddToken(token=2, begin=False),
-        T.ToTensor(padding_value=1)
     )
 )
 
@@ -184,7 +183,6 @@ def encoderConf(self) -> RobertaEncoderConf:
         T.Truncate(510),
         T.AddToken(token=0, begin=True),
         T.AddToken(token=2, begin=False),
-        T.ToTensor(padding_value=1)
     )
 )
 

From 1713f8eea4dad5127341e648c6f14a4bae46b983 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 5 Jan 2022 19:04:08 -0500
Subject: [PATCH 5/7] fix tests

---
 test/models/test_models.py |  8 ++++----
 test/test_transforms.py    | 25 -------------------------
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/test/models/test_models.py b/test/models/test_models.py
index 6a03a7cb22..db79a77743 100644
--- a/test/models/test_models.py
+++ b/test/models/test_models.py
@@ -88,8 +88,8 @@ def test_xlmr_transform(self):
         transform = xlmr_base.transform()
         test_text = "XLMR base Model Comparison"
         actual = transform(test_text)
-        expected = torch.tensor([0, 43523, 52005, 3647, 13293, 113307, 40514, 2])
-        torch.testing.assert_close(actual, expected)
+        expected = [0, 43523, 52005, 3647, 13293, 113307, 40514, 2]
+        self.assertEqual(actual, expected)
 
     def test_xlmr_transform_jit(self):
         xlmr_base = torchtext.models.XLMR_BASE_ENCODER
@@ -97,8 +97,8 @@ def test_xlmr_transform_jit(self):
         transform_jit = torch.jit.script(transform)
         test_text = "XLMR base Model Comparison"
         actual = transform_jit(test_text)
-        expected = torch.tensor([0, 43523, 52005, 3647, 13293, 113307, 40514, 2])
-        torch.testing.assert_close(actual, expected)
+        expected = [0, 43523, 52005, 3647, 13293, 113307, 40514, 2]
+        self.assertEqual(actual, expected)
 
     def test_roberta_bundler_build_model(self):
         from torchtext.models import RobertaEncoderConf, RobertaClassificationHead, RobertaModel, RobertaModelBundle
diff --git a/test/test_transforms.py b/test/test_transforms.py
index d20e9b2888..56dc4ed2e1 100644
--- a/test/test_transforms.py
+++ b/test/test_transforms.py
@@ -204,31 +204,6 @@ def test_add_token(self):
     def test_add_token_jit(self):
         self._add_token(test_scripting=True)
 
-    def _sequential(self, test_scripting):
-        max_seq_len = 3
-        padding_val = 0
-        transform = transforms.Sequential(
-            transforms.Truncate(max_seq_len=max_seq_len),
-            transforms.ToTensor(padding_value=padding_val, dtype=torch.long)
-        )
-
-        if test_scripting:
-            transform = torch.jit.script(transform)
-
-        input = [[1, 2, 3], [1, 2, 3]]
-
-        actual = transform(input)
-        expected = torch.tensor(input)
-        torch.testing.assert_close(actual, expected)
-
-    def test_sequential(self):
-        """test pipelining transforms using Sequential transform"""
-        self._sequential(test_scripting=False)
-
-    def test_sequential_jit(self):
-        """test pipelining transforms using Sequential transform, ensuring the composite transform is scriptable"""
-        self._sequential(test_scripting=True)
-
 
 class TestSequential(TorchtextTestCase):
     def _sequential(self, test_scripting):

From 2f936c7810df6879a4a17e0089b2b5df2fbb1e01 Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 5 Jan 2022 19:07:43 -0500
Subject: [PATCH 6/7] fix flake

---
 torchtext/models/roberta/bundler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchtext/models/roberta/bundler.py b/torchtext/models/roberta/bundler.py
index 0f79450c86..7cd9e2c833 100644
--- a/torchtext/models/roberta/bundler.py
+++ b/torchtext/models/roberta/bundler.py
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from functools import partial
 from urllib.parse import urljoin
 
 from typing import Optional, Callable, Dict, Union, Any

From f3ae69be805f2090c132d05f4c6f86d47eacd16b Mon Sep 17 00:00:00 2001
From: Parmeet Singh Bhatia
Date: Wed, 5 Jan 2022 19:11:27 -0500
Subject: [PATCH 7/7] revert test_models.py

---
 test/models/test_models.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/models/test_models.py b/test/models/test_models.py
index db79a77743..58942acb62 100644
--- a/test/models/test_models.py
+++ b/test/models/test_models.py
@@ -87,18 +87,18 @@ def test_xlmr_transform(self):
         xlmr_base = torchtext.models.XLMR_BASE_ENCODER
         transform = xlmr_base.transform()
         test_text = "XLMR base Model Comparison"
-        actual = transform(test_text)
-        expected = [0, 43523, 52005, 3647, 13293, 113307, 40514, 2]
-        self.assertEqual(actual, expected)
+        actual = transform([test_text])
+        expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]
+        torch.testing.assert_close(actual, expected)
 
     def test_xlmr_transform_jit(self):
         xlmr_base = torchtext.models.XLMR_BASE_ENCODER
         transform = xlmr_base.transform()
         transform_jit = torch.jit.script(transform)
         test_text = "XLMR base Model Comparison"
-        actual = transform_jit(test_text)
-        expected = [0, 43523, 52005, 3647, 13293, 113307, 40514, 2]
-        self.assertEqual(actual, expected)
+        actual = transform_jit([test_text])
+        expected = [[0, 43523, 52005, 3647, 13293, 113307, 40514, 2]]
+        torch.testing.assert_close(actual, expected)
 
     def test_roberta_bundler_build_model(self):
         from torchtext.models import RobertaEncoderConf, RobertaClassificationHead, RobertaModel, RobertaModelBundle
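
Usage sketch (supplementary; not part of the diffs above): with this series applied, transforms compose and script end to end through torchtext.transforms.Sequential. A minimal example mirroring the tests in PATCH 1/7; the inputs and the printed tensor here are illustrative only:

    >>> import torch
    >>> import torchtext.transforms as T
    >>> pipeline = T.Sequential(
    ...     T.Truncate(max_seq_len=3),                       # clip each sequence to 3 tokens
    ...     T.ToTensor(padding_value=0, dtype=torch.long),   # pad shorter sequences and batch
    ... )
    >>> pipeline([[1, 2, 3, 4], [5, 6]])
    tensor([[1, 2, 3],
            [5, 6, 0]])
    >>> jit_pipeline = torch.jit.script(pipeline)  # the composite transform stays scriptable

Note that PATCH 4/7 deliberately drops T.ToTensor from the XLMR bundler transforms, so the bundled pipeline stops at token ids and leaves padding/batching to the caller, while tokenization, vocab lookup, truncation, and BOS/EOS insertion remain a single scriptable module.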