16 changes: 16 additions & 0 deletions docs/source/models.rst
@@ -32,3 +32,19 @@ XLMR_LARGE_ENCODER
.. autodata:: XLMR_LARGE_ENCODER
:no-value:

ROBERTA_BASE_ENCODER
--------------------

.. container:: py attribute

.. autodata:: ROBERTA_BASE_ENCODER
:no-value:


ROBERTA_LARGE_ENCODER
---------------------

.. container:: py attribute

.. autodata:: ROBERTA_LARGE_ENCODER
:no-value:
Binary file added test/asset/roberta.base.output.pt
Binary file added test/asset/roberta.large.output.pt
Empty file.
67 changes: 67 additions & 0 deletions test/integration_tests/test_models.py
@@ -0,0 +1,67 @@
import torch
import torchtext

from ..common.assets import get_asset_path
from ..common.torchtext_test_case import TorchtextTestCase


class TestModels(TorchtextTestCase):
def test_roberta_base(self):
Contributor: I think this can be written in a much more compact manner with parameterization.

Contributor Author: Added a follow-up item. Let me do this in a separate PR.

(A sketch of the suggested parameterization follows this test file.)

asset_path = get_asset_path("roberta.base.output.pt")
test_text = "Roberta base Model Comparison"

roberta_base = torchtext.models.ROBERTA_BASE_ENCODER
transform = roberta_base.transform()
model = roberta_base.get_model()
model = model.eval()

model_input = torch.tensor(transform([test_text]))
actual = model(model_input)
expected = torch.load(asset_path)
torch.testing.assert_close(actual, expected)

def test_roberta_base_jit(self):
asset_path = get_asset_path("roberta.base.output.pt")
test_text = "Roberta base Model Comparison"

roberta_base = torchtext.models.ROBERTA_BASE_ENCODER
transform = roberta_base.transform()
transform_jit = torch.jit.script(transform)
model = roberta_base.get_model()
model = model.eval()
model_jit = torch.jit.script(model)

model_input = torch.tensor(transform_jit([test_text]))
actual = model_jit(model_input)
expected = torch.load(asset_path)
torch.testing.assert_close(actual, expected)

def test_roberta_large(self):
asset_path = get_asset_path("roberta.large.output.pt")
test_text = "Roberta base Model Comparison"

roberta_large = torchtext.models.ROBERTA_LARGE_ENCODER
transform = roberta_large.transform()
model = roberta_large.get_model()
model = model.eval()

model_input = torch.tensor(transform([test_text]))
actual = model(model_input)
expected = torch.load(asset_path)
torch.testing.assert_close(actual, expected)

def test_roberta_large_jit(self):
asset_path = get_asset_path("roberta.large.output.pt")
test_text = "Roberta base Model Comparison"

roberta_large = torchtext.models.ROBERTA_LARGE_ENCODER
transform = roberta_large.transform()
transform_jit = torch.jit.script(transform)
model = roberta_large.get_model()
model = model.eval()
model_jit = torch.jit.script(model)

model_input = torch.tensor(transform_jit([test_text]))
actual = model_jit(model_input)
expected = torch.load(asset_path)
torch.testing.assert_close(actual, expected)
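
The reviewer's parameterization suggestion could look roughly like the sketch below. This is a sketch only, not part of the PR: it assumes the `parameterized` package is available in the test environment, and it keeps the same assets, inputs, and assertions as the four tests above.

from parameterized import parameterized

import torch
import torchtext

from ..common.assets import get_asset_path
from ..common.torchtext_test_case import TorchtextTestCase


class TestModels(TorchtextTestCase):
    @parameterized.expand([
        # (expected output asset, bundle attribute name, run under torch.jit.script)
        ("roberta.base.output.pt", "ROBERTA_BASE_ENCODER", False),
        ("roberta.base.output.pt", "ROBERTA_BASE_ENCODER", True),
        ("roberta.large.output.pt", "ROBERTA_LARGE_ENCODER", False),
        ("roberta.large.output.pt", "ROBERTA_LARGE_ENCODER", True),
    ])
    def test_roberta_encoder(self, asset_name, bundle_name, is_jit):
        asset_path = get_asset_path(asset_name)
        test_text = "Roberta base Model Comparison"

        bundle = getattr(torchtext.models, bundle_name)
        transform = bundle.transform()
        model = bundle.get_model().eval()
        if is_jit:
            transform = torch.jit.script(transform)
            model = torch.jit.script(model)

        model_input = torch.tensor(transform([test_text]))
        actual = model(model_input)
        expected = torch.load(asset_path)
        torch.testing.assert_close(actual, expected)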
4 changes: 4 additions & 0 deletions torchtext/models/roberta/__init__.py
@@ -8,6 +8,8 @@
RobertaModelBundle,
XLMR_BASE_ENCODER,
XLMR_LARGE_ENCODER,
ROBERTA_BASE_ENCODER,
ROBERTA_LARGE_ENCODER,
)

__all__ = [
@@ -17,4 +19,6 @@
"RobertaModelBundle",
"XLMR_BASE_ENCODER",
"XLMR_LARGE_ENCODER",
"ROBERTA_BASE_ENCODER",
"ROBERTA_LARGE_ENCODER",
]
86 changes: 86 additions & 0 deletions torchtext/models/roberta/bundler.py
@@ -192,3 +192,89 @@ def encoderConf(self) -> RobertaEncoderConf:
Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
'''
)


ROBERTA_BASE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "roberta.base.encoder.pt"),
_encoder_conf=RobertaEncoderConf(vocab_size=50265),
transform=lambda: T.Sequential(
T.GPT2BPETokenizer(
encoder_json_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_encoder.json"),
vocab_bpe_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_vocab.bpe"),
),
T.VocabTransform(
load_state_dict_from_url(urljoin(_TEXT_BUCKET, "roberta.vocab.pt"))
),
T.Truncate(254),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
),
)

ROBERTA_BASE_ENCODER.__doc__ = (
'''
Roberta Encoder with Base configuration

RoBERTa iterates on BERT's pretraining procedure, including training the model longer,
with bigger batches over more data; removing the next sentence prediction objective;
training on longer sequences; and dynamically changing the masking pattern applied
to the training data.

The RoBERTa model was pretrained on the union of five datasets: BookCorpus,
English Wikipedia, CC-News, OpenWebText, and STORIES. Together these datasets
contain over 160GB of text.

Originally published by the authors of RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/roberta#pre-trained-models>`__]

Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
'''
)


ROBERTA_LARGE_ENCODER = RobertaModelBundle(
_path=urljoin(_TEXT_BUCKET, "roberta.large.encoder.pt"),
_encoder_conf=RobertaEncoderConf(
vocab_size=50265,
embedding_dim=1024,
ffn_dimension=4096,
num_attention_heads=16,
num_encoder_layers=24,
),
transform=lambda: T.Sequential(
T.GPT2BPETokenizer(
encoder_json_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_encoder.json"),
vocab_bpe_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_vocab.bpe"),
),
T.VocabTransform(
load_state_dict_from_url(urljoin(_TEXT_BUCKET, "roberta.vocab.pt"))
),
T.Truncate(510),
T.AddToken(token=0, begin=True),
T.AddToken(token=2, begin=False),
),
)

ROBERTA_LARGE_ENCODER.__doc__ = (
'''
Roberta Encoder with Large configuration

RoBERTa iterates on BERT's pretraining procedure, including training the model longer,
with bigger batches over more data; removing the next sentence prediction objective;
training on longer sequences; and dynamically changing the masking pattern applied
to the training data.

The RoBERTa model was pretrained on the union of five datasets: BookCorpus,
English Wikipedia, CC-News, OpenWebText, and STORIES. Together these datasets
contain over 160GB of text.

Originally published by the authors of RoBERTa under MIT License
and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/main/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/tree/main/examples/roberta#pre-trained-models>`__]

Please refer to :func:`torchtext.models.RobertaModelBundle` for the usage.
'''
)
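
Both docstrings point to :func:`torchtext.models.RobertaModelBundle` for usage. For quick orientation, here is a minimal usage sketch of the new bundles, mirroring the integration tests in this PR; the result is the encoder's feature tensor, whose shape depends on the encoder configuration.

import torch
import torchtext

# Bundled transform and pre-trained encoder weights (fetched from the text bucket on first use).
roberta_base = torchtext.models.ROBERTA_BASE_ENCODER
transform = roberta_base.transform()
model = roberta_base.get_model().eval()

# Tokenize with the GPT-2 BPE tokenizer, add BOS/EOS tokens, and encode.
model_input = torch.tensor(transform(["Hello world"]))
with torch.no_grad():
    features = model(model_input)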