Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified test/asset/t5.base.encoder.output.pt
Binary file not shown.
Binary file modified test/asset/t5.base.output.pt
Binary file not shown.
12 changes: 7 additions & 5 deletions test/prototype/integration_tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@


class TestT5(TorchtextTestCase):
def _t5_model(self, t5_model, expected_asset_name, model_input):
def _t5_model(self, t5_model, expected_asset_name, test_text):
"""Verify that pre-trained T5 models in torchtext produce
the same output as the HuggingFace reference implementation.
"""
expected_asset_path = get_asset_path(expected_asset_name)
transform = t5_model.transform()
model = t5_model.get_model()
model = model.eval()

model_input = transform(test_text)
if model.encoder_only:
actual = model(model_input)["encoder_output"]
else:
Expand All @@ -26,10 +28,10 @@ def _t5_model(self, t5_model, expected_asset_name, model_input):

def test_t5_base_encoder_model(self):
    """Check the pre-trained T5-Base *encoder-only* model against the stored
    reference output asset ("t5.base.encoder.output.pt").

    Raw strings are passed to ``_t5_model``, which applies the bundle's own
    transform before running the model (see ``_t5_model``'s ``test_text``
    parameter).
    """
    expected_asset_name = "t5.base.encoder.output.pt"
    # Plain text input: tokenization/padding is handled by the bundle's
    # transform inside _t5_model, not hard-coded token ids.
    test_text = ["Hello world", "Attention rocks!"]
    self._t5_model(t5_model=T5_BASE_ENCODER, expected_asset_name=expected_asset_name, test_text=test_text)

def test_t5_base_model(self):
    """Check the pre-trained T5-Base encoder-decoder model against the stored
    reference output asset ("t5.base.output.pt").

    Raw strings are passed to ``_t5_model``, which applies the bundle's own
    transform before running the model (see ``_t5_model``'s ``test_text``
    parameter).
    """
    expected_asset_name = "t5.base.output.pt"
    # Plain text input: tokenization/padding is handled by the bundle's
    # transform inside _t5_model, not hard-coded token ids.
    test_text = ["Hello world", "Attention rocks!"]
    self._t5_model(t5_model=T5_BASE, expected_asset_name=expected_asset_name, test_text=test_text)
28 changes: 23 additions & 5 deletions torchtext/prototype/models/t5/bundler.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,40 @@
import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union
from typing import Any, Callable, Dict, Optional, Union
from urllib.parse import urljoin

import torch
from torchtext import _TEXT_BUCKET
from torchtext._download_hooks import load_state_dict_from_url

from .model import T5Conf, T5Model
from .t5_transform import T5Transform

logger = logging.getLogger(__name__)


@dataclass
class T5Bundle:
"""T5Bundle(_config: torchtext.prototype.models.T5Conf, _path: Optional[str] = None)
"""T5Bundle(_config: torchtext.prototype.models.T5Conf, _path: Optional[str] = None, transform: Optional[Callable] = None)

Example - Pretrained base t5 encoder
>>> import torch, torchtext
>>> t5_encoder_base = torchtext.prototype.models.T5_BASE_ENCODER
>>> transform = t5_encoder_base.transform()
>>> input_seq = ["Hello world", "Attention rocks!"]
>>> model = t5_encoder_base.get_model()
>>> model_input = torch.tensor([[1,2,3,4,5,6],[7,8,9,0,0,0]])
>>> model_input = transform(input_seq)
>>> output = model(model_input)['encoder_output']
>>> output.shape
torch.Size([2, 6, 768])
torch.Size([2, 4, 768])

Example - Pretrained base t5 model
>>> import torch, torchtext
>>> t5_base = torchtext.prototype.models.T5_BASE
>>> transform = t5_base.transform()
>>> input_seq = ["Hello world", "Attention rocks!"]
>>> model = t5_base.get_model()
>>> model_input = torch.tensor([[1,2,3,4,5,6],[7,8,9,0,0,0]])
>>> model_input = transform(input_seq)
>>> output = model(model_input)['decoder_output']
>>> output.shape
torch.Size([2, 1, 768])
Expand All @@ -43,6 +48,7 @@ class T5Bundle:

_config: T5Conf
_path: Optional[str] = None
transform: Optional[Callable] = None

def get_model(
self,
Expand Down Expand Up @@ -122,6 +128,12 @@ def config(self) -> T5Conf:
# Pre-trained T5-Base bundle restricted to the encoder (encoder_only=True).
# `transform` is wrapped in a lambda so the T5Transform (and the tokenizer
# model it loads from the bucket) is only constructed when the caller invokes
# bundle.transform(), not at import time.
T5_BASE_ENCODER = T5Bundle(
_path=urljoin(_TEXT_BUCKET, "t5.base.encoder.pt"),
_config=T5Conf(encoder_only=True),
transform=lambda: T5Transform(
urljoin(_TEXT_BUCKET, "t5_tokenizer_base.model"),
# NOTE(review): eos_idx=1 / padding_idx=0 presumably match the T5 base
# tokenizer vocabulary — confirm against the tokenizer model.
max_seq_len=512,
eos_idx=1,
padding_idx=0,
),
)

T5_BASE_ENCODER.__doc__ = """
Expand All @@ -146,6 +158,12 @@ def config(self) -> T5Conf:
# Pre-trained T5-Base bundle with the full encoder-decoder stack
# (encoder_only=False). `transform` is wrapped in a lambda so the T5Transform
# (and the tokenizer model it loads from the bucket) is only constructed when
# the caller invokes bundle.transform(), not at import time.
T5_BASE = T5Bundle(
_path=urljoin(_TEXT_BUCKET, "t5.base.pt"),
_config=T5Conf(encoder_only=False),
transform=lambda: T5Transform(
urljoin(_TEXT_BUCKET, "t5_tokenizer_base.model"),
# NOTE(review): eos_idx=1 / padding_idx=0 presumably match the T5 base
# tokenizer vocabulary — confirm against the tokenizer model.
max_seq_len=512,
eos_idx=1,
padding_idx=0,
),
)

T5_BASE.__doc__ = """
Expand Down