
Commit ec364a2 (1 parent: 69f67f3)
adding data pipelines for Roberta pre-processing (#1637)

File tree: 2 files changed, +157 -0 lines

Lines changed: 81 additions & 0 deletions

from argparse import ArgumentParser

import torcharrow as ta
import torcharrow.dtypes as dt
import torcharrow.pytorch as tap
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torch.nn import Module
from torch.utils.data import DataLoader
from torchtext.datasets import SST2


class RobertaTransform(Module):
    def __init__(self) -> None:
        super().__init__()
        # Instantiate the individual transforms

        # Tokenizer to split input text into tokens
        encoder_json_path = "https://download.pytorch.org/models/text/gpt2_bpe_encoder.json"
        vocab_bpe_path = "https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe"
        self.tokenizer = T.GPT2BPETokenizer(encoder_json_path, vocab_bpe_path)

        # Vocabulary converting tokens to IDs
        vocab_path = "https://download.pytorch.org/models/text/roberta.vocab.pt"
        self.vocab = T.VocabTransform(load_state_dict_from_url(vocab_path))

        # Add BOS token to the beginning of the sentence
        self.add_bos = T.AddToken(token=0, begin=True)

        # Add EOS token to the end of the sentence
        self.add_eos = T.AddToken(token=2, begin=False)

    def forward(self, input: ta.DataFrame) -> ta.DataFrame:
        input["tokens"] = input["text"].map(self.tokenizer.forward, dtype=dt.List(dt.string))
        input["tokens"] = input["tokens"].list.slice(stop=254)
        input["tokens"] = input["tokens"].map(self.vocab, dtype=dt.List(dt.int32))
        input["tokens"] = input["tokens"].map(self.add_bos)
        input["tokens"] = input["tokens"].map(self.add_eos)
        return input


def main(args):
    # Instantiate the transform
    transform = RobertaTransform()

    # Create the SST2 datapipe and apply pre-processing
    train_dp = SST2(split="train")

    # Convert to DataFrames of batch size
    # TODO: Figure out how to create a DataFrame of larger size and create batches from it subsequently
    train_dp = train_dp.dataframe(columns=["text", "labels"], dataframe_size=args.batch_size)

    # Apply the transformation to each DataFrame
    train_dp = train_dp.map(transform)

    # Remove columns that are no longer required
    train_dp = train_dp.map(lambda x: x.drop(["text"]))

    # Convert each DataFrame to tensors (this yields a named tuple)
    train_dp = train_dp.map(lambda x: x.to_tensor({"tokens": tap.PadSequence(padding_value=1)}))

    # Create the DataLoader
    dl = DataLoader(train_dp, batch_size=None)

    train_steps = args.train_steps
    for i, batch in enumerate(dl):
        if i == train_steps:
            break

        # model_input = batch.tokens
        # target = batch.labels
        ...


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--batch-size", default=4, type=int)
    parser.add_argument("--train-steps", default=-1, type=int)
    parser.add_argument("--dataframe-size", default=100, type=int)
    main(parser.parse_args())
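
As a quick sanity check, the same transform chain can be composed eagerly on a single sentence outside the datapipe. This is a minimal sketch, not part of the commit: it assumes the stock torchtext transforms T.Sequential and T.Truncate in place of the DataFrame column operations, and the input sentence is an arbitrary example.

import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

text_transform = T.Sequential(
    T.GPT2BPETokenizer(
        encoder_json_path="https://download.pytorch.org/models/text/gpt2_bpe_encoder.json",
        vocab_bpe_path="https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe",
    ),
    T.VocabTransform(load_state_dict_from_url("https://download.pytorch.org/models/text/roberta.vocab.pt")),
    T.Truncate(max_seq_len=254),
    T.AddToken(token=0, begin=True),   # BOS
    T.AddToken(token=2, begin=False),  # EOS
)

# Prints the token IDs for one example sentence, with BOS/EOS attached
print(text_transform("a charming movie"))
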
Lines changed: 76 additions & 0 deletions

from argparse import ArgumentParser
from functools import partial
from typing import Any, Dict

import torchtext.functional as F
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torch.nn import Module
from torch.utils.data import DataLoader
from torchtext.datasets import SST2


class RobertaTransform(Module):
    def __init__(self) -> None:
        super().__init__()
        # Instantiate the individual transforms

        # Tokenizer to split input text into tokens
        encoder_json_path = "https://download.pytorch.org/models/text/gpt2_bpe_encoder.json"
        vocab_bpe_path = "https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe"
        self.tokenizer = T.GPT2BPETokenizer(encoder_json_path, vocab_bpe_path)

        # Vocabulary converting tokens to IDs
        vocab_path = "https://download.pytorch.org/models/text/roberta.vocab.pt"
        self.vocab = T.VocabTransform(load_state_dict_from_url(vocab_path))

        # Add BOS token to the beginning of the sentence
        self.add_bos = T.AddToken(token=0, begin=True)

        # Add EOS token to the end of the sentence
        self.add_eos = T.AddToken(token=2, begin=False)

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        tokens = self.tokenizer(input["text"])
        tokens = F.truncate(tokens, max_seq_len=254)
        tokens = self.vocab(tokens)
        tokens = self.add_bos(tokens)
        tokens = self.add_eos(tokens)
        input["tokens"] = tokens
        return input


def main(args):
    # Instantiate the transform
    transform = RobertaTransform()

    # Create the SST2 datapipe and batch it into columnar form
    batch_size = args.batch_size
    train_dp = SST2(split="train")
    train_dp = train_dp.batch(batch_size).rows2columnar(["text", "label"])

    # Apply the text pre-processing
    train_dp = train_dp.map(transform)

    # Convert to tensors
    train_dp = train_dp.map(partial(F.to_tensor, padding_value=1), input_col="tokens")
    train_dp = train_dp.map(F.to_tensor, input_col="label")

    # Create the DataLoader
    dl = DataLoader(train_dp, batch_size=None)

    train_steps = args.train_steps
    for i, batch in enumerate(dl):
        if i == train_steps:
            break

        # model_input = batch["tokens"]
        # target = batch["label"]
        ...


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--batch-size", default=4, type=int)
    parser.add_argument("--train-steps", default=-1, type=int)
    main(parser.parse_args())
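
The commented-out lines in the loop above hint at how each batch would be consumed. As an illustration only, here is one way the padded batches could drive a classifier; ROBERTA_BASE_ENCODER and RobertaClassificationHead are assumptions drawn from torchtext.models rather than anything in this commit, and dl refers to the DataLoader built in the script above.

import torch
from torchtext.models import ROBERTA_BASE_ENCODER, RobertaClassificationHead

# Pre-trained RoBERTa encoder with a fresh two-class head (SST-2 is binary);
# both names are assumptions from torchtext.models, not from this commit.
classifier_head = RobertaClassificationHead(num_classes=2, input_dim=768)
model = ROBERTA_BASE_ENCODER.get_model(head=classifier_head)

for batch in dl:
    logits = model(batch["tokens"])  # batch["tokens"] is already a padded LongTensor
    loss = torch.nn.functional.cross_entropy(logits, batch["label"])
    # backward pass and optimizer step would go here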
