test/torchtext_unittest/test_transforms.py (184 additions and 0 deletions)
@@ -336,19 +336,197 @@ def _gpt2_bpe_tokenizer(self, tokenizer):
"Hélló WoŕlḊ¿",
"Respublica superiorem",
"Avdija Vršajević în",
"multi space",
]
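# "Ġ" is the byte-level BPE marker for a leading space; the multi-space sample
# above maps to one standalone "Ġ" token per extra space in the expected output below.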

expected_tokens = [
["Hello", "ĠWorld", "!,", "Ġhow", "Ġare", "Ġyou", "?"],
["H", "é", "ll", "ó", "Ġ", "ĠWo", "Å", "ķ", "l", "á¸", "Ĭ", "Â", "¿"],
["Res", "public", "a", "Ġsuper", "i", "orem"],
["Av", "d", "ija", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
["multi", "Ġ", "Ġ", "Ġ", "Ġ", "Ġ", "Ġspace"],
]
expected_token_ids = [
["15496", "2159", "28265", "703", "389", "345", "30"],
["39", "2634", "297", "10205", "220", "22173", "129", "243", "75", "41585", "232", "126", "123"],
["4965", "11377", "64", "2208", "72", "29625"],
["7355", "67", "34655", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
["41684", "220", "220", "220", "220", "220", "2272"],
]
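# With return_tokens=False the tokenizer yields token ids as strings,
# matching the string literals above.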

# test batch of sentences
if tokenizer._return_tokens:
self.assertEqual(tokenizer(sample_texts), expected_tokens)
else:
self.assertEqual(tokenizer(sample_texts), expected_token_ids)

# test individual sentences
for idx, txt in enumerate(sample_texts):
if tokenizer._return_tokens:
self.assertEqual(tokenizer(txt), expected_tokens[idx])
else:
self.assertEqual(tokenizer(txt), expected_token_ids[idx])

def _gpt2_bpe_tokenizer_with_added_vocab(self, tokenizer):
sample_texts = [
"<|endoftext|> and <|endoftext|> are special <|endofline|> is not!",
"test ACCEPT <avail_actions> with DECLINE <|endoftext|> and NO_ACTION",
"none in vocab: <|endofline|> WALK_60M WALK_10M <state>",
"Respublica Vršajević în",
"some in vocab: <|endofline|> WALK_60M WALK_10M <state>",
"<|endoftext|> WALK_60M WALK_10M <reward> <state>",
]
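# "<|endoftext|>" is already part of the standard GPT-2 vocabulary (id 50256),
# so the call below counts only the six additional_special_tokens as newly added.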

newly_added = tokenizer.add_special_tokens(
special_tokens_dict={
"unk_token": "<|endoftext|>",
"additional_special_tokens": [
"ACCEPT",
"DECLINE",
"NO_ACTION",
"WALK_10M",
"WALK_60M",
"<reward>",
],
}
)
self.assertEqual(newly_added, 6)
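# Re-registering the same special tokens is a no-op; only the previously unseen
# sep_token "<avail_actions>" is new, so the second call adds exactly one token.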

newly_added = tokenizer.add_special_tokens(
special_tokens_dict={
"unk_token": "<|endoftext|>",
"sep_token": "<avail_actions>",
"additional_special_tokens": [
"ACCEPT",
"DECLINE",
"NO_ACTION",
"WALK_10M",
"WALK_60M",
"<reward>",
],
}
)
self.assertEqual(newly_added, 1)
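# Registered special tokens are emitted below as single, unsplit tokens, while
# unregistered markers such as "<|endofline|>" and "<state>" still fall through
# to the ordinary byte-level BPE merges.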

expected_tokens = [
[
"<|endoftext|>",
"and",
"<|endoftext|>",
"are",
"Ġspecial",
"Ġ<",
"|",
"end",
"of",
"line",
"|",
">",
"Ġis",
"Ġnot",
"!",
],
["test", "ACCEPT", "<avail_actions>", "with", "DECLINE", "<|endoftext|>", "and", "NO_ACTION"],
[
"none",
"Ġin",
"Ġvoc",
"ab",
":",
"Ġ<",
"|",
"end",
"of",
"line",
"|",
">",
"WALK_60M",
"WALK_10M",
"<",
"state",
">",
],
["Res", "public", "a", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
[
"some",
"Ġin",
"Ġvoc",
"ab",
":",
"Ġ<",
"|",
"end",
"of",
"line",
"|",
">",
"WALK_60M",
"WALK_10M",
"<",
"state",
">",
],
["<|endoftext|>", "WALK_60M", "WALK_10M", "<reward>", "<", "state", ">"],
]
expected_token_ids = [
[
"50256",
"392",
"50256",
"533",
"2041",
"1279",
"91",
"437",
"1659",
"1370",
"91",
"29",
"318",
"407",
"0",
],
["9288", "50257", "50263", "4480", "50258", "50256", "392", "50259"],
[
"23108",
"287",
"12776",
"397",
"25",
"1279",
"91",
"437",
"1659",
"1370",
"91",
"29",
"50261",
"50260",
"27",
"5219",
"29",
],
["4965", "11377", "64", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
[
"11246",
"287",
"12776",
"397",
"25",
"1279",
"91",
"437",
"1659",
"1370",
"91",
"29",
"50261",
"50260",
"27",
"5219",
"29",
],
["50256", "50261", "50260", "50262", "27", "5219", "29"],
]

# test batch of sentences
@@ -391,6 +569,12 @@ def test_gpt2_bpe_decoder(self):
"""test string output returned by decoder given the token ids"""
self._gpt2_bpe_decoder(self._load_tokenizer(test_scripting=False, return_tokens=False))

@nested_params([True, False])
def test_gpt2_bpe_tokenizer_with_added_vocab(self, return_tokens):
self._gpt2_bpe_tokenizer_with_added_vocab(
self._load_tokenizer(test_scripting=False, return_tokens=return_tokens)
)

def test_gpt2_bpe_tokenizer_save_load_pybind(self) -> None:
tokenizer = self._load_tokenizer(test_scripting=False, return_tokens=False)
tokenizer_path = os.path.join(self.test_dir, "gpt2_tokenizer_pybind.pt")
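For reference, here is a minimal usage sketch of the behavior this test exercises, assuming torchtext's GPT2BPETokenizer transform together with the add_special_tokens API added in this PR; the file paths below are placeholders, not part of this change:

from torchtext.transforms import GPT2BPETokenizer

# Placeholder paths; point these at local copies of the GPT-2 encoder.json and vocab.bpe files.
tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",
    vocab_bpe_path="gpt2_bpe_vocab.bpe",
    return_tokens=True,
)

# Register special tokens; the return value counts only tokens not seen before.
newly_added = tokenizer.add_special_tokens(
    special_tokens_dict={
        "unk_token": "<|endoftext|>",
        "additional_special_tokens": ["ACCEPT", "DECLINE", "<reward>"],
    }
)

# Registered special tokens are kept intact rather than being split by BPE.
print(tokenizer("ACCEPT <|endoftext|> <reward>"))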