
Commit 3f9c349

[Feature] Added capability to add special tokens in GPT2BPEEncoder and avoid splitting on them (#1916)
* add_special_tokens and never split features added
* removed a comment and updated a type hint
* added explanation and example for how this change works
* move SPECIAL_TOKENS_ATTRIBUTES to utils
* rebase and address latest nit comments
1 parent 258a356 commit 3f9c349
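
For context, a minimal sketch of how the new capability is meant to be used, based on the test cases added in this commit. The GPT2BPETokenizer constructor arguments and the encoder/vocab file paths below are placeholders, not part of this diff; the add_special_tokens call and its return value (the number of newly registered tokens) follow the usage exercised in test_transforms.py.

from torchtext.transforms import GPT2BPETokenizer

# Placeholder paths to the GPT-2 BPE encoder/vocab assets.
tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",
    vocab_bpe_path="gpt2_bpe_vocab.bpe",
    return_tokens=True,
)

# Register special tokens so the encoder never splits on them.
# The return value is the number of tokens that were actually new.
newly_added = tokenizer.add_special_tokens(
    special_tokens_dict={
        "unk_token": "<|endoftext|>",
        "additional_special_tokens": ["ACCEPT", "DECLINE", "<reward>"],
    }
)

# "<|endoftext|>" and "ACCEPT" come back as single tokens instead of
# being broken into sub-word pieces.
print(newly_added)
print(tokenizer("<|endoftext|> ACCEPT some other text"))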

6 files changed: +407, −26 lines


test/torchtext_unittest/test_transforms.py

Lines changed: 184 additions & 0 deletions
@@ -336,19 +336,197 @@ def _gpt2_bpe_tokenizer(self, tokenizer):
             "Hélló WoŕlḊ¿",
             "Respublica superiorem",
             "Avdija Vršajević în",
+            "multi      space",
         ]
 
         expected_tokens = [
             ["Hello", "ĠWorld", "!,", "Ġhow", "Ġare", "Ġyou", "?"],
             ["H", "é", "ll", "ó", "Ġ", "ĠWo", "Å", "ķ", "l", "á¸", "Ĭ", "Â", "¿"],
             ["Res", "public", "a", "Ġsuper", "i", "orem"],
             ["Av", "d", "ija", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
+            ["multi", "Ġ", "Ġ", "Ġ", "Ġ", "Ġ", "Ġspace"],
         ]
         expected_token_ids = [
             ["15496", "2159", "28265", "703", "389", "345", "30"],
             ["39", "2634", "297", "10205", "220", "22173", "129", "243", "75", "41585", "232", "126", "123"],
             ["4965", "11377", "64", "2208", "72", "29625"],
             ["7355", "67", "34655", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
+            ["41684", "220", "220", "220", "220", "220", "2272"],
+        ]
+
+        # test batch of sentences
+        if tokenizer._return_tokens:
+            self.assertEqual(tokenizer(sample_texts), expected_tokens)
+        else:
+            self.assertEqual(tokenizer(sample_texts), expected_token_ids)
+
+        # test individual sentences
+        for idx, txt in enumerate(sample_texts):
+            if tokenizer._return_tokens:
+                self.assertEqual(tokenizer(txt), expected_tokens[idx])
+            else:
+                self.assertEqual(tokenizer(txt), expected_token_ids[idx])
+
+    def _gpt2_bpe_tokenizer_with_added_vocab(self, tokenizer):
+        sample_texts = [
+            "<|endoftext|> and <|endoftext|> are special <|endofline|> is not!",
+            "test ACCEPT <avail_actions> with DECLINE <|endoftext|> and NO_ACTION",
+            "none in vocab: <|endofline|> WALK_60M WALK_10M <state>",
+            "Respublica Vršajević în",
+            "some in vocab: <|endofline|> WALK_60M WALK_10M <state>",
+            "<|endoftext|> WALK_60M WALK_10M <reward> <state>",
+        ]
+
+        newly_added = tokenizer.add_special_tokens(
+            special_tokens_dict={
+                "unk_token": "<|endoftext|>",
+                "additional_special_tokens": [
+                    "ACCEPT",
+                    "DECLINE",
+                    "NO_ACTION",
+                    "WALK_10M",
+                    "WALK_60M",
+                    "<reward>",
+                ],
+            }
+        )
+        self.assertEqual(newly_added, 6)
+
+        newly_added = tokenizer.add_special_tokens(
+            special_tokens_dict={
+                "unk_token": "<|endoftext|>",
+                "sep_token": "<avail_actions>",
+                "additional_special_tokens": [
+                    "ACCEPT",
+                    "DECLINE",
+                    "NO_ACTION",
+                    "WALK_10M",
+                    "WALK_60M",
+                    "<reward>",
+                ],
+            }
+        )
+        self.assertEqual(newly_added, 1)
+
+        expected_tokens = [
+            [
+                "<|endoftext|>",
+                "and",
+                "<|endoftext|>",
+                "are",
+                "Ġspecial",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "Ġis",
+                "Ġnot",
+                "!",
+            ],
+            ["test", "ACCEPT", "<avail_actions>", "with", "DECLINE", "<|endoftext|>", "and", "NO_ACTION"],
+            [
+                "none",
+                "Ġin",
+                "Ġvoc",
+                "ab",
+                ":",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "WALK_60M",
+                "WALK_10M",
+                "<",
+                "state",
+                ">",
+            ],
+            ["Res", "public", "a", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
+            [
+                "some",
+                "Ġin",
+                "Ġvoc",
+                "ab",
+                ":",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "WALK_60M",
+                "WALK_10M",
+                "<",
+                "state",
+                ">",
+            ],
+            ["<|endoftext|>", "WALK_60M", "WALK_10M", "<reward>", "<", "state", ">"],
+        ]
+        expected_token_ids = [
+            [
+                "50256",
+                "392",
+                "50256",
+                "533",
+                "2041",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "318",
+                "407",
+                "0",
+            ],
+            ["9288", "50257", "50263", "4480", "50258", "50256", "392", "50259"],
+            [
+                "23108",
+                "287",
+                "12776",
+                "397",
+                "25",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "50261",
+                "50260",
+                "27",
+                "5219",
+                "29",
+            ],
+            ["4965", "11377", "64", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
+            [
+                "11246",
+                "287",
+                "12776",
+                "397",
+                "25",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "50261",
+                "50260",
+                "27",
+                "5219",
+                "29",
+            ],
+            ["50256", "50261", "50260", "50262", "27", "5219", "29"],
         ]
 
         # test batch of sentences
@@ -391,6 +569,12 @@ def test_gpt2_bpe_decoder(self):
         """test string output returned by decoder given the token ids"""
         self._gpt2_bpe_decoder(self._load_tokenizer(test_scripting=False, return_tokens=False))
 
+    @nested_params([True, False])
+    def test_gpt2_bpe_tokenizer_with_added_vocab(self, return_tokens):
+        self._gpt2_bpe_tokenizer_with_added_vocab(
+            self._load_tokenizer(test_scripting=False, return_tokens=return_tokens)
+        )
+
     def test_gpt2_bpe_tokenizer_save_load_pybind(self) -> None:
         tokenizer = self._load_tokenizer(test_scripting=False, return_tokens=False)
         tokenizer_path = os.path.join(self.test_dir, "gpt2_tokenizer_pybind.pt")
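
For local verification, the new parametrized test added above can presumably be run on its own (assuming the repository's usual pytest-based test setup):

pytest test/torchtext_unittest/test_transforms.py -k test_gpt2_bpe_tokenizer_with_added_vocab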
