This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit ad249fc

rebase and address latest nit comments
1 parent 9e66291 commit ad249fc

File tree

3 files changed (+18, -19 lines)


torchtext/csrc/gpt2_bpe_tokenizer.cpp

Lines changed: 5 additions & 5 deletions
@@ -70,7 +70,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
 
 /* Notes on handling Special Tokens:
 We use regex pattern to first identify the special tokens in the input text.
-Other non-special tokens go through pre-tokenization as usual, but special
+Other 'non-special' tokens go through pre-tokenization as usual, but special
 tokens skip those steps.
 
 Steps:
@@ -79,7 +79,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
 `add_special_tokens` API.
 - form a regex pattern that helps in extracting special tokens from the
 input text.
-* Crate a vector that contains chunks of input text, such that each chunk is
+* Create a vector that contains chunks of input text, such that each chunk is
 either a sequence of non-special token or a single special token. For example,
 assuming <|special_tok|> and [SEP] are special tokens, the following text
 "This is an example with <|special_tok|> and [SEP] and [SPAM]."
@@ -94,9 +94,9 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
 
 if (bpe_never_split_set_.size() > 0) {
 std::string pattern = "";
-// escape regex characters for matching special tokens
-// this is done to ensure character like '|' in special like
-// <|endoftext|> don't get special regex meaning
+// Escape regex characters for matching special tokens. This is done to
+// ensure that characters like '|' in certain special tokens such as
+// <|endoftext|> don't get special regex treatment.
 for (std::string token : bpe_never_split_set_) {
 std::string::size_type pos = 0;
 while ((pos = token.find_first_of("|[]", pos)) != std::string::npos) {
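
To make the pre-tokenizer notes above concrete, here is a minimal Python sketch (illustration only, not the torchtext C++ implementation; the helper name split_on_special_tokens is made up for this sketch): special tokens are regex-escaped so characters like '|' and '[' are matched literally, and the escaped alternatives split the input into chunks that are either plain text or a single special token.

import re

def split_on_special_tokens(text, special_tokens):
    # Escape regex metacharacters so tokens like <|endoftext|> match literally.
    escaped = [re.escape(tok) for tok in special_tokens]
    # The capture group keeps the matched special tokens in the split result.
    pattern = "(" + "|".join(escaped) + ")"
    # Drop empty chunks produced when special tokens sit next to each other.
    return [chunk for chunk in re.split(pattern, text) if chunk]

chunks = split_on_special_tokens(
    "This is an example with <|special_tok|> and [SEP] and [SPAM].",
    ["<|special_tok|>", "[SEP]"],
)
# ['This is an example with ', '<|special_tok|>', ' and ', '[SEP]', ' and [SPAM].']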

torchtext/transforms.py

Lines changed: 13 additions & 3 deletions
@@ -14,7 +14,7 @@
 )
 from torchtext._torchtext import RegexTokenizer as RegexTokenizerPybind
 from torchtext.data.functional import load_sp_model
-from torchtext.utils import get_asset_local_path, SPECIAL_TOKENS_ATTRIBUTES
+from torchtext.utils import get_asset_local_path
 from torchtext.vocab import Vocab
 
 from . import functional as F
@@ -288,6 +288,16 @@ class GPT2BPETokenizer(Module):
     :type return_input: bool
     """
 
+    SPECIAL_TOKENS_ATTRIBUTES = [
+        "bos_token",
+        "eos_token",
+        "unk_token",
+        "sep_token",
+        "pad_token",
+        "cls_token",
+        "mask_token",
+        "additional_special_tokens",
+    ]
     __jit_unused_properties__ = ["is_jitable"]
     _seperator: torch.jit.Final[str]
 
@@ -361,8 +371,8 @@ def add_special_tokens(self, special_tokens_dict: Mapping[str, Union[str, Sequen
         """
         for key in special_tokens_dict.keys():
             assert (
-                key in SPECIAL_TOKENS_ATTRIBUTES
-            ), f"Key '{key}' is not in the special token list: {SPECIAL_TOKENS_ATTRIBUTES}"
+                key in self.SPECIAL_TOKENS_ATTRIBUTES
+            ), f"Key '{key}' is not in the special token list: {self.SPECIAL_TOKENS_ATTRIBUTES}"
 
         return self.bpe.add_special_tokens(
             {k: v for k, v in special_tokens_dict.items() if k != "additional_special_tokens"},
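
With this change the allowed keys live on the tokenizer class itself rather than in torchtext.utils. A hedged usage sketch follows; the asset paths are placeholders, and the constructor arguments (encoder_json_path, vocab_bpe_path) are assumed from the existing GPT2BPETokenizer signature.

from torchtext.transforms import GPT2BPETokenizer

# Placeholder asset paths for illustration only.
tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",
    vocab_bpe_path="gpt2_bpe_vocab.bpe",
)

# Keys must appear in GPT2BPETokenizer.SPECIAL_TOKENS_ATTRIBUTES;
# an unknown key such as "my_token" would trip the assertion above.
tokenizer.add_special_tokens(
    {"eos_token": "<|endoftext|>", "additional_special_tokens": ["[SEP]", "<|special_tok|>"]}
)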

torchtext/utils.py

Lines changed: 0 additions & 11 deletions
@@ -13,17 +13,6 @@
 
 logger = logging.getLogger(__name__)
 
-SPECIAL_TOKENS_ATTRIBUTES = [
-    "bos_token",
-    "eos_token",
-    "unk_token",
-    "sep_token",
-    "pad_token",
-    "cls_token",
-    "mask_token",
-    "additional_special_tokens",
-]
-
 
 def reporthook(t):
     """
