rebase and address latest nit comments

reachsumit · reachsumit · commit ad249fc8be9a · 2022-10-03T23:11:28.000-07:00
diff --git a/torchtext/csrc/gpt2_bpe_tokenizer.cpp b/torchtext/csrc/gpt2_bpe_tokenizer.cpp
@@ -70,7 +70,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
 
   /* Notes on handling Special Tokens:
   We use regex pattern to first identify the special tokens in the input text.
-  Other non-special tokens go through pre-tokenization as usual, but special
+  Other 'non-special' tokens go through pre-tokenization as usual, but special
   tokens skip those steps.
 
   Steps:
@@ -79,7 +79,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
   `add_special_tokens` API.
     - form a regex pattern that helps in extracting special tokens from the
   input text.
-  * Crate a vector that contains chunks of input text, such that each chunk is
+  * Create a vector that contains chunks of input text, such that each chunk is
   either a sequence of non-special token or a single special token. For example,
   assuming <|special_tok|> and [SEP] are special tokens, the following text
       "This is an example with <|special_tok|> and [SEP] and [SPAM]."
@@ -94,9 +94,9 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
 
   if (bpe_never_split_set_.size() > 0) {
     std::string pattern = "";
-    // escape regex characters for matching special tokens
-    // this is done to ensure character like '|' in special like
-    // <|endoftext|> don't get special regex meaning
+    // Escape the regex metacharacters '|', '[' and ']' inside special tokens
+    // so that tokens such as <|endoftext|> or [SEP] are matched literally
+    // rather than being interpreted as alternation or character classes.
     for (std::string token : bpe_never_split_set_) {
       std::string::size_type pos = 0;
       while ((pos = token.find_first_of("|[]", pos)) != std::string::npos) {
diff --git a/torchtext/transforms.py b/torchtext/transforms.py
@@ -14,7 +14,7 @@
 )
 from torchtext._torchtext import RegexTokenizer as RegexTokenizerPybind
 from torchtext.data.functional import load_sp_model
-from torchtext.utils import get_asset_local_path, SPECIAL_TOKENS_ATTRIBUTES
+from torchtext.utils import get_asset_local_path
 from torchtext.vocab import Vocab
 
 from . import functional as F
@@ -288,6 +288,16 @@ class GPT2BPETokenizer(Module):
     :type return_input: bool
     """
 
+    SPECIAL_TOKENS_ATTRIBUTES = [
+        "bos_token",
+        "eos_token",
+        "unk_token",
+        "sep_token",
+        "pad_token",
+        "cls_token",
+        "mask_token",
+        "additional_special_tokens",
+    ]
     __jit_unused_properties__ = ["is_jitable"]
     _seperator: torch.jit.Final[str]
 
@@ -361,8 +371,8 @@ def add_special_tokens(self, special_tokens_dict: Mapping[str, Union[str, Sequen
         """
         for key in special_tokens_dict.keys():
             assert (
-                key in SPECIAL_TOKENS_ATTRIBUTES
-            ), f"Key '{key}' is not in the special token list: {SPECIAL_TOKENS_ATTRIBUTES}"
+                key in self.SPECIAL_TOKENS_ATTRIBUTES
+            ), f"Key '{key}' is not in the special token list: {self.SPECIAL_TOKENS_ATTRIBUTES}"
 
         return self.bpe.add_special_tokens(
             {k: v for k, v in special_tokens_dict.items() if k != "additional_special_tokens"},
diff --git a/torchtext/utils.py b/torchtext/utils.py
@@ -13,17 +13,6 @@
 
 logger = logging.getLogger(__name__)
 
-SPECIAL_TOKENS_ATTRIBUTES = [
-    "bos_token",
-    "eos_token",
-    "unk_token",
-    "sep_token",
-    "pad_token",
-    "cls_token",
-    "mask_token",
-    "additional_special_tokens",
-]
-
 
 def reporthook(t):
     """