@@ -14,7 +14,7 @@
 )
 from torchtext._torchtext import RegexTokenizer as RegexTokenizerPybind
 from torchtext.data.functional import load_sp_model
-from torchtext.utils import get_asset_local_path
+from torchtext.utils import get_asset_local_path, SPECIAL_TOKENS_ATTRIBUTES
 from torchtext.vocab import Vocab
 
 from . import functional as F
@@ -294,16 +294,6 @@ class GPT2BPETokenizer(Module):
     def __init__(self, encoder_json_path: str, vocab_bpe_path: str, return_tokens: bool = False) -> None:
         super().__init__()
         self._seperator = "\u0001"
-        self.SPECIAL_TOKENS_ATTRIBUTES = [
-            "bos_token",
-            "eos_token",
-            "unk_token",
-            "sep_token",
-            "pad_token",
-            "cls_token",
-            "mask_token",
-            "additional_special_tokens",
-        ]
         # load bpe encoder and bpe decoder
         with open(get_asset_local_path(encoder_json_path), "r", encoding="utf-8") as f:
             bpe_encoder = json.load(f)
@@ -371,8 +361,8 @@ def add_special_tokens(self, special_tokens_dict: Mapping[str, Union[str, Sequence[str]]]) -> int:
         """
         for key in special_tokens_dict.keys():
             assert (
-                key in self.SPECIAL_TOKENS_ATTRIBUTES
-            ), f"Key '{key}' is not in the special token list: {self.SPECIAL_TOKENS_ATTRIBUTES}"
+                key in SPECIAL_TOKENS_ATTRIBUTES
+            ), f"Key '{key}' is not in the special token list: {SPECIAL_TOKENS_ATTRIBUTES}"
 
         return self.bpe.add_special_tokens(
             {k: v for k, v in special_tokens_dict.items() if k != "additional_special_tokens"},
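Note: the new import in the first hunk implies that SPECIAL_TOKENS_ATTRIBUTES now lives as a module-level constant in torchtext/utils.py. A minimal sketch of that definition, assuming the list contents are carried over verbatim from the deleted instance attribute:

# torchtext/utils.py -- placement in this module is inferred from the new import;
# the list contents are copied verbatim from the attribute removed above
SPECIAL_TOKENS_ATTRIBUTES = [
    "bos_token",
    "eos_token",
    "unk_token",
    "sep_token",
    "pad_token",
    "cls_token",
    "mask_token",
    "additional_special_tokens",
]

Defining the list once at module level gives every tokenizer a single source of truth instead of each instance carrying its own copy.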
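A usage sketch of the validation path touched in the last hunk. The asset paths are placeholders, and the import path for GPT2BPETokenizer is assumed to be torchtext.transforms:

from torchtext.transforms import GPT2BPETokenizer

# Placeholder paths -- point these at real GPT-2 BPE assets.
tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",
    vocab_bpe_path="gpt2_bpe_vocab.bpe",
)

# Recognized keys pass the assert and are forwarded to the underlying BPE object.
tokenizer.add_special_tokens({"unk_token": "<|endoftext|>"})

# An unrecognized key now fails against the shared module-level constant
# rather than a per-instance list.
tokenizer.add_special_tokens({"start_token": "<s>"})  # raises AssertionError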