@@ -70,7 +70,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
7070
7171 /* Notes on handling Special Tokens:
7272 We use a regex pattern to first identify the special tokens in the input text.
73- Other non-special tokens go through pre-tokenization as usual, but special
73+ Other 'non-special' tokens go through pre-tokenization as usual, but special
7474 tokens skip those steps.
7575
7676 Steps:
@@ -79,7 +79,7 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
7979 `add_special_tokens` API.
8080 - form a regex pattern that helps in extracting special tokens from the
8181 input text.
82- * Crate a vector that contains chunks of input text, such that each chunk is
82+ * Create a vector that contains chunks of input text, such that each chunk is
8383 either a sequence of non-special tokens or a single special token. For example,
8484 assuming <|special_tok|> and [SEP] are special tokens, the following text
8585 "This is an example with <|special_tok|> and [SEP] and [SPAM]."
@@ -94,9 +94,9 @@ std::vector<std::string> gpt2_bpe_pre_tokenizer(std::string input) {
9494
9595 if (bpe_never_split_set_.size () > 0 ) {
9696 std::string pattern = " " ;
97- // escape regex characters for matching special tokens
98- // this is done to ensure character like '|' in special like
99- // <|endoftext|> don't get special regex meaning
97+ // Escape regex characters for matching special tokens. This is done to
98+ // ensure that characters like '|' in certain special tokens such as
99+ // <|endoftext|> don't get special regex treatment.
100100 for (std::string token : bpe_never_split_set_) {
101101 std::string::size_type pos = 0 ;
102102 while ((pos = token.find_first_of (" |[]" , pos)) != std::string::npos) {
0 commit comments