@@ -336,19 +336,197 @@ def _gpt2_bpe_tokenizer(self, tokenizer):
             "Hélló  WoŕlḊ¿",
             "Respublica superiorem",
             "Avdija Vršajević în",
+            "multi      space",
         ]

         expected_tokens = [
             ["Hello", "ĠWorld", "!,", "Ġhow", "Ġare", "Ġyou", "?"],
             ["H", "é", "ll", "ó", "Ġ", "ĠWo", "Å", "ķ", "l", "á¸", "Ĭ", "Â", "¿"],
             ["Res", "public", "a", "Ġsuper", "i", "orem"],
             ["Av", "d", "ija", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
+            ["multi", "Ġ", "Ġ", "Ġ", "Ġ", "Ġ", "Ġspace"],
         ]
         expected_token_ids = [
             ["15496", "2159", "28265", "703", "389", "345", "30"],
             ["39", "2634", "297", "10205", "220", "22173", "129", "243", "75", "41585", "232", "126", "123"],
             ["4965", "11377", "64", "2208", "72", "29625"],
             ["7355", "67", "34655", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
+            ["41684", "220", "220", "220", "220", "220", "2272"],
+        ]
+
+        # test batch of sentences
+        if tokenizer._return_tokens:
+            self.assertEqual(tokenizer(sample_texts), expected_tokens)
+        else:
+            self.assertEqual(tokenizer(sample_texts), expected_token_ids)
+
+        # test individual sentences
+        for idx, txt in enumerate(sample_texts):
+            if tokenizer._return_tokens:
+                self.assertEqual(tokenizer(txt), expected_tokens[idx])
+            else:
+                self.assertEqual(tokenizer(txt), expected_token_ids[idx])
+
+    def _gpt2_bpe_tokenizer_with_added_vocab(self, tokenizer):
+        sample_texts = [
+            "<|endoftext|> and <|endoftext|> are special <|endofline|> is not!",
+            "test ACCEPT <avail_actions> with DECLINE <|endoftext|> and NO_ACTION",
+            "none in vocab: <|endofline|> WALK_60M WALK_10M <state>",
+            "Respublica Vršajević în",
+            "some in vocab: <|endofline|> WALK_60M WALK_10M <state>",
+            "<|endoftext|> WALK_60M WALK_10M <reward> <state>",
+        ]
+
+        newly_added = tokenizer.add_special_tokens(
+            special_tokens_dict={
+                "unk_token": "<|endoftext|>",
+                "additional_special_tokens": [
+                    "ACCEPT",
+                    "DECLINE",
+                    "NO_ACTION",
+                    "WALK_10M",
+                    "WALK_60M",
+                    "<reward>",
+                ],
+            }
+        )
+        self.assertEqual(newly_added, 6)
+
+        newly_added = tokenizer.add_special_tokens(
+            special_tokens_dict={
+                "unk_token": "<|endoftext|>",
+                "sep_token": "<avail_actions>",
+                "additional_special_tokens": [
+                    "ACCEPT",
+                    "DECLINE",
+                    "NO_ACTION",
+                    "WALK_10M",
+                    "WALK_60M",
+                    "<reward>",
+                ],
+            }
+        )
+        self.assertEqual(newly_added, 1)
+
+        expected_tokens = [
+            [
+                "<|endoftext|>",
+                "and",
+                "<|endoftext|>",
+                "are",
+                "Ġspecial",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "Ġis",
+                "Ġnot",
+                "!",
+            ],
+            ["test", "ACCEPT", "<avail_actions>", "with", "DECLINE", "<|endoftext|>", "and", "NO_ACTION"],
+            [
+                "none",
+                "Ġin",
+                "Ġvoc",
+                "ab",
+                ":",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "WALK_60M",
+                "WALK_10M",
+                "<",
+                "state",
+                ">",
+            ],
+            ["Res", "public", "a", "ĠV", "r", "Å¡", "aj", "ev", "i", "Äĩ", "ĠÃ", "®", "n"],
+            [
+                "some",
+                "Ġin",
+                "Ġvoc",
+                "ab",
+                ":",
+                "Ġ<",
+                "|",
+                "end",
+                "of",
+                "line",
+                "|",
+                ">",
+                "WALK_60M",
+                "WALK_10M",
+                "<",
+                "state",
+                ">",
+            ],
+            ["<|endoftext|>", "WALK_60M", "WALK_10M", "<reward>", "<", "state", ">"],
+        ]
+        expected_token_ids = [
+            [
+                "50256",
+                "392",
+                "50256",
+                "533",
+                "2041",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "318",
+                "407",
+                "0",
+            ],
+            ["9288", "50257", "50263", "4480", "50258", "50256", "392", "50259"],
+            [
+                "23108",
+                "287",
+                "12776",
+                "397",
+                "25",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "50261",
+                "50260",
+                "27",
+                "5219",
+                "29",
+            ],
+            ["4965", "11377", "64", "569", "81", "32790", "1228", "1990", "72", "38325", "6184", "106", "77"],
+            [
+                "11246",
+                "287",
+                "12776",
+                "397",
+                "25",
+                "1279",
+                "91",
+                "437",
+                "1659",
+                "1370",
+                "91",
+                "29",
+                "50261",
+                "50260",
+                "27",
+                "5219",
+                "29",
+            ],
+            ["50256", "50261", "50260", "50262", "27", "5219", "29"],
         ]

         # test batch of sentences
@@ -391,6 +569,12 @@ def test_gpt2_bpe_decoder(self):
         """test string output returned by decoder given the token ids"""
         self._gpt2_bpe_decoder(self._load_tokenizer(test_scripting=False, return_tokens=False))

+    @nested_params([True, False])
+    def test_gpt2_bpe_tokenizer_with_added_vocab(self, return_tokens):
+        self._gpt2_bpe_tokenizer_with_added_vocab(
+            self._load_tokenizer(test_scripting=False, return_tokens=return_tokens)
+        )
+
     def test_gpt2_bpe_tokenizer_save_load_pybind(self) -> None:
         tokenizer = self._load_tokenizer(test_scripting=False, return_tokens=False)
         tokenizer_path = os.path.join(self.test_dir, "gpt2_tokenizer_pybind.pt")
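
Not part of the diff itself: a minimal usage sketch of the behaviour the new test exercises, assuming torchtext's GPT2BPETokenizer transform and locally available GPT-2 encoder/merges assets (the file paths below are placeholders, not values taken from this change).

from torchtext.transforms import GPT2BPETokenizer

# Placeholder asset paths; the test suite resolves these through its own asset helpers.
tokenizer = GPT2BPETokenizer(
    encoder_json_path="gpt2_bpe_encoder.json",
    vocab_bpe_path="gpt2_bpe_vocab.bpe",
    return_tokens=True,
)

# "<|endoftext|>" is already in the GPT-2 vocabulary, so only the six
# additional_special_tokens count as newly added (matching assertEqual(newly_added, 6) above).
newly_added = tokenizer.add_special_tokens(
    special_tokens_dict={
        "unk_token": "<|endoftext|>",
        "additional_special_tokens": ["ACCEPT", "DECLINE", "NO_ACTION", "WALK_10M", "WALK_60M", "<reward>"],
    }
)
assert newly_added == 6

# Registered special tokens are now emitted as single tokens instead of being
# split by BPE, e.g. roughly ["test", "ACCEPT", "with", "DECLINE", "<|endoftext|>"].
print(tokenizer("test ACCEPT with DECLINE <|endoftext|>"))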