@@ -294,26 +294,90 @@ def test_gpt2_bpe_tokenizer_save_load_torchscript(self):
294294
295295
296296class TestCLIPTokenizer (TorchtextTestCase ):
297- def _load_tokenizer (self , test_scripting ):
297+ def _load_tokenizer (self , init_using_merge_only : bool , test_scripting : bool ):
298298 encoder_json = "clip_encoder.json"
299299 bpe_vocab = "clip_vocab.bpe"
300- tokenizer = transforms .CLIPTokenizer (
301- encoder_json_path = get_asset_path (encoder_json ),
302- vocab_bpe_path = get_asset_path (bpe_vocab ),
303- )
300+ num_merges = (
301+ 49152 - 256 - 2
302+ ) # https://github.com/mlfoundations/open_clip/blob/57b3e8ea6ad6bfc2974203945f8fd577e0659468/src/clip/tokenizer.py#L67
303+ if init_using_merge_only :
304+ tokenizer = transforms .CLIPTokenizer (
305+ merges_path = get_asset_path (bpe_vocab ),
306+ num_merges = num_merges ,
307+ )
308+ else :
309+ tokenizer = transforms .CLIPTokenizer (
310+ encoder_json_path = get_asset_path (encoder_json ),
311+ merges_path = get_asset_path (bpe_vocab ),
312+ )
304313 if test_scripting :
305314 tokenizer = torch .jit .script (tokenizer )
306315 return tokenizer
307316
308317 def _clip_tokenizer (self , tokenizer ):
309318 sample_texts = [
310319 "Hello World!, how are you?" ,
311- "<|startoftext|> the quick brown fox jumped over the lazy dog <|endoftext|>"
320+ "<|startoftext|> the quick brown fox jumped over the lazy dog <|endoftext|>" ,
321+ "Awaiting their due award... Photo by Frederick (FN) Noronha. Copyleft. Creative Commons 3.0. Non-commercial. Attribution. May be copied for non-commercial purposes. For other purposes, contact fn at goa-india.org" ,
312322 ]
313323
314324 expected_token_ids = [
315- ['3306' , '1002' , '29325' , '829' , '631' , '592' , '286' ],
316- ['49406' , '518' , '3712' , '2866' , '3240' , '16901' , '962' , '518' , '10753' , '1929' , '49407' ],
325+ ["3306" , "1002" , "29325" , "829" , "631" , "592" , "286" ],
326+ ["49406" , "518" , "3712" , "2866" , "3240" , "16901" , "962" , "518" , "10753" , "1929" , "49407" ],
327+ [
328+ "14872" ,
329+ "911" ,
330+ "2887" ,
331+ "2047" ,
332+ "678" ,
333+ "1125" ,
334+ "638" ,
335+ "18570" ,
336+ "263" ,
337+ "21763" ,
338+ "264" ,
339+ "1062" ,
340+ "521" ,
341+ "1429" ,
342+ "269" ,
343+ "11376" ,
344+ "1823" ,
345+ "269" ,
346+ "4450" ,
347+ "16653" ,
348+ "274" ,
349+ "269" ,
350+ "271" ,
351+ "269" ,
352+ "3353" ,
353+ "268" ,
354+ "6287" ,
355+ "269" ,
356+ "24624" ,
357+ "740" ,
358+ "269" ,
359+ "1270" ,
360+ "655" ,
361+ "36770" ,
362+ "556" ,
363+ "3353" ,
364+ "268" ,
365+ "6287" ,
366+ "22020" ,
367+ "269" ,
368+ "556" ,
369+ "1010" ,
370+ "22020" ,
371+ "267" ,
372+ "3523" ,
373+ "21763" ,
374+ "536" ,
375+ "14399" ,
376+ "268" ,
377+ "1762" ,
378+ "269" ,
379+ "5593" ,
380+ ],
317381 ]
318382
319383 # test batch of sentences
@@ -325,22 +389,24 @@ def _clip_tokenizer(self, tokenizer):
325389
326390 def test_clip_tokenizer (self ):
327391 """test tokenization on single sentence input as well as batch on sentences"""
328- self ._clip_tokenizer (self ._load_tokenizer (test_scripting = False ))
392+ self ._clip_tokenizer (self ._load_tokenizer (init_using_merge_only = True , test_scripting = False ))
393+ self ._clip_tokenizer (self ._load_tokenizer (init_using_merge_only = False , test_scripting = False ))
329394
330395 def test_clip_tokenizer_jit (self ):
331396 """test tokenization with scripting on single sentence input as well as batch on sentences"""
332- self ._clip_tokenizer (self ._load_tokenizer (test_scripting = True ))
397+ self ._clip_tokenizer (self ._load_tokenizer (init_using_merge_only = True , test_scripting = True ))
398+ self ._clip_tokenizer (self ._load_tokenizer (init_using_merge_only = False , test_scripting = True ))
333399
334400 def test_clip_tokenizer_save_load_pybind (self ):
335- tokenizer = self ._load_tokenizer (test_scripting = False )
336- tokenizer_path = os .path .join (self .test_dir , ' gpt2_tokenizer_pybind.pt' )
401+ tokenizer = self ._load_tokenizer (init_using_merge_only = True , test_scripting = False )
402+ tokenizer_path = os .path .join (self .test_dir , "clip_tokenizer_pybind.pt" )
337403 torch .save (tokenizer , tokenizer_path )
338404 loaded_tokenizer = torch .load (tokenizer_path )
339405 self ._clip_tokenizer ((loaded_tokenizer ))
340406
341407 def test_clip_tokenizer_save_load_torchscript (self ):
342- tokenizer = self ._load_tokenizer (test_scripting = False )
343- tokenizer_path = os .path .join (self .test_dir , ' gpt2_tokenizer_torchscript.pt' )
408+ tokenizer = self ._load_tokenizer (init_using_merge_only = True , test_scripting = False )
409+ tokenizer_path = os .path .join (self .test_dir , "clip_tokenizer_torchscript.pt" )
344410 # Call the __prepare_scriptable__() func and convert the building block to the torchbind version
345411 # Not expect users to use the torchbind version on eager mode but still need a CI test here.
346412 torch .save (tokenizer .__prepare_scriptable__ (), tokenizer_path )
0 commit comments