11import argparse
22from collections import (Counter , OrderedDict )
33import time
4- import random
5- import string
6- from timeit import default_timer as timer
7- from matplotlib import pyplot as plt
84import torch
95from torchtext .datasets import DATASETS
106from torchtext .experimental .vocab_factory import (
139)
1410from torchtext .vocab import build_vocab_from_iterator
1511from torchtext .vocab import vocab as VocabNew
16- from torchtext .legacy .vocab import (
17- Vocab ,
18- build_vocab_from_iterator as build_vocab_from_iterator_legacy ,
19- )
20- from torchtext .experimental .transforms import (
12+ from torchtext .experimental .transforms import (
2113 basic_english_normalize ,
2214)
2315from torchtext .data .utils import get_tokenizer
2416
17+
2518def build_vocab (data , transforms ):
2619 def apply_transforms (data ):
2720 for _ , line in data :
@@ -31,96 +24,16 @@ def apply_transforms(data):
3124 return vocab
3225
3326
34- def compare_legacy_and_new_batch_lookup ():
35- num_tokens = 1000
36- num_letters = 6
37- num_lines = 100000
38- vocab = ['' .join (random .sample (string .ascii_letters * num_letters , num_letters )) for _ in range (num_tokens )]
39- counter = Counter ()
40- counter .update (vocab )
41- legacy_vocab = Vocab (counter )
42- new_vocab = VocabNew (counter )
43- speed_ups = []
44- token_lengths = [i for i in range (2 , 100 )]
45- for i in token_lengths :
46- lines = [random .sample (vocab , i ) for _ in range (num_lines )]
47- start_time = timer ()
48- for text in lines :
49- legacy_vocab .lookup_indices (text )
50- legacy_time = timer () - start_time
51-
52- start_time = timer ()
53- for text in lines :
54- new_vocab .lookup_indices (text )
55-
56- new_time = timer () - start_time
57-
58- speed_ups .append (legacy_time / new_time )
59- print ("speed-up={} for average length={}" .format (legacy_time / new_time , i ))
60- del lines
61-
62- plt .close ()
63- fig , ax = plt .subplots (1 , 1 )
64- ax .plot (token_lengths , speed_ups )
65- ax .set_xlabel ('Average Tokens per line' )
66- ax .set_ylabel ('Speed-up' )
67- plt .savefig ("speedup.jpg" )
68-
69-
70- def legacy_vocab_from_file_object (file_like_object , ** kwargs ):
71- r"""Create a `Vocab` object from a file like object.
72-
73- The `file_like_object` should contain tokens separated by new lines. Note that the vocab
74- will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
75-
76- Format for txt file:
77- token1
78- token2
79- ...
80- token_n
81-
82- Args:
83- file_like_object (FileObject): a file like object to read data from.
84- Remaining keyword arguments: Accepted but currently ignored; they are not forwarded to the Vocab constructor.
85-
86- Returns:
87- Vocab: a `Vocab` object.
88-
89- Examples:
90- >>> from torchtext.vocab import vocab_from_file_object
91- >>> f = open('vocab.txt', 'r')
92- >>> v = vocab_from_file_object(f, specials=('<unk>', '<pad>', '<eos>'), specials_first=False)
93- """
94- tokenizer = basic_english_normalize ()
95-
96- def tokenize (line ):
97- return tokenizer (line )
98-
99- def token_iterator (lines ):
100- for line in lines :
101- for token in tokenize (line ):
102- yield token
103-
104- return build_vocab_from_iterator_legacy (token_iterator (file_like_object ))
105-
106-
107- def benchmark_new_vocab_construction (vocab_file_path , is_raw_text = True , is_legacy = True , num_iters = 1 ):
27+ def benchmark_new_vocab_construction (vocab_file_path , is_raw_text = True , num_iters = 1 ):
10828 f = open (vocab_file_path , 'r' )
10929 t0 = time .monotonic ()
11030 if is_raw_text :
111- if is_legacy :
112- print ("Loading from raw text file with legacy python function" )
113- for _ in range (num_iters ):
114- legacy_vocab_from_file_object (f )
115-
116- print ("Construction time:" , time .monotonic () - t0 )
117- else :
118- print ("Loading from raw text file with basic_english_normalize tokenizer" )
119- for _ in range (num_iters ):
120- tokenizer = basic_english_normalize ()
121- jited_tokenizer = torch .jit .script (tokenizer )
122- build_vocab_from_text_file (vocab_file_path , jited_tokenizer , num_cpus = 1 )
123- print ("Construction time:" , time .monotonic () - t0 )
31+ print ("Loading from raw text file with basic_english_normalize tokenizer" )
32+ for _ in range (num_iters ):
33+ tokenizer = basic_english_normalize ()
34+ jited_tokenizer = torch .jit .script (tokenizer )
35+ build_vocab_from_text_file (vocab_file_path , jited_tokenizer , num_cpus = 1 )
36+ print ("Construction time:" , time .monotonic () - t0 )
12437 else :
12538 for _ in range (num_iters ):
12639 load_vocab_from_file (f )
@@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab):
14659 tokens_lists = []
14760 tokenizer = get_tokenizer ("basic_english" )
14861 for (_ , text ) in DATASETS [dataset ](split = 'train' ):
149- cur_tokens = tokenizer (text )
150- tokens_lists .append (cur_tokens )
151- tokens += cur_tokens
62+ cur_tokens = tokenizer (text )
63+ tokens_lists .append (cur_tokens )
64+ tokens += cur_tokens
15265
15366 if vocab_file_path :
15467 print ("Loading Vocab from file {}" .format (vocab_file_path ))
@@ -158,12 +71,6 @@ def token_iterator(file_path):
15871 for token in f :
15972 yield token
16073
161- # existing Vocab construction
162- print ("Vocab" )
163- t0 = time .monotonic ()
164- v_existing = build_vocab_from_iterator_legacy (token_iterator (vocab_file_path ))
165- print ("Construction time:" , time .monotonic () - t0 )
166-
16774 # new Vocab construction
16875 print ("Vocab New" )
16976 t0 = time .monotonic ()
@@ -176,25 +83,13 @@ def token_iterator(file_path):
17683 sorted_by_freq_tuples = sorted (counter .items (), key = lambda x : x [1 ], reverse = True )
17784 ordered_dict = OrderedDict (sorted_by_freq_tuples )
17885
179- # existing Vocab construction
180- print ("Vocab" )
181- t0 = time .monotonic ()
182- v_existing = Vocab (counter )
183- print ("Construction time:" , time .monotonic () - t0 )
184-
18586 # new Vocab construction
18687 print ("Vocab New" )
18788 t0 = time .monotonic ()
18889 v_new = VocabNew (ordered_dict )
18990 print ("Construction time:" , time .monotonic () - t0 )
19091 jit_v_new = torch .jit .script (v_new )
19192
192- # existing Vocab eager lookup
193- print ("Vocab - Eager Mode" )
194- _run_benchmark_lookup (tokens , v_existing )
195- _run_benchmark_lookup ([tokens ], v_existing )
196- _run_benchmark_lookup (tokens_lists , v_existing )
197-
19893 # new Vocab eager lookup
19994 print ("Vocab New - Eager Mode" )
20095 _run_benchmark_lookup (tokens , v_new )
@@ -215,8 +110,6 @@ def token_iterator(file_path):
215110 help = 'run benchmark for constructing a vocab (default=False)' )
216111 parser .add_argument ('--is-raw-text' , type = bool , default = True ,
217112 help = 'construct vocab from raw text file (default=True)' )
218- parser .add_argument ('--is-legacy' , type = bool , default = False ,
219- help = 'construct vocab using legacy implementation (default=False)' )
220113 parser .add_argument ('--vocab-filename-construction' , type = str , default = 'vocab.txt' ,
221114 help = 'The name of vocab file used for construction' )
222115 parser .add_argument ('--vocab-filename-lookup' , type = str , default = None ,
@@ -226,8 +119,7 @@ def token_iterator(file_path):
226119 args = parser .parse_args ()
227120
228121 if args .run_construction_benchmark :
229- print ("is_legacy" , args .is_legacy )
230122 benchmark_new_vocab_construction (args .vocab_filename_construction ,
231- is_raw_text = args .is_raw_text , is_legacy = args . is_legacy )
123+ is_raw_text = args .is_raw_text )
232124 else :
233125 benchmark_new_vocab_lookup (args .vocab_filename_lookup , args .dataset )
0 commit comments