 #include <vocab.h> // @manual
 namespace torchtext {
 
-Vocab::Vocab(const StringList &tokens,
+Vocab::Vocab(StringList tokens,
              const c10::optional<int64_t> &default_index)
     : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
-  for (size_t i = 0; i < tokens.size(); i++) {
+  for (auto &token : tokens) {
     // throw error if duplicate token is found
-    auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()});
+    auto id = _find(c10::string_view{token});
     TORCH_CHECK(stoi_[id] == -1,
-                "Duplicate token found in tokens list: " + tokens[i]);
+                "Duplicate token found in tokens list: " + token);
 
-    _add(tokens[i]);
+    _add(std::move(token));
   }
 }
 
-Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {}
+Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {}
 
 int64_t Vocab::__len__() const { return itos_.size(); }
 
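The hunk above switches both constructors to the sink-argument idiom: tokens is taken by value and each string is moved into the vocab's storage, so a caller passing an rvalue pays one move per token instead of a copy. A minimal sketch of the idiom, separate from the torchtext code (VocabSketch and its members are illustrative names):

    // Sink argument: take the container by value, then move each element
    // into internal storage. An rvalue caller pays moves, not copies.
    #include <string>
    #include <utility>
    #include <vector>

    struct VocabSketch {
      std::vector<std::string> itos_;

      explicit VocabSketch(std::vector<std::string> tokens) {
        itos_.reserve(tokens.size());
        for (auto &token : tokens) {
          itos_.push_back(std::move(token)); // steal each string's buffer
        }
      }
    };

    int main() {
      std::vector<std::string> tokens{"hello", "world"};
      VocabSketch v(std::move(tokens)); // no per-token copy on this path
    }

A caller that still passes an lvalue gets exactly one copy (into the parameter), so the worst case matches the old const-reference signature.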
@@ -54,17 +54,17 @@ c10::optional<int64_t> Vocab::get_default_index() const {
   return default_index_;
 }
 
-void Vocab::append_token(const std::string &token) {
+void Vocab::append_token(std::string token) {
   // throw error if token already exists in vocab
-  auto id = _find(c10::string_view{token.data(), token.size()});
+  auto id = _find(c10::string_view{token});
   TORCH_CHECK(stoi_[id] == -1, "Token " + token +
                                    " already exists in the Vocab with index: " +
                                    std::to_string(stoi_[id]));
 
-  _add(token);
+  _add(std::move(token));
 }
 
-void Vocab::insert_token(const std::string &token, const int64_t &index) {
+void Vocab::insert_token(std::string token, const int64_t &index) {
   // throw error if index is not valid
   TORCH_CHECK(index >= 0 && index <= __len__(),
               "Specified index " + std::to_string(index) +
@@ -76,11 +76,11 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) {
 
   // need to offset all tokens greater than or equal to index by 1
   for (size_t i = index; i < __len__(); i++) {
-    stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
+    stoi_[_find(c10::string_view{itos_[i]})] = i + 1;
   }
 
-  itos_.insert(itos_.begin() + index, token);
-  stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
+  stoi_[_find(c10::string_view{token})] = index;
+  itos_.insert(itos_.begin() + index, std::move(token));
 }
 
 std::string Vocab::lookup_token(const int64_t &index) {
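Note the order swap in insert_token above: the new code reads token (for the _find probe) before std::move hands it to itos_.insert, because a moved-from std::string is left in a valid but unspecified state. A tiny standalone illustration of the read-first, move-last rule (names are illustrative):

    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::string> storage;
      std::string token = "abc";
      auto id = std::hash<std::string>{}(token); // 1) read the value first
      storage.push_back(std::move(token));       // 2) then relinquish it
      (void)id; // reading `token` after the move would be a bug
    }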
@@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) {
 
 void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
                             const int64_t start_line, const int64_t end_line,
-                            std::shared_ptr<IndexDict> counter) {
+                            const std::shared_ptr<IndexDict> &counter) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
 
@@ -165,7 +165,7 @@ void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
 
 void parse_raw_text_file_chunk(const std::string &file_path, size_t offset,
                                const int64_t start_line, const int64_t end_line,
-                               std::shared_ptr<IndexDict> counter,
+                               const std::shared_ptr<IndexDict> &counter,
                                torch::jit::script::Module &module) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
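Both parser helpers above now take the counter as const std::shared_ptr<IndexDict>&. Passing a shared_ptr by value bumps the atomic reference count on every call; a const reference shares access to the pointee with no ref-count traffic, which is all a per-chunk worker needs. A sketch under the assumption that IndexDict is a string-to-count map (the real alias may differ):

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <unordered_map>

    using IndexDictSketch = std::unordered_map<std::string, std::int64_t>;

    // const& borrows the pointer: no atomic increment/decrement per call.
    void count_token(const std::shared_ptr<IndexDictSketch> &counter,
                     const std::string &token) {
      (*counter)[token]++; // the pointee itself stays mutable
    }

    int main() {
      auto counter = std::make_shared<IndexDictSketch>();
      count_token(counter, "hello");
    }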
@@ -225,9 +225,11 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   // create token freq pairs
   std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
 
-  for (std::string token : unique_tokens) {
-    token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token]));
+  for (std::string &token : unique_tokens) {
+    auto token_freq = tokens_freq[token];
+    token_freq_pairs.emplace_back(std::move(token), token_freq);
   }
+  unique_tokens.clear();
 
   // sort tokens by freq
   if (sort_tokens) {
@@ -236,9 +238,8 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   }
 
   // update unique tokens with correct order
-  unique_tokens.clear();
-  for (const auto &token_freq_pair : token_freq_pairs) {
-    unique_tokens.push_back(token_freq_pair.first);
+  for (auto &token_freq_pair : token_freq_pairs) {
+    unique_tokens.emplace_back(std::move(token_freq_pair.first));
   }
 
   return unique_tokens;
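Two things happen in the _concat_tokens hunks. First, the frequency lookup is hoisted into a local before the key is moved into token_freq_pairs; strictly speaking std::move is only a cast and emplace_back performs the actual move after both arguments are evaluated, but the explicit local makes the read-then-move sequencing obvious and keeps use-after-move checkers quiet. Second, unique_tokens is cleared up front and the strings are later moved back out of the pairs, so the reordering costs no string copies in either direction. A compact sketch of the hoist (names are illustrative):

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    int main() {
      std::unordered_map<std::string, std::int64_t> freq{{"a", 3}};
      std::vector<std::pair<std::string, std::int64_t>> pairs;
      std::string token = "a";

      auto f = freq[token];                    // read the key first...
      pairs.emplace_back(std::move(token), f); // ...then move it into the pair
    }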