diff --git a/torchtext/csrc/sentencepiece.cpp b/torchtext/csrc/sentencepiece.cpp
index 1fc5e370b2..0099d827d9 100644
--- a/torchtext/csrc/sentencepiece.cpp
+++ b/torchtext/csrc/sentencepiece.cpp
@@ -75,7 +75,7 @@ c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path) {
 }
 
 c10::intrusive_ptr<SentencePiece>
-load_sp_model_string(const std::string &content) {
+load_sp_model_string(std::string content) {
   return c10::make_intrusive<SentencePiece>(std::move(content));
 }
diff --git a/torchtext/csrc/sentencepiece.h b/torchtext/csrc/sentencepiece.h
index 3b26d90ffa..dfd997f52f 100644
--- a/torchtext/csrc/sentencepiece.h
+++ b/torchtext/csrc/sentencepiece.h
@@ -33,6 +33,6 @@ void generate_sp_model(const std::string &filename, const int64_t &vocab_size,
                        const std::string &model_prefix);
 c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path);
 c10::intrusive_ptr<SentencePiece>
-load_sp_model_string(const std::string &content);
+load_sp_model_string(std::string content);
 
 } // namespace torchtext
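The load_sp_model_string hunks above are the simplest instance of the sink-argument idiom this patch applies throughout: take the parameter by value and std::move it into its final destination, so rvalue arguments are moved instead of copied. A minimal standalone sketch of the idiom (the Holder class is hypothetical, not torchtext code):

    #include <string>
    #include <utility>

    // Hypothetical sink-argument example; not part of the torchtext sources.
    struct Holder {
      std::string data_;
      // By-value parameter: lvalue arguments are copied once into `data`,
      // rvalue arguments are moved; the body then pays only one cheap move.
      explicit Holder(std::string data) : data_(std::move(data)) {}
    };

    int main() {
      std::string s = "model bytes";
      Holder a(s);             // one copy (into the parameter) + one move
      Holder b(std::move(s));  // no copy: move into parameter, move into member
      Holder c("temporary");   // parameter constructed in place, then one move
    }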
" "Size of tokens: " @@ -72,7 +72,7 @@ torch::Tensor Vectors::__getitem__(const std::string &token) { torch::Tensor Vectors::lookup_vectors(const std::vector &tokens) { std::vector vectors; for (const std::string &token : tokens) { - vectors.push_back(__getitem__(token)); + vectors.emplace_back(__getitem__(token)); } return torch::stack(vectors, 0); } @@ -206,7 +206,7 @@ _concat_vectors(std::vector> chunk_tokens, constexpr int64_t GRAIN_SIZE = 131072; std::tuple> _load_token_and_vectors_from_file( - const std::string &file_path, const std::string delimiter_str, + const std::string &file_path, const std::string &delimiter_str, int64_t num_cpus, c10::optional opt_unk_tensor) { TORCH_CHECK(delimiter_str.size() == 1, @@ -265,7 +265,7 @@ std::tuple> _load_token_and_vectors_from_file( torch::Tensor unk_tensor; if (opt_unk_tensor) { - unk_tensor = *opt_unk_tensor; + unk_tensor = std::move(*opt_unk_tensor); } else { unk_tensor = torch::zeros({vector_dim}, torch::kFloat32); } diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h index 09716258ca..abe69f5fe6 100644 --- a/torchtext/csrc/vectors.h +++ b/torchtext/csrc/vectors.h @@ -19,12 +19,12 @@ struct Vectors : torch::CustomClassHolder { torch::Tensor vectors_; torch::Tensor unk_tensor_; - explicit Vectors(const IndexMap &stoi, const torch::Tensor vectors, - const torch::Tensor &unk_tensor); + explicit Vectors(const IndexMap &stoi, torch::Tensor vectors, + torch::Tensor unk_tensor); explicit Vectors(const std::vector &tokens, const std::vector &indices, - const torch::Tensor &vectors, - const torch::Tensor &unk_tensor); + torch::Tensor vectors, + torch::Tensor unk_tensor); std::unordered_map get_stoi(); torch::Tensor __getitem__(const std::string &token); torch::Tensor lookup_vectors(const std::vector &tokens); @@ -36,7 +36,7 @@ VectorsStates _serialize_vectors(const c10::intrusive_ptr &self); c10::intrusive_ptr _deserialize_vectors(VectorsStates states); std::tuple> _load_token_and_vectors_from_file( - const std::string &file_path, const std::string delimiter_str, + const std::string &file_path, const std::string &delimiter_str, const int64_t num_cpus, c10::optional opt_unk_tensor); } // namespace torchtext diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index c83652c014..6f7abc8db4 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -7,20 +7,20 @@ #include // @manual namespace torchtext { -Vocab::Vocab(const StringList &tokens, +Vocab::Vocab(StringList tokens, const c10::optional &default_index) : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} { - for (size_t i = 0; i < tokens.size(); i++) { + for (auto &token : tokens) { // throw error if duplicate token is found - auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()}); + auto id = _find(c10::string_view{token}); TORCH_CHECK(stoi_[id] == -1, - "Duplicate token found in tokens list: " + tokens[i]); + "Duplicate token found in tokens list: " + token); - _add(tokens[i]); + _add(std::move(token)); } } -Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {} +Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {} int64_t Vocab::__len__() const { return itos_.size(); } @@ -54,17 +54,17 @@ c10::optional Vocab::get_default_index() const { return default_index_; } -void Vocab::append_token(const std::string &token) { +void Vocab::append_token(std::string token) { // throw error if token already exist in vocab - auto id = _find(c10::string_view{token.data(), token.size()}); + auto id = _find(c10::string_view{token}); 
diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index c83652c014..6f7abc8db4 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -7,20 +7,20 @@
 #include <torch/script.h> // @manual
 
 namespace torchtext {
 
-Vocab::Vocab(const StringList &tokens,
+Vocab::Vocab(StringList tokens,
              const c10::optional<int64_t> &default_index)
     : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
-  for (size_t i = 0; i < tokens.size(); i++) {
+  for (auto &token : tokens) {
     // throw error if duplicate token is found
-    auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()});
+    auto id = _find(c10::string_view{token});
     TORCH_CHECK(stoi_[id] == -1,
-                "Duplicate token found in tokens list: " + tokens[i]);
+                "Duplicate token found in tokens list: " + token);
 
-    _add(tokens[i]);
+    _add(std::move(token));
   }
 }
 
-Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {}
+Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {}
 
 int64_t Vocab::__len__() const { return itos_.size(); }
@@ -54,17 +54,17 @@ c10::optional<int64_t> Vocab::get_default_index() const {
   return default_index_;
 }
 
-void Vocab::append_token(const std::string &token) {
+void Vocab::append_token(std::string token) {
   // throw error if token already exist in vocab
-  auto id = _find(c10::string_view{token.data(), token.size()});
+  auto id = _find(c10::string_view{token});
   TORCH_CHECK(stoi_[id] == -1,
               "Token " + token +
                   " already exists in the Vocab with index: " +
                   std::to_string(stoi_[id]));
-  _add(token);
+  _add(std::move(token));
 }
 
-void Vocab::insert_token(const std::string &token, const int64_t &index) {
+void Vocab::insert_token(std::string token, const int64_t &index) {
   // throw error if index is not valid
   TORCH_CHECK(index >= 0 && index <= __len__(),
               "Specified index " + std::to_string(index) +
@@ -76,11 +76,11 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) {
 
   // need to offset all tokens greater than or equal index by 1
   for (size_t i = index; i < __len__(); i++) {
-    stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
+    stoi_[_find(c10::string_view{itos_[i]})] = i + 1;
   }
 
-  itos_.insert(itos_.begin() + index, token);
-  stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
+  stoi_[_find(c10::string_view{token})] = index;
+  itos_.insert(itos_.begin() + index, std::move(token));
 }
 
 std::string Vocab::lookup_token(const int64_t &index) {
@@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) {
 
 void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
                             const int64_t start_line, const int64_t end_line,
-                            std::shared_ptr<IndexDict> counter) {
+                            const std::shared_ptr<IndexDict> &counter) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
@@ -165,7 +165,7 @@
 
 void parse_raw_text_file_chunk(const std::string &file_path, size_t offset,
                                const int64_t start_line, const int64_t end_line,
-                               std::shared_ptr<IndexDict> counter,
+                               const std::shared_ptr<IndexDict> &counter,
                                torch::jit::script::Module &module) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
@@ -225,9 +225,11 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
 
   // create token freq pairs
   std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
 
-  for (std::string token : unique_tokens) {
-    token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token]));
+  for (std::string &token : unique_tokens) {
+    auto token_freq = tokens_freq[token];
+    token_freq_pairs.emplace_back(std::move(token), token_freq);
   }
+  unique_tokens.clear();
 
   // sort tokens by freq
   if (sort_tokens) {
@@ -236,9 +238,8 @@
   }
 
   // update unique tokens with correct order
-  unique_tokens.clear();
-  for (const auto &token_freq_pair : token_freq_pairs) {
-    unique_tokens.push_back(token_freq_pair.first);
+  for (auto &token_freq_pair : token_freq_pairs) {
+    unique_tokens.emplace_back(std::move(token_freq_pair.first));
   }
 
   return unique_tokens;
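One subtlety in the insert_token hunk: turning the parameter into a sink forces a statement reordering. The stoi_ update reads the token through a c10::string_view, so it must run before itos_.insert(..., std::move(token)) leaves the string moved-from. A toy illustration of the hazard (hypothetical code, not torchtext's):

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::string> itos;
      std::string token = "hello";

      // Read the string first...
      const std::size_t h = std::hash<std::string>{}(token);
      // ...then hand it off. Swapping these two statements would hash a
      // moved-from (valid but unspecified) string -- a use-after-move bug.
      itos.insert(itos.begin(), std::move(token));

      return h == std::hash<std::string>{}(itos.front()) ? 0 : 1;
    }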
diff --git a/torchtext/csrc/vocab.h b/torchtext/csrc/vocab.h
index 66f04aa1a0..50ac492a63 100644
--- a/torchtext/csrc/vocab.h
+++ b/torchtext/csrc/vocab.h
@@ -27,7 +27,7 @@ int64_t _infer_lines(const std::string &file_path);
 struct Vocab : torch::CustomClassHolder {
   static const int32_t MAX_VOCAB_SIZE = 30000000;
-  int64_t unk_index_;
+  int64_t unk_index_{};
   std::vector<int64_t> stoi_;
   const std::string version_str_ = "0.0.2";
   StringList itos_;
@@ -36,16 +36,16 @@ struct Vocab : torch::CustomClassHolder {
   // TODO: [can we remove this?] we need to keep this constructor, otherwise
   // torch binding gets compilation error: no matching constructor for
   // initialization of 'torchtext::Vocab'
-  explicit Vocab(const StringList &tokens);
-  explicit Vocab(const StringList &tokens,
+  explicit Vocab(StringList tokens);
+  explicit Vocab(StringList tokens,
                  const c10::optional<int64_t> &default_index);
   int64_t __len__() const;
   int64_t __getitem__(const c10::string_view &token) const;
   bool __contains__(const c10::string_view &token) const;
   void set_default_index(c10::optional<int64_t> index);
   c10::optional<int64_t> get_default_index() const;
-  void insert_token(const std::string &token, const int64_t &index);
-  void append_token(const std::string &token);
+  void insert_token(std::string token, const int64_t &index);
+  void append_token(std::string token);
   std::string lookup_token(const int64_t &index);
   std::vector<std::string> lookup_tokens(const std::vector<int64_t> &indices);
   std::vector<int64_t>
@@ -72,10 +72,10 @@ struct Vocab : torch::CustomClassHolder {
     return id;
   }
 
-  void _add(const std::string &w) {
+  void _add(std::string w) {
     uint32_t h = _find(c10::string_view{w.data(), w.size()});
     if (stoi_[h] == -1) {
-      itos_.push_back(w);
+      itos_.emplace_back(std::move(w));
       stoi_[h] = itos_.size() - 1;
     }
   }
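Lastly, int64_t unk_index_{}; in vocab.h adds a default member initializer: with empty braces the field is value-initialized to zero, whereas the old declaration left it indeterminate in any constructor that forgot to assign it. A minimal sketch of the difference (hypothetical structs, not torchtext code):

    #include <cstdint>

    struct OldStyle {
      int64_t unk_index_;   // indeterminate unless a constructor assigns it
    };

    struct NewStyle {
      int64_t unk_index_{}; // value-initialized to 0 in every constructor
    };

    int main() {
      NewStyle v;
      return static_cast<int>(v.unk_index_); // guaranteed 0
    }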