 #include <vocab.h> // @manual
 namespace torchtext {
 
-Vocab::Vocab(const StringList &tokens,
+Vocab::Vocab(StringList tokens,
              const c10::optional<int64_t> &default_index)
     : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
-  for (size_t i = 0; i < tokens.size(); i++) {
+  for (auto &token : tokens) {
     // throw error if duplicate token is found
-    auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()});
+    auto id = _find(c10::string_view{token});
     TORCH_CHECK(stoi_[id] == -1,
-                "Duplicate token found in tokens list: " + tokens[i]);
+                "Duplicate token found in tokens list: " + token);
 
-    _add(tokens[i]);
+    _add(std::move(token));
   }
 }
 
-Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {}
+Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {}
 
 int64_t Vocab::__len__() const { return itos_.size(); }
 
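The hunk above switches both constructors to the sink-argument idiom: tokens is taken by value and each string is moved into the vocab's storage, so a caller passing an rvalue pays one move per token instead of a copy. A minimal sketch of the idiom, separate from the torchtext code (VocabSketch and its members are illustrative names):

    // Sink argument: take the container by value, then move each element
    // into internal storage. An rvalue caller pays moves, not copies.
    #include <string>
    #include <utility>
    #include <vector>

    struct VocabSketch {
      std::vector<std::string> itos_;

      explicit VocabSketch(std::vector<std::string> tokens) {
        itos_.reserve(tokens.size());
        for (auto &token : tokens) {
          itos_.push_back(std::move(token)); // steal each string's buffer
        }
      }
    };

    int main() {
      std::vector<std::string> tokens{"hello", "world"};
      VocabSketch v(std::move(tokens)); // no per-token copy on this path
    }

A caller that still passes an lvalue gets exactly one copy (into the parameter), so the worst case matches the old const-reference signature.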
@@ -54,17 +54,17 @@ c10::optional<int64_t> Vocab::get_default_index() const {
   return default_index_;
 }
 
-void Vocab::append_token(const std::string &token) {
+void Vocab::append_token(std::string token) {
   // throw error if token already exists in vocab
-  auto id = _find(c10::string_view{token.data(), token.size()});
+  auto id = _find(c10::string_view{token});
   TORCH_CHECK(stoi_[id] == -1, "Token " + token +
                                    " already exists in the Vocab with index: " +
                                    std::to_string(stoi_[id]));
 
-  _add(token);
+  _add(std::move(token));
 }
 
-void Vocab::insert_token(const std::string &token, const int64_t &index) {
+void Vocab::insert_token(std::string token, const int64_t &index) {
   // throw error if index is not valid
   TORCH_CHECK(index >= 0 && index <= __len__(),
               "Specified index " + std::to_string(index) +
@@ -76,11 +76,11 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) {
 
   // need to offset all tokens greater than or equal to index by 1
   for (size_t i = index; i < __len__(); i++) {
-    stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
+    stoi_[_find(c10::string_view{itos_[i]})] = i + 1;
   }
 
-  itos_.insert(itos_.begin() + index, token);
-  stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
+  stoi_[_find(c10::string_view{token})] = index;
+  itos_.insert(itos_.begin() + index, std::move(token));
 }
 
 std::string Vocab::lookup_token(const int64_t &index) {
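Note the order swap in insert_token above: the new code reads token (for the _find probe) before std::move hands it to itos_.insert, because a moved-from std::string is left in a valid but unspecified state. A tiny standalone illustration of the read-first, move-last rule (names are illustrative):

    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::string> storage;
      std::string token = "abc";
      auto id = std::hash<std::string>{}(token); // 1) read the value first
      storage.push_back(std::move(token));       // 2) then relinquish it
      (void)id; // reading `token` after the move would be a bug
    }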
@@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) {
 
 void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
                             const int64_t start_line, const int64_t end_line,
-                            std::shared_ptr<IndexDict> counter) {
+                            const std::shared_ptr<IndexDict> &counter) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
 
@@ -165,7 +165,7 @@ void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
 
 void parse_raw_text_file_chunk(const std::string &file_path, size_t offset,
                                const int64_t start_line, const int64_t end_line,
-                               std::shared_ptr<IndexDict> counter,
+                               const std::shared_ptr<IndexDict> &counter,
                                torch::jit::script::Module &module) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
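Both parser helpers above now take the counter as const std::shared_ptr<IndexDict>&. Passing a shared_ptr by value bumps the atomic reference count on every call; a const reference shares access to the pointee with no ref-count traffic, which is all a per-chunk worker needs. A sketch under the assumption that IndexDict is a string-to-count map (the real alias may differ):

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <unordered_map>

    using IndexDictSketch = std::unordered_map<std::string, std::int64_t>;

    // const& borrows the pointer: no atomic increment/decrement per call.
    void count_token(const std::shared_ptr<IndexDictSketch> &counter,
                     const std::string &token) {
      (*counter)[token]++; // the pointee itself stays mutable
    }

    int main() {
      auto counter = std::make_shared<IndexDictSketch>();
      count_token(counter, "hello");
    }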
@@ -225,9 +225,11 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   // create token freq pairs
   std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
 
-  for (std::string token : unique_tokens) {
-    token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token]));
+  for (std::string &token : unique_tokens) {
+    auto token_freq = tokens_freq[token];
+    token_freq_pairs.emplace_back(std::move(token), token_freq);
   }
+  unique_tokens.clear();
 
   // sort tokens by freq
   if (sort_tokens) {
@@ -236,9 +238,8 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   }
 
   // update unique tokens with correct order
-  unique_tokens.clear();
-  for (const auto &token_freq_pair : token_freq_pairs) {
-    unique_tokens.push_back(token_freq_pair.first);
+  for (auto &token_freq_pair : token_freq_pairs) {
+    unique_tokens.emplace_back(std::move(token_freq_pair.first));
   }
 
   return unique_tokens;
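Two things happen in the _concat_tokens hunks. First, the frequency lookup is hoisted into a local before the key is moved into token_freq_pairs; strictly speaking std::move is only a cast and emplace_back performs the actual move after both arguments are evaluated, but the explicit local makes the read-then-move sequencing obvious and keeps use-after-move checkers quiet. Second, unique_tokens is cleared up front and the strings are later moved back out of the pairs, so the reordering costs no string copies in either direction. A compact sketch of the hoist (names are illustrative):

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    int main() {
      std::unordered_map<std::string, std::int64_t> freq{{"a", 3}};
      std::vector<std::pair<std::string, std::int64_t>> pairs;
      std::string token = "a";

      auto f = freq[token];                    // read the key first...
      pairs.emplace_back(std::move(token), f); // ...then move it into the pair
    }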