From 53f6e1ae60d9dff3c9c9bf27b742211fb510a435 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Sun, 13 Jun 2021 14:04:16 +0800
Subject: [PATCH 01/15] use std::move

---
 torchtext/csrc/vocab.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index c83652c014..bb9dad9f8e 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -225,9 +225,11 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   // create token freq pairs
   std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
 
-  for (std::string token : unique_tokens) {
-    token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token]));
+  for (std::string &token : unique_tokens) {
+    auto token_freq = tokens_freq[token];
+    token_freq_pairs.emplace_back(std::move(token), token_freq);
   }
+  unique_tokens.clear();
 
   // sort tokens by freq
   if (sort_tokens) {
@@ -236,9 +238,8 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
   }
 
   // update unique tokens with correct order
-  unique_tokens.clear();
-  for (const auto &token_freq_pair : token_freq_pairs) {
-    unique_tokens.push_back(token_freq_pair.first);
+  for (auto &token_freq_pair : token_freq_pairs) {
+    unique_tokens.emplace_back(std::move(token_freq_pair.first));
   }
 
   return unique_tokens;

From c30062e1c5b7b34b420368a2ebf0d49d3d9bd9c9 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Sun, 13 Jun 2021 14:09:21 +0800
Subject: [PATCH 02/15] use const ref

---
 torchtext/csrc/vectors.cpp | 2 +-
 torchtext/csrc/vectors.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index c5410b1496..b7554d3926 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -15,7 +15,7 @@
 
 namespace torchtext {
 
-Vectors::Vectors(const IndexMap &stoi, const torch::Tensor vectors,
+Vectors::Vectors(const IndexMap &stoi, const torch::Tensor &vectors,
                  const torch::Tensor &unk_tensor)
     : stoi_(stoi), vectors_(vectors), unk_tensor_(unk_tensor) {}
 
diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h
index 09716258ca..0f17314eaa 100644
--- a/torchtext/csrc/vectors.h
+++ b/torchtext/csrc/vectors.h
@@ -19,7 +19,7 @@ struct Vectors : torch::CustomClassHolder {
   torch::Tensor vectors_;
   torch::Tensor unk_tensor_;
 
-  explicit Vectors(const IndexMap &stoi, const torch::Tensor vectors,
+  explicit Vectors(const IndexMap &stoi, const torch::Tensor &vectors,
                    const torch::Tensor &unk_tensor);
   explicit Vectors(const std::vector<std::string> &tokens,
                    const std::vector<std::int64_t> &indices,

From 27db28c79d8c44902081ed613943194c829d8beb Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Sun, 13 Jun 2021 14:09:31 +0800
Subject: [PATCH 03/15] init member

---
 torchtext/csrc/vocab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/csrc/vocab.h b/torchtext/csrc/vocab.h
index 66f04aa1a0..804d8cd2f1 100644
--- a/torchtext/csrc/vocab.h
+++ b/torchtext/csrc/vocab.h
@@ -27,7 +27,7 @@ int64_t _infer_lines(const std::string &file_path);
 
 struct Vocab : torch::CustomClassHolder {
   static const int32_t MAX_VOCAB_SIZE = 30000000;
-  int64_t unk_index_;
+  int64_t unk_index_{};
   std::vector<int32_t> stoi_;
   const std::string version_str_ = "0.0.2";
   StringList itos_;

From bad4a8334eea86f117bf3025269aefbc91bbfe22 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 15:44:46 +0800
Subject: [PATCH 04/15] add std::move

---
 torchtext/csrc/vocab.cpp | 48 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index bb9dad9f8e..11776a3358 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -335,6 +335,54 @@ Vocab _build_vocab_from_text_file(const std::string &file_path,
   return Vocab(std::move(tokens));
 }
 
+Vocab _build_vocab_from_text_file_using_python_tokenizer(
+    const std::string &file_path, const int64_t min_freq,
+    py::object tokenizer) {
+  // find number of lines
+  int64_t num_lines = _infer_lines(file_path);
+  // Read text from file and add tokens
+  std::ifstream fin(file_path, std::ios::in);
+  TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
+
+  IndexDict counter;
+  std::string line;
+  for (int64_t i = 0; i < num_lines; i++) {
+    std::getline(fin, line);
+    std::vector<std::string> token_list =
+        tokenizer(line).cast<std::vector<std::string>>();
+
+    for (size_t i = 0; i < token_list.size(); i++) {
+      std::string token = token_list[i];
+
+      if (counter.find(token) == counter.end()) {
+        counter[token] = 1;
+      } else {
+        counter[token] += 1;
+      }
+    }
+  }
+
+  // create tokens-frequency pairs
+  std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
+  for (const auto &item : counter) {
+    if (item.second >= min_freq) {
+      token_freq_pairs.push_back(item);
+    }
+  }
+
+  // sort tokens by frequency
+  CompareTokens compare_tokens;
+  std::sort(token_freq_pairs.begin(), token_freq_pairs.end(), compare_tokens);
+
+  // Create final list of tokens
+  StringList tokens;
+  for (auto &token_freq_pair : token_freq_pairs) {
+    tokens.emplace_back(std::move(token_freq_pair.first));
+  }
+
+  return Vocab(std::move(tokens));
+}
+
 VocabStates _serialize_vocab(const c10::intrusive_ptr<Vocab> &self) {
   std::vector<int64_t> integers;
   StringList strings = self->itos_;

From 912a7446c7b9023f9674dcd0766ac4fa0312f5ac Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 16:41:30 +0800
Subject: [PATCH 05/15] more std::move

---
 torchtext/csrc/vocab.cpp | 20 ++++++++++----------
 torchtext/csrc/vocab.h   | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index 11776a3358..f5e89e2f01 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -7,20 +7,20 @@
 #include <vocab.h>       // @manual
 namespace torchtext {
 
-Vocab::Vocab(const StringList &tokens,
+Vocab::Vocab(StringList tokens,
              const c10::optional<int64_t> &default_index)
     : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
-  for (size_t i = 0; i < tokens.size(); i++) {
+  for (auto &token : tokens) {
     // throw error if duplicate token is found
-    auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()});
+    auto id = _find(c10::string_view{token.data(), token.size()});
     TORCH_CHECK(stoi_[id] == -1,
-                "Duplicate token found in tokens list: " + tokens[i]);
+                "Duplicate token found in tokens list: " + token);
 
-    _add(tokens[i]);
+    _add(std::move(token));
   }
 }
 
-Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {}
+Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {}
 
 int64_t Vocab::__len__() const { return itos_.size(); }
 
@@ -54,17 +54,17 @@ c10::optional<int64_t> Vocab::get_default_index() const {
   return default_index_;
 }
 
-void Vocab::append_token(const std::string &token) {
+void Vocab::append_token(std::string token) {
   // throw error if token already exist in vocab
   auto id = _find(c10::string_view{token.data(), token.size()});
   TORCH_CHECK(stoi_[id] == -1, "Token " + token +
                                    " already exists in the Vocab with index: " +
                                    std::to_string(stoi_[id]));
 
-  _add(token);
+  _add(std::move(token));
 }
 
-void Vocab::insert_token(const std::string &token, const int64_t &index) {
+void Vocab::insert_token(std::string token, const int64_t &index) {
   // throw error if index is not valid
   TORCH_CHECK(index >= 0 && index <= __len__(),
               "Specified index " + std::to_string(index) +
@@ -79,8 +79,8 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) {
     stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
   }
 
-  itos_.insert(itos_.begin() + index, token);
   stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
+  itos_.insert(itos_.begin() + index, std::move(token));
 }
 
 std::string Vocab::lookup_token(const int64_t &index) {
diff --git a/torchtext/csrc/vocab.h b/torchtext/csrc/vocab.h
index 804d8cd2f1..50ac492a63 100644
--- a/torchtext/csrc/vocab.h
+++ b/torchtext/csrc/vocab.h
@@ -36,16 +36,16 @@ struct Vocab : torch::CustomClassHolder {
   // TODO: [can we remove this?] we need to keep this constructor, otherwise
   // torch binding gets compilation error: no matching constructor for
   // initialization of 'torchtext::Vocab'
-  explicit Vocab(const StringList &tokens);
-  explicit Vocab(const StringList &tokens,
+  explicit Vocab(StringList tokens);
+  explicit Vocab(StringList tokens,
                  const c10::optional<int64_t> &default_index);
   int64_t __len__() const;
   int64_t __getitem__(const c10::string_view &token) const;
   bool __contains__(const c10::string_view &token) const;
   void set_default_index(c10::optional<int64_t> index);
   c10::optional<int64_t> get_default_index() const;
-  void insert_token(const std::string &token, const int64_t &index);
-  void append_token(const std::string &token);
+  void insert_token(std::string token, const int64_t &index);
+  void append_token(std::string token);
   std::string lookup_token(const int64_t &index);
   std::vector<std::string> lookup_tokens(const std::vector<int64_t> &indices);
   std::vector<int64_t>
@@ -72,10 +72,10 @@ struct Vocab : torch::CustomClassHolder {
     return id;
   }
 
-  void _add(const std::string &w) {
+  void _add(std::string w) {
     uint32_t h = _find(c10::string_view{w.data(), w.size()});
     if (stoi_[h] == -1) {
-      itos_.push_back(w);
+      itos_.emplace_back(std::move(w));
       stoi_[h] = itos_.size() - 1;
     }
   }

From 5993d1367d299a799e5efa5be0f4e0c3e37a1256 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 16:48:29 +0800
Subject: [PATCH 06/15] use the std::string constructor of c10::string_view and
 more std::move

---
 torchtext/csrc/vocab.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index f5e89e2f01..70eef0dc14 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -12,7 +12,7 @@ Vocab::Vocab(StringList tokens,
     : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
   for (auto &token : tokens) {
     // throw error if duplicate token is found
-    auto id = _find(c10::string_view{token.data(), token.size()});
+    auto id = _find(c10::string_view{token});
     TORCH_CHECK(stoi_[id] == -1,
                 "Duplicate token found in tokens list: " + token);
 
@@ -56,7 +56,7 @@ c10::optional<int64_t> Vocab::get_default_index() const {
 
 void Vocab::append_token(std::string token) {
   // throw error if token already exist in vocab
-  auto id = _find(c10::string_view{token.data(), token.size()});
+  auto id = _find(c10::string_view{token});
   TORCH_CHECK(stoi_[id] == -1, "Token " + token +
                                    " already exists in the Vocab with index: " +
                                    std::to_string(stoi_[id]));
@@ -76,10 +76,10 @@ void Vocab::insert_token(std::string token, const int64_t &index) {
 
   // need to offset all tokens greater than or equal index by 1
   for (size_t i = index; i < __len__(); i++) {
-    stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
+    stoi_[_find(c10::string_view{itos_[i]})] = i + 1;
   }
 
-  stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
+  stoi_[_find(c10::string_view{token})] = index;
   itos_.insert(itos_.begin() + index, std::move(token));
 }
 
@@ -351,14 +351,8 @@ Vocab _build_vocab_from_text_file_using_python_tokenizer(
     std::vector<std::string> token_list =
         tokenizer(line).cast<std::vector<std::string>>();
 
-    for (size_t i = 0; i < token_list.size(); i++) {
-      std::string token = token_list[i];
-
-      if (counter.find(token) == counter.end()) {
-        counter[token] = 1;
-      } else {
-        counter[token] += 1;
-      }
+    for (auto &token : token_list) {
+      counter[std::move(token)] += 1;
     }
   }
 

From 8404f2b0b39ce516c0950fa191bdd7c252fd1b8c Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 19:04:53 +0800
Subject: [PATCH 07/15] more std::move

---
 torchtext/csrc/vectors.cpp | 8 ++++----
 torchtext/csrc/vectors.h   | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index b7554d3926..fd5a11e915 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -15,13 +15,13 @@
 
 namespace torchtext {
 
-Vectors::Vectors(const IndexMap &stoi, const torch::Tensor &vectors,
-                 const torch::Tensor &unk_tensor)
-    : stoi_(stoi), vectors_(vectors), unk_tensor_(unk_tensor) {}
+Vectors::Vectors(const IndexMap &stoi, torch::Tensor vectors,
+                 torch::Tensor unk_tensor)
+    : stoi_(stoi), vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) {}
 
 Vectors::Vectors(const std::vector<std::string> &tokens,
                  const std::vector<std::int64_t> &indices,
-                 const torch::Tensor &vectors, const torch::Tensor &unk_tensor)
+                 torch::Tensor vectors, torch::Tensor unk_tensor)
     : vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) {
   // guarding against size mismatch of tokens and indices
   if (static_cast<int>(tokens.size()) != indices.size()) {
diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h
index 0f17314eaa..10ca758d40 100644
--- a/torchtext/csrc/vectors.h
+++ b/torchtext/csrc/vectors.h
@@ -19,12 +19,12 @@ struct Vectors : torch::CustomClassHolder {
   torch::Tensor vectors_;
   torch::Tensor unk_tensor_;
 
-  explicit Vectors(const IndexMap &stoi, const torch::Tensor &vectors,
-                   const torch::Tensor &unk_tensor);
+  explicit Vectors(const IndexMap &stoi, torch::Tensor vectors,
+                   torch::Tensor unk_tensor);
   explicit Vectors(const std::vector<std::string> &tokens,
                    const std::vector<std::int64_t> &indices,
-                   const torch::Tensor &vectors,
-                   const torch::Tensor &unk_tensor);
+                   torch::Tensor vectors,
+                   torch::Tensor unk_tensor);
   std::unordered_map<std::string, int64_t> get_stoi();
   torch::Tensor __getitem__(const std::string &token);
   torch::Tensor lookup_vectors(const std::vector<std::string> &tokens);

From 4159ebe3d6e95511c75fa9f48f6e94895f10f559 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 20:35:37 +0800
Subject: [PATCH 08/15] more std::move

---
 torchtext/csrc/sentencepiece.cpp | 2 +-
 torchtext/csrc/sentencepiece.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchtext/csrc/sentencepiece.cpp b/torchtext/csrc/sentencepiece.cpp
index 1fc5e370b2..0099d827d9 100644
--- a/torchtext/csrc/sentencepiece.cpp
+++ b/torchtext/csrc/sentencepiece.cpp
@@ -75,7 +75,7 @@ c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path) {
 }
 
 c10::intrusive_ptr<SentencePiece>
-load_sp_model_string(const std::string &content) {
+load_sp_model_string(std::string content) {
   return c10::make_intrusive<SentencePiece>(std::move(content));
 }
 
diff --git a/torchtext/csrc/sentencepiece.h b/torchtext/csrc/sentencepiece.h
index 3b26d90ffa..dfd997f52f 100644
--- a/torchtext/csrc/sentencepiece.h
+++ b/torchtext/csrc/sentencepiece.h
@@ -33,6 +33,6 @@ void generate_sp_model(const std::string &filename, const int64_t &vocab_size,
                        const std::string &model_prefix);
 c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path);
 c10::intrusive_ptr<SentencePiece>
-load_sp_model_string(const std::string &content);
+load_sp_model_string(std::string content);
 
 } // namespace torchtext

From 714f8c0667cf676318efda8032dc4427fbd3eb0d Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 20:57:54 +0800
Subject: [PATCH 09/15] use const reference

---
 torchtext/csrc/vectors.cpp | 2 +-
 torchtext/csrc/vectors.h   | 2 +-
 torchtext/csrc/vocab.cpp   | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index fd5a11e915..c9353782c9 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -206,7 +206,7 @@ _concat_vectors(std::vector<std::shared_ptr<StringList>> chunk_tokens,
 
 constexpr int64_t GRAIN_SIZE = 131072;
 std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
-    const std::string &file_path, const std::string delimiter_str,
+    const std::string &file_path, const std::string &delimiter_str,
     int64_t num_cpus, c10::optional<torch::Tensor> opt_unk_tensor) {
 
   TORCH_CHECK(delimiter_str.size() == 1,
diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h
index 10ca758d40..abe69f5fe6 100644
--- a/torchtext/csrc/vectors.h
+++ b/torchtext/csrc/vectors.h
@@ -36,7 +36,7 @@ VectorsStates _serialize_vectors(const c10::intrusive_ptr<Vectors> &self);
 c10::intrusive_ptr<Vectors> _deserialize_vectors(VectorsStates states);
 
 std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
-    const std::string &file_path, const std::string delimiter_str,
+    const std::string &file_path, const std::string &delimiter_str,
     const int64_t num_cpus, c10::optional<torch::Tensor> opt_unk_tensor);
 
 } // namespace torchtext
diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index 70eef0dc14..ca883f7545 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) {
 
 void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
                             const int64_t start_line, const int64_t end_line,
-                            std::shared_ptr<IndexDict> counter) {
+                            const std::shared_ptr<IndexDict> &counter) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
 
@@ -165,7 +165,7 @@ void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
 
 void parse_raw_text_file_chunk(const std::string &file_path, size_t offset,
                                const int64_t start_line, const int64_t end_line,
-                               std::shared_ptr<IndexDict> counter,
+                               const std::shared_ptr<IndexDict> &counter,
                                torch::jit::script::Module &module) {
   std::ifstream fin(file_path, std::ios::in);
   TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);

From ded11da66ee1977f94a17b0778a91b54c537f5ec Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 21:08:11 +0800
Subject: [PATCH 10/15] more std::move

---
 torchtext/csrc/register_pybindings.cpp | 125 ++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/torchtext/csrc/register_pybindings.cpp b/torchtext/csrc/register_pybindings.cpp
index 80d91ee744..8277ba8565 100644
--- a/torchtext/csrc/register_pybindings.cpp
+++ b/torchtext/csrc/register_pybindings.cpp
@@ -100,7 +100,7 @@ PYBIND11_MODULE(_torchtext, m) {
           },
           // __setstate__
           [](VectorsStates states) -> c10::intrusive_ptr<Vectors> {
-            return _deserialize_vectors(states);
+            return _deserialize_vectors(std::move(states));
           }));
 
   py::class_<Vocab, c10::intrusive_ptr<Vocab>>(m, "Vocab")
@@ -149,7 +149,7 @@ PYBIND11_MODULE(_torchtext, m) {
           },
           // __setstate__
           [](VocabStates states) -> c10::intrusive_ptr<Vocab> {
-            return _deserialize_vocab(states);
+            return _deserialize_vocab(std::move(states));
           }));
 
   // Functions
@@ -159,6 +159,127 @@ PYBIND11_MODULE(_torchtext, m) {
   m.def("_build_vocab_from_text_file", &build_vocab_from_text_file);
   m.def("_build_vocab_from_text_file_using_python_tokenizer",
         &_build_vocab_from_text_file_using_python_tokenizer);
+  m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer);
+}
+
+TORCH_LIBRARY_FRAGMENT(torchtext, m) {
+  m.class_<Regex>("Regex")
+      .def(torch::init<std::string>())
+      .def("Sub", &Regex::Sub)
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<Regex> &self) -> std::string {
+            return _serialize_regex(self);
+          },
+          // __setstate__
+          [](std::string state) -> c10::intrusive_ptr<Regex> {
+            return _deserialize_regex(std::move(state));
+          });
+
+  m.class_<RegexTokenizer>("RegexTokenizer")
+      .def(torch::init<std::vector<std::string>, std::vector<std::string>,
+                       bool>())
+      .def("forward", &RegexTokenizer::forward)
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<RegexTokenizer> &self)
+              -> RegexTokenizerStates {
+            return _serialize_regex_tokenizer(self);
+          },
+          // __setstate__
+          [](RegexTokenizerStates states)
+              -> c10::intrusive_ptr<RegexTokenizer> {
+            return _deserialize_regex_tokenizer(std::move(states));
+          });
+
+  m.class_<SentencePiece>("SentencePiece")
+      .def(torch::init<std::string>())
+      .def("Encode", &SentencePiece::Encode)
+      .def("EncodeAsIds", &SentencePiece::EncodeAsIds)
+      .def("DecodeIds", &SentencePiece::DecodeIds)
+      .def("EncodeAsPieces", &SentencePiece::EncodeAsPieces)
+      .def("DecodePieces", &SentencePiece::DecodePieces)
+      .def("GetPieceSize", &SentencePiece::GetPieceSize)
+      .def("unk_id", &SentencePiece::unk_id)
+      .def("PieceToId", &SentencePiece::PieceToId)
+      .def("IdToPiece", &SentencePiece::IdToPiece)
+      .def_pickle(
+          // The underlying content of SentencePiece contains byte string,
+          // and returing it as std::string cause UTF8 decoding error.
+          // Since TorchScript does not support byte string, we use byte Tensor
+          // to pass around the data.
+          // __getstate__
+          [](const c10::intrusive_ptr<SentencePiece> &self) -> torch::Tensor {
+            auto *data =
+                static_cast<void *>(const_cast<char *>(self->content_.data()));
+            auto numel = static_cast<int64_t>(self->content_.size());
+            return torch::from_blob(data, {numel}, {torch::kUInt8}).clone();
+          },
+          // __setstate__
+          [](torch::Tensor state) -> c10::intrusive_ptr<SentencePiece> {
+            auto *data = static_cast<char *>(state.data_ptr());
+            auto numel = state.size(0);
+            return c10::make_intrusive<SentencePiece>(std::string(data, numel));
+          });
+
+  m.class_<Vectors>("Vectors")
+      .def(torch::init<std::vector<std::string>, std::vector<std::int64_t>,
+                       torch::Tensor, torch::Tensor>())
+      .def("__getitem__", &Vectors::__getitem__)
+      .def("lookup_vectors", &Vectors::lookup_vectors)
+      .def("__setitem__", &Vectors::__setitem__)
+      .def("__len__", &Vectors::__len__)
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<Vectors> &self) -> VectorsStates {
+            return _serialize_vectors(self);
+          },
+          // __setstate__
+          [](VectorsStates states) -> c10::intrusive_ptr<Vectors> {
+            return _deserialize_vectors(std::move(states));
+          });
+
+  m.class_<Vocab>("Vocab")
+      .def(torch::init<StringList, c10::optional<int64_t>>())
+      .def("__contains__",
+           [](const c10::intrusive_ptr<Vocab> &self, const std::string &item)
+               -> bool { return self->__contains__(c10::string_view{item}); })
+      .def("__getitem__",
+           [](const c10::intrusive_ptr<Vocab> &self, const std::string &item)
+               -> int64_t { return self->__getitem__(c10::string_view{item}); })
+      .def("insert_token", &Vocab::insert_token)
+      .def("__len__", &Vocab::__len__)
+      .def("set_default_index", &Vocab::set_default_index)
+      .def("get_default_index", &Vocab::get_default_index)
+      .def("append_token", &Vocab::append_token)
+      .def("lookup_token", &Vocab::lookup_token)
+      .def("lookup_tokens", &Vocab::lookup_tokens)
+      .def("lookup_indices",
+           [](const c10::intrusive_ptr<Vocab> &self,
+              const std::vector<std::string> &items) {
+             std::vector<int64_t> indices(items.size());
+             int64_t counter = 0;
+             for (const auto &item : items) {
+               indices[counter++] = self->__getitem__(c10::string_view{item});
+             }
+             return indices;
+           })
+      .def("get_stoi", &Vocab::get_stoi)
+      .def("get_itos", &Vocab::get_itos)
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<Vocab> &self) -> VocabStates {
+            return _serialize_vocab(self);
+          },
+          // __setstate__
+          [](VocabStates states) -> c10::intrusive_ptr<Vocab> {
+            return _deserialize_vocab(std::move(states));
+          });
+
+  m.def("torchtext::generate_sp_model", &generate_sp_model);
+  m.def("torchtext::load_sp_model", &load_sp_model);
+  m.def("torchtext::load_sp_model_string", &load_sp_model_string);
+>>>>>>> 1c8cd01e (more std::move):torchtext/csrc/register_bindings.cpp
 }
 
 } // namespace torchtext

From 518b87b67b9e10b81e54ddf687c6051f06f9ee69 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 21:17:04 +0800
Subject: [PATCH 11/15] unnecessary type conversion

---
 torchtext/csrc/vectors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index c9353782c9..63aaafaf2d 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -24,7 +24,7 @@ Vectors::Vectors(const std::vector<std::string> &tokens,
                  torch::Tensor vectors, torch::Tensor unk_tensor)
     : vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) {
   // guarding against size mismatch of tokens and indices
-  if (static_cast<int>(tokens.size()) != indices.size()) {
+  if (tokens.size() != indices.size()) {
 #ifdef _MSC_VER
     std::cerr << "[RuntimeError] Mismatching sizes for tokens and indices. "
                  "Size of tokens: "

From e742660eee485a0d80b4571fdb31b9d6558a444f Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 21:21:16 +0800
Subject: [PATCH 12/15] use emplace_back

---
 torchtext/csrc/vectors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index 63aaafaf2d..4a3ca89d17 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -72,7 +72,7 @@ torch::Tensor Vectors::__getitem__(const std::string &token) {
 torch::Tensor Vectors::lookup_vectors(const std::vector<std::string> &tokens) {
   std::vector<torch::Tensor> vectors;
   for (const std::string &token : tokens) {
-    vectors.push_back(__getitem__(token));
+    vectors.emplace_back(__getitem__(token));
   }
   return torch::stack(vectors, 0);
 }

From 0e89418ecf028339b25ccfe2ab874acf1eda129c Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Thu, 17 Jun 2021 21:23:03 +0800
Subject: [PATCH 13/15] use std::move

---
 torchtext/csrc/vectors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp
index 4a3ca89d17..dd187e6ac2 100644
--- a/torchtext/csrc/vectors.cpp
+++ b/torchtext/csrc/vectors.cpp
@@ -265,7 +265,7 @@ std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
 
   torch::Tensor unk_tensor;
   if (opt_unk_tensor) {
-    unk_tensor = *opt_unk_tensor;
+    unk_tensor = std::move(*opt_unk_tensor);
   } else {
     unk_tensor = torch::zeros({vector_dim}, torch::kFloat32);
   }

From 5de883ca137a2f5699f6ae1a0e21d6744b59d844 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Fri, 16 Jul 2021 12:37:28 +0800
Subject: [PATCH 14/15] Revert "more std::move"

This reverts commit ded11da66ee1977f94a17b0778a91b54c537f5ec.
---
 torchtext/csrc/register_pybindings.cpp | 125 +------------------------
 1 file changed, 2 insertions(+), 123 deletions(-)

diff --git a/torchtext/csrc/register_pybindings.cpp b/torchtext/csrc/register_pybindings.cpp
index 8277ba8565..80d91ee744 100644
--- a/torchtext/csrc/register_pybindings.cpp
+++ b/torchtext/csrc/register_pybindings.cpp
@@ -100,7 +100,7 @@ PYBIND11_MODULE(_torchtext, m) {
           },
           // __setstate__
           [](VectorsStates states) -> c10::intrusive_ptr<Vectors> {
-            return _deserialize_vectors(std::move(states));
+            return _deserialize_vectors(states);
           }));
 
   py::class_<Vocab, c10::intrusive_ptr<Vocab>>(m, "Vocab")
@@ -149,7 +149,7 @@ PYBIND11_MODULE(_torchtext, m) {
           },
           // __setstate__
           [](VocabStates states) -> c10::intrusive_ptr<Vocab> {
-            return _deserialize_vocab(std::move(states));
+            return _deserialize_vocab(states);
           }));
 
   // Functions
@@ -159,127 +159,6 @@ PYBIND11_MODULE(_torchtext, m) {
   m.def("_build_vocab_from_text_file", &build_vocab_from_text_file);
   m.def("_build_vocab_from_text_file_using_python_tokenizer",
         &_build_vocab_from_text_file_using_python_tokenizer);
-  m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer);
-}
-
-TORCH_LIBRARY_FRAGMENT(torchtext, m) {
-  m.class_<Regex>("Regex")
-      .def(torch::init<std::string>())
-      .def("Sub", &Regex::Sub)
-      .def_pickle(
-          // __getstate__
-          [](const c10::intrusive_ptr<Regex> &self) -> std::string {
-            return _serialize_regex(self);
-          },
-          // __setstate__
-          [](std::string state) -> c10::intrusive_ptr<Regex> {
-            return _deserialize_regex(std::move(state));
-          });
-
-  m.class_<RegexTokenizer>("RegexTokenizer")
-      .def(torch::init<std::vector<std::string>, std::vector<std::string>,
-                       bool>())
-      .def("forward", &RegexTokenizer::forward)
-      .def_pickle(
-          // __getstate__
-          [](const c10::intrusive_ptr<RegexTokenizer> &self)
-              -> RegexTokenizerStates {
-            return _serialize_regex_tokenizer(self);
-          },
-          // __setstate__
-          [](RegexTokenizerStates states)
-              -> c10::intrusive_ptr<RegexTokenizer> {
-            return _deserialize_regex_tokenizer(std::move(states));
-          });
-
-  m.class_<SentencePiece>("SentencePiece")
-      .def(torch::init<std::string>())
-      .def("Encode", &SentencePiece::Encode)
-      .def("EncodeAsIds", &SentencePiece::EncodeAsIds)
-      .def("DecodeIds", &SentencePiece::DecodeIds)
-      .def("EncodeAsPieces", &SentencePiece::EncodeAsPieces)
-      .def("DecodePieces", &SentencePiece::DecodePieces)
-      .def("GetPieceSize", &SentencePiece::GetPieceSize)
-      .def("unk_id", &SentencePiece::unk_id)
-      .def("PieceToId", &SentencePiece::PieceToId)
-      .def("IdToPiece", &SentencePiece::IdToPiece)
-      .def_pickle(
-          // The underlying content of SentencePiece contains byte string,
-          // and returing it as std::string cause UTF8 decoding error.
-          // Since TorchScript does not support byte string, we use byte Tensor
-          // to pass around the data.
-          // __getstate__
-          [](const c10::intrusive_ptr<SentencePiece> &self) -> torch::Tensor {
-            auto *data =
-                static_cast<void *>(const_cast<char *>(self->content_.data()));
-            auto numel = static_cast<int64_t>(self->content_.size());
-            return torch::from_blob(data, {numel}, {torch::kUInt8}).clone();
-          },
-          // __setstate__
-          [](torch::Tensor state) -> c10::intrusive_ptr<SentencePiece> {
-            auto *data = static_cast<char *>(state.data_ptr());
-            auto numel = state.size(0);
-            return c10::make_intrusive<SentencePiece>(std::string(data, numel));
-          });
-
-  m.class_<Vectors>("Vectors")
-      .def(torch::init<std::vector<std::string>, std::vector<std::int64_t>,
-                       torch::Tensor, torch::Tensor>())
-      .def("__getitem__", &Vectors::__getitem__)
-      .def("lookup_vectors", &Vectors::lookup_vectors)
-      .def("__setitem__", &Vectors::__setitem__)
-      .def("__len__", &Vectors::__len__)
-      .def_pickle(
-          // __getstate__
-          [](const c10::intrusive_ptr<Vectors> &self) -> VectorsStates {
-            return _serialize_vectors(self);
-          },
-          // __setstate__
-          [](VectorsStates states) -> c10::intrusive_ptr<Vectors> {
-            return _deserialize_vectors(std::move(states));
-          });
-
-  m.class_<Vocab>("Vocab")
-      .def(torch::init<StringList, c10::optional<int64_t>>())
-      .def("__contains__",
-           [](const c10::intrusive_ptr<Vocab> &self, const std::string &item)
-               -> bool { return self->__contains__(c10::string_view{item}); })
-      .def("__getitem__",
-           [](const c10::intrusive_ptr<Vocab> &self, const std::string &item)
-               -> int64_t { return self->__getitem__(c10::string_view{item}); })
-      .def("insert_token", &Vocab::insert_token)
-      .def("__len__", &Vocab::__len__)
-      .def("set_default_index", &Vocab::set_default_index)
-      .def("get_default_index", &Vocab::get_default_index)
-      .def("append_token", &Vocab::append_token)
-      .def("lookup_token", &Vocab::lookup_token)
-      .def("lookup_tokens", &Vocab::lookup_tokens)
-      .def("lookup_indices",
-           [](const c10::intrusive_ptr<Vocab> &self,
-              const std::vector<std::string> &items) {
-             std::vector<int64_t> indices(items.size());
-             int64_t counter = 0;
-             for (const auto &item : items) {
-               indices[counter++] = self->__getitem__(c10::string_view{item});
-             }
-             return indices;
-           })
-      .def("get_stoi", &Vocab::get_stoi)
-      .def("get_itos", &Vocab::get_itos)
-      .def_pickle(
-          // __getstate__
-          [](const c10::intrusive_ptr<Vocab> &self) -> VocabStates {
-            return _serialize_vocab(self);
-          },
-          // __setstate__
-          [](VocabStates states) -> c10::intrusive_ptr<Vocab> {
-            return _deserialize_vocab(std::move(states));
-          });
-
-  m.def("torchtext::generate_sp_model", &generate_sp_model);
-  m.def("torchtext::load_sp_model", &load_sp_model);
-  m.def("torchtext::load_sp_model_string", &load_sp_model_string);
->>>>>>> 1c8cd01e (more std::move):torchtext/csrc/register_bindings.cpp
 }
 
 } // namespace torchtext

From 35fe71692f5ad36aa389b2faff6258f978db1f44 Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Fri, 16 Jul 2021 12:42:09 +0800
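Subject: [PATCH 15/15] fix rebase

Remove the definition of
_build_vocab_from_text_file_using_python_tokenizer from
torchtext/csrc/vocab.cpp.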

---
 torchtext/csrc/vocab.cpp | 42 ----------------------------------------
 1 file changed, 42 deletions(-)

diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp
index ca883f7545..6f7abc8db4 100644
--- a/torchtext/csrc/vocab.cpp
+++ b/torchtext/csrc/vocab.cpp
@@ -335,48 +335,6 @@ Vocab _build_vocab_from_text_file(const std::string &file_path,
   return Vocab(std::move(tokens));
 }
 
-Vocab _build_vocab_from_text_file_using_python_tokenizer(
-    const std::string &file_path, const int64_t min_freq,
-    py::object tokenizer) {
-  // find number of lines
-  int64_t num_lines = _infer_lines(file_path);
-  // Read text from file and add tokens
-  std::ifstream fin(file_path, std::ios::in);
-  TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
-
-  IndexDict counter;
-  std::string line;
-  for (int64_t i = 0; i < num_lines; i++) {
-    std::getline(fin, line);
-    std::vector<std::string> token_list =
-        tokenizer(line).cast<std::vector<std::string>>();
-
-    for (auto &token : token_list) {
-      counter[std::move(token)] += 1;
-    }
-  }
-
-  // create tokens-frequency pairs
-  std::vector<std::pair<std::string, int64_t>> token_freq_pairs;
-  for (const auto &item : counter) {
-    if (item.second >= min_freq) {
-      token_freq_pairs.push_back(item);
-    }
-  }
-
-  // sort tokens by frequency
-  CompareTokens compare_tokens;
-  std::sort(token_freq_pairs.begin(), token_freq_pairs.end(), compare_tokens);
-
-  // Create final list of tokens
-  StringList tokens;
-  for (auto &token_freq_pair : token_freq_pairs) {
-    tokens.emplace_back(std::move(token_freq_pair.first));
-  }
-
-  return Vocab(std::move(tokens));
-}
-
 VocabStates _serialize_vocab(const c10::intrusive_ptr<Vocab> &self) {
   std::vector<int64_t> integers;
   StringList strings = self->itos_;