From 53f6e1ae60d9dff3c9c9bf27b742211fb510a435 Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 13 Jun 2021 14:04:16 +0800 Subject: [PATCH 01/15] use std::move --- torchtext/csrc/vocab.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index c83652c014..bb9dad9f8e 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -225,9 +225,11 @@ _concat_tokens(std::vector> chunk_counters, // create token freq pairs std::vector> token_freq_pairs; - for (std::string token : unique_tokens) { - token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token])); + for (std::string &token : unique_tokens) { + auto token_freq = tokens_freq[token]; + token_freq_pairs.emplace_back(std::move(token), token_freq); } + unique_tokens.clear(); // sort tokens by freq if (sort_tokens) { @@ -236,9 +238,8 @@ _concat_tokens(std::vector> chunk_counters, } // update unique tokens with correct order - unique_tokens.clear(); - for (const auto &token_freq_pair : token_freq_pairs) { - unique_tokens.push_back(token_freq_pair.first); + for (auto &token_freq_pair : token_freq_pairs) { + unique_tokens.emplace_back(std::move(token_freq_pair.first)); } return unique_tokens; From c30062e1c5b7b34b420368a2ebf0d49d3d9bd9c9 Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 13 Jun 2021 14:09:21 +0800 Subject: [PATCH 02/15] use const ref --- torchtext/csrc/vectors.cpp | 2 +- torchtext/csrc/vectors.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index c5410b1496..b7554d3926 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -15,7 +15,7 @@ namespace torchtext { -Vectors::Vectors(const IndexMap &stoi, const torch::Tensor vectors, +Vectors::Vectors(const IndexMap &stoi, const torch::Tensor &vectors, const torch::Tensor &unk_tensor) : stoi_(stoi), vectors_(vectors), unk_tensor_(unk_tensor) {} diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h index 09716258ca..0f17314eaa 100644 --- a/torchtext/csrc/vectors.h +++ b/torchtext/csrc/vectors.h @@ -19,7 +19,7 @@ struct Vectors : torch::CustomClassHolder { torch::Tensor vectors_; torch::Tensor unk_tensor_; - explicit Vectors(const IndexMap &stoi, const torch::Tensor vectors, + explicit Vectors(const IndexMap &stoi, const torch::Tensor &vectors, const torch::Tensor &unk_tensor); explicit Vectors(const std::vector &tokens, const std::vector &indices, From 27db28c79d8c44902081ed613943194c829d8beb Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 13 Jun 2021 14:09:31 +0800 Subject: [PATCH 03/15] init member --- torchtext/csrc/vocab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/csrc/vocab.h b/torchtext/csrc/vocab.h index 66f04aa1a0..804d8cd2f1 100644 --- a/torchtext/csrc/vocab.h +++ b/torchtext/csrc/vocab.h @@ -27,7 +27,7 @@ int64_t _infer_lines(const std::string &file_path); struct Vocab : torch::CustomClassHolder { static const int32_t MAX_VOCAB_SIZE = 30000000; - int64_t unk_index_; + int64_t unk_index_{}; std::vector stoi_; const std::string version_str_ = "0.0.2"; StringList itos_; From bad4a8334eea86f117bf3025269aefbc91bbfe22 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 15:44:46 +0800 Subject: [PATCH 04/15] add std::move --- torchtext/csrc/vocab.cpp | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index bb9dad9f8e..11776a3358 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -335,6 +335,54 @@ Vocab _build_vocab_from_text_file(const std::string &file_path, return Vocab(std::move(tokens)); } +Vocab _build_vocab_from_text_file_using_python_tokenizer( + const std::string &file_path, const int64_t min_freq, + py::object tokenizer) { + // find number of lines + int64_t num_lines = _infer_lines(file_path); + // Read text from file and add tokens + std::ifstream fin(file_path, std::ios::in); + TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path); + + IndexDict counter; + std::string line; + for (int64_t i = 0; i < num_lines; i++) { + std::getline(fin, line); + std::vector token_list = + tokenizer(line).cast>(); + + for (size_t i = 0; i < token_list.size(); i++) { + std::string token = token_list[i]; + + if (counter.find(token) == counter.end()) { + counter[token] = 1; + } else { + counter[token] += 1; + } + } + } + + // create tokens-frequency pairs + std::vector> token_freq_pairs; + for (const auto &item : counter) { + if (item.second >= min_freq) { + token_freq_pairs.push_back(item); + } + } + + // sort tokens by frequency + CompareTokens compare_tokens; + std::sort(token_freq_pairs.begin(), token_freq_pairs.end(), compare_tokens); + + // Create final list of tokens + StringList tokens; + for (auto &token_freq_pair : token_freq_pairs) { + tokens.emplace_back(std::move(token_freq_pair.first)); + } + + return Vocab(std::move(tokens)); +} + VocabStates _serialize_vocab(const c10::intrusive_ptr &self) { std::vector integers; StringList strings = self->itos_; From 912a7446c7b9023f9674dcd0766ac4fa0312f5ac Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 16:41:30 +0800 Subject: [PATCH 05/15] more std::move --- torchtext/csrc/vocab.cpp | 20 ++++++++++---------- torchtext/csrc/vocab.h | 12 ++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index 11776a3358..f5e89e2f01 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -7,20 +7,20 @@ #include // @manual namespace torchtext { -Vocab::Vocab(const StringList &tokens, +Vocab::Vocab(StringList tokens, const c10::optional &default_index) : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} { - for (size_t i = 0; i < tokens.size(); i++) { + for (auto &token : tokens) { // throw error if duplicate token is found - auto id = _find(c10::string_view{tokens[i].data(), tokens[i].size()}); + auto id = _find(c10::string_view{token.data(), token.size()}); TORCH_CHECK(stoi_[id] == -1, - "Duplicate token found in tokens list: " + tokens[i]); + "Duplicate token found in tokens list: " + token); - _add(tokens[i]); + _add(std::move(token)); } } -Vocab::Vocab(const StringList &tokens) : Vocab(tokens, {}) {} +Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {} int64_t Vocab::__len__() const { return itos_.size(); } @@ -54,17 +54,17 @@ c10::optional Vocab::get_default_index() const { return default_index_; } -void Vocab::append_token(const std::string &token) { +void Vocab::append_token(std::string token) { // throw error if token already exist in vocab auto id = _find(c10::string_view{token.data(), token.size()}); TORCH_CHECK(stoi_[id] == -1, "Token " + token + " already exists in the Vocab with index: " + std::to_string(stoi_[id])); - _add(token); + _add(std::move(token)); } -void Vocab::insert_token(const std::string &token, const int64_t &index) { +void Vocab::insert_token(std::string token, const int64_t &index) { // throw error if index is not valid TORCH_CHECK(index >= 0 && index <= __len__(), "Specified index " + std::to_string(index) + @@ -79,8 +79,8 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) { stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1; } - itos_.insert(itos_.begin() + index, token); stoi_[_find(c10::string_view{token.data(), token.size()})] = index; + itos_.insert(itos_.begin() + index, std::move(token)); } std::string Vocab::lookup_token(const int64_t &index) { diff --git a/torchtext/csrc/vocab.h b/torchtext/csrc/vocab.h index 804d8cd2f1..50ac492a63 100644 --- a/torchtext/csrc/vocab.h +++ b/torchtext/csrc/vocab.h @@ -36,16 +36,16 @@ struct Vocab : torch::CustomClassHolder { // TODO: [can we remove this?] we need to keep this constructor, otherwise // torch binding gets compilation error: no matching constructor for // initialization of 'torchtext::Vocab' - explicit Vocab(const StringList &tokens); - explicit Vocab(const StringList &tokens, + explicit Vocab(StringList tokens); + explicit Vocab(StringList tokens, const c10::optional &default_index); int64_t __len__() const; int64_t __getitem__(const c10::string_view &token) const; bool __contains__(const c10::string_view &token) const; void set_default_index(c10::optional index); c10::optional get_default_index() const; - void insert_token(const std::string &token, const int64_t &index); - void append_token(const std::string &token); + void insert_token(std::string token, const int64_t &index); + void append_token(std::string token); std::string lookup_token(const int64_t &index); std::vector lookup_tokens(const std::vector &indices); std::vector @@ -72,10 +72,10 @@ struct Vocab : torch::CustomClassHolder { return id; } - void _add(const std::string &w) { + void _add(std::string w) { uint32_t h = _find(c10::string_view{w.data(), w.size()}); if (stoi_[h] == -1) { - itos_.push_back(w); + itos_.emplace_back(std::move(w)); stoi_[h] = itos_.size() - 1; } } From 5993d1367d299a799e5efa5be0f4e0c3e37a1256 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 16:48:29 +0800 Subject: [PATCH 06/15] use the std::string constructor of c10::string_view and more std::move --- torchtext/csrc/vocab.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index f5e89e2f01..70eef0dc14 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -12,7 +12,7 @@ Vocab::Vocab(StringList tokens, : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} { for (auto &token : tokens) { // throw error if duplicate token is found - auto id = _find(c10::string_view{token.data(), token.size()}); + auto id = _find(c10::string_view{token}); TORCH_CHECK(stoi_[id] == -1, "Duplicate token found in tokens list: " + token); @@ -56,7 +56,7 @@ c10::optional Vocab::get_default_index() const { void Vocab::append_token(std::string token) { // throw error if token already exist in vocab - auto id = _find(c10::string_view{token.data(), token.size()}); + auto id = _find(c10::string_view{token}); TORCH_CHECK(stoi_[id] == -1, "Token " + token + " already exists in the Vocab with index: " + std::to_string(stoi_[id])); @@ -76,10 +76,10 @@ void Vocab::insert_token(std::string token, const int64_t &index) { // need to offset all tokens greater than or equal index by 1 for (size_t i = index; i < __len__(); i++) { - stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1; + stoi_[_find(c10::string_view{itos_[i]})] = i + 1; } - stoi_[_find(c10::string_view{token.data(), token.size()})] = index; + stoi_[_find(c10::string_view{token})] = index; itos_.insert(itos_.begin() + index, std::move(token)); } @@ -351,14 +351,8 @@ Vocab _build_vocab_from_text_file_using_python_tokenizer( std::vector token_list = tokenizer(line).cast>(); - for (size_t i = 0; i < token_list.size(); i++) { - std::string token = token_list[i]; - - if (counter.find(token) == counter.end()) { - counter[token] = 1; - } else { - counter[token] += 1; - } + for (auto &token : token_list) { + counter[std::move(token)] += 1; } } From 8404f2b0b39ce516c0950fa191bdd7c252fd1b8c Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 19:04:53 +0800 Subject: [PATCH 07/15] more std::move --- torchtext/csrc/vectors.cpp | 8 ++++---- torchtext/csrc/vectors.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index b7554d3926..fd5a11e915 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -15,13 +15,13 @@ namespace torchtext { -Vectors::Vectors(const IndexMap &stoi, const torch::Tensor &vectors, - const torch::Tensor &unk_tensor) - : stoi_(stoi), vectors_(vectors), unk_tensor_(unk_tensor) {} +Vectors::Vectors(const IndexMap &stoi, torch::Tensor vectors, + torch::Tensor unk_tensor) + : stoi_(stoi), vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) {} Vectors::Vectors(const std::vector &tokens, const std::vector &indices, - const torch::Tensor &vectors, const torch::Tensor &unk_tensor) + torch::Tensor vectors, torch::Tensor unk_tensor) : vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) { // guarding against size mismatch of tokens and indices if (static_cast(tokens.size()) != indices.size()) { diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h index 0f17314eaa..10ca758d40 100644 --- a/torchtext/csrc/vectors.h +++ b/torchtext/csrc/vectors.h @@ -19,12 +19,12 @@ struct Vectors : torch::CustomClassHolder { torch::Tensor vectors_; torch::Tensor unk_tensor_; - explicit Vectors(const IndexMap &stoi, const torch::Tensor &vectors, - const torch::Tensor &unk_tensor); + explicit Vectors(const IndexMap &stoi, torch::Tensor vectors, + torch::Tensor unk_tensor); explicit Vectors(const std::vector &tokens, const std::vector &indices, - const torch::Tensor &vectors, - const torch::Tensor &unk_tensor); + torch::Tensor vectors, + torch::Tensor unk_tensor); std::unordered_map get_stoi(); torch::Tensor __getitem__(const std::string &token); torch::Tensor lookup_vectors(const std::vector &tokens); From 4159ebe3d6e95511c75fa9f48f6e94895f10f559 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 20:35:37 +0800 Subject: [PATCH 08/15] more std::move --- torchtext/csrc/sentencepiece.cpp | 2 +- torchtext/csrc/sentencepiece.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/csrc/sentencepiece.cpp b/torchtext/csrc/sentencepiece.cpp index 1fc5e370b2..0099d827d9 100644 --- a/torchtext/csrc/sentencepiece.cpp +++ b/torchtext/csrc/sentencepiece.cpp @@ -75,7 +75,7 @@ c10::intrusive_ptr load_sp_model(const std::string &path) { } c10::intrusive_ptr -load_sp_model_string(const std::string &content) { +load_sp_model_string(std::string content) { return c10::make_intrusive(std::move(content)); } diff --git a/torchtext/csrc/sentencepiece.h b/torchtext/csrc/sentencepiece.h index 3b26d90ffa..dfd997f52f 100644 --- a/torchtext/csrc/sentencepiece.h +++ b/torchtext/csrc/sentencepiece.h @@ -33,6 +33,6 @@ void generate_sp_model(const std::string &filename, const int64_t &vocab_size, const std::string &model_prefix); c10::intrusive_ptr load_sp_model(const std::string &path); c10::intrusive_ptr -load_sp_model_string(const std::string &content); +load_sp_model_string(std::string content); } // namespace torchtext From 714f8c0667cf676318efda8032dc4427fbd3eb0d Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 20:57:54 +0800 Subject: [PATCH 09/15] use const reference --- torchtext/csrc/vectors.cpp | 2 +- torchtext/csrc/vectors.h | 2 +- torchtext/csrc/vocab.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index fd5a11e915..c9353782c9 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -206,7 +206,7 @@ _concat_vectors(std::vector> chunk_tokens, constexpr int64_t GRAIN_SIZE = 131072; std::tuple> _load_token_and_vectors_from_file( - const std::string &file_path, const std::string delimiter_str, + const std::string &file_path, const std::string &delimiter_str, int64_t num_cpus, c10::optional opt_unk_tensor) { TORCH_CHECK(delimiter_str.size() == 1, diff --git a/torchtext/csrc/vectors.h b/torchtext/csrc/vectors.h index 10ca758d40..abe69f5fe6 100644 --- a/torchtext/csrc/vectors.h +++ b/torchtext/csrc/vectors.h @@ -36,7 +36,7 @@ VectorsStates _serialize_vectors(const c10::intrusive_ptr &self); c10::intrusive_ptr _deserialize_vectors(VectorsStates states); std::tuple> _load_token_and_vectors_from_file( - const std::string &file_path, const std::string delimiter_str, + const std::string &file_path, const std::string &delimiter_str, const int64_t num_cpus, c10::optional opt_unk_tensor); } // namespace torchtext diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index 70eef0dc14..ca883f7545 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) { void parse_vocab_file_chunk(const std::string &file_path, size_t offset, const int64_t start_line, const int64_t end_line, - std::shared_ptr counter) { + const std::shared_ptr &counter) { std::ifstream fin(file_path, std::ios::in); TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path); @@ -165,7 +165,7 @@ void parse_vocab_file_chunk(const std::string &file_path, size_t offset, void parse_raw_text_file_chunk(const std::string &file_path, size_t offset, const int64_t start_line, const int64_t end_line, - std::shared_ptr counter, + const std::shared_ptr &counter, torch::jit::script::Module &module) { std::ifstream fin(file_path, std::ios::in); TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path); From ded11da66ee1977f94a17b0778a91b54c537f5ec Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 21:08:11 +0800 Subject: [PATCH 10/15] more std::move --- torchtext/csrc/register_pybindings.cpp | 125 ++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 2 deletions(-) diff --git a/torchtext/csrc/register_pybindings.cpp b/torchtext/csrc/register_pybindings.cpp index 80d91ee744..8277ba8565 100644 --- a/torchtext/csrc/register_pybindings.cpp +++ b/torchtext/csrc/register_pybindings.cpp @@ -100,7 +100,7 @@ PYBIND11_MODULE(_torchtext, m) { }, // __setstate__ [](VectorsStates states) -> c10::intrusive_ptr { - return _deserialize_vectors(states); + return _deserialize_vectors(std::move(states)); })); py::class_>(m, "Vocab") @@ -149,7 +149,7 @@ PYBIND11_MODULE(_torchtext, m) { }, // __setstate__ [](VocabStates states) -> c10::intrusive_ptr { - return _deserialize_vocab(states); + return _deserialize_vocab(std::move(states)); })); // Functions @@ -159,6 +159,127 @@ PYBIND11_MODULE(_torchtext, m) { m.def("_build_vocab_from_text_file", &build_vocab_from_text_file); m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer); + m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer); +} + +TORCH_LIBRARY_FRAGMENT(torchtext, m) { + m.class_("Regex") + .def(torch::init()) + .def("Sub", &Regex::Sub) + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr &self) -> std::string { + return _serialize_regex(self); + }, + // __setstate__ + [](std::string state) -> c10::intrusive_ptr { + return _deserialize_regex(std::move(state)); + }); + + m.class_("RegexTokenizer") + .def(torch::init, std::vector, + bool>()) + .def("forward", &RegexTokenizer::forward) + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr &self) + -> RegexTokenizerStates { + return _serialize_regex_tokenizer(self); + }, + // __setstate__ + [](RegexTokenizerStates states) + -> c10::intrusive_ptr { + return _deserialize_regex_tokenizer(std::move(states)); + }); + + m.class_("SentencePiece") + .def(torch::init()) + .def("Encode", &SentencePiece::Encode) + .def("EncodeAsIds", &SentencePiece::EncodeAsIds) + .def("DecodeIds", &SentencePiece::DecodeIds) + .def("EncodeAsPieces", &SentencePiece::EncodeAsPieces) + .def("DecodePieces", &SentencePiece::DecodePieces) + .def("GetPieceSize", &SentencePiece::GetPieceSize) + .def("unk_id", &SentencePiece::unk_id) + .def("PieceToId", &SentencePiece::PieceToId) + .def("IdToPiece", &SentencePiece::IdToPiece) + .def_pickle( + // The underlying content of SentencePiece contains byte string, + // and returing it as std::string cause UTF8 decoding error. + // Since TorchScript does not support byte string, we use byte Tensor + // to pass around the data. + // __getstate__ + [](const c10::intrusive_ptr &self) -> torch::Tensor { + auto *data = + static_cast(const_cast(self->content_.data())); + auto numel = static_cast(self->content_.size()); + return torch::from_blob(data, {numel}, {torch::kUInt8}).clone(); + }, + // __setstate__ + [](torch::Tensor state) -> c10::intrusive_ptr { + auto *data = static_cast(state.data_ptr()); + auto numel = state.size(0); + return c10::make_intrusive(std::string(data, numel)); + }); + + m.class_("Vectors") + .def(torch::init, std::vector, + torch::Tensor, torch::Tensor>()) + .def("__getitem__", &Vectors::__getitem__) + .def("lookup_vectors", &Vectors::lookup_vectors) + .def("__setitem__", &Vectors::__setitem__) + .def("__len__", &Vectors::__len__) + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr &self) -> VectorsStates { + return _serialize_vectors(self); + }, + // __setstate__ + [](VectorsStates states) -> c10::intrusive_ptr { + return _deserialize_vectors(std::move(states)); + }); + + m.class_("Vocab") + .def(torch::init>()) + .def("__contains__", + [](const c10::intrusive_ptr &self, const std::string &item) + -> bool { return self->__contains__(c10::string_view{item}); }) + .def("__getitem__", + [](const c10::intrusive_ptr &self, const std::string &item) + -> int64_t { return self->__getitem__(c10::string_view{item}); }) + .def("insert_token", &Vocab::insert_token) + .def("__len__", &Vocab::__len__) + .def("set_default_index", &Vocab::set_default_index) + .def("get_default_index", &Vocab::get_default_index) + .def("append_token", &Vocab::append_token) + .def("lookup_token", &Vocab::lookup_token) + .def("lookup_tokens", &Vocab::lookup_tokens) + .def("lookup_indices", + [](const c10::intrusive_ptr &self, + const std::vector &items) { + std::vector indices(items.size()); + int64_t counter = 0; + for (const auto &item : items) { + indices[counter++] = self->__getitem__(c10::string_view{item}); + } + return indices; + }) + .def("get_stoi", &Vocab::get_stoi) + .def("get_itos", &Vocab::get_itos) + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr &self) -> VocabStates { + return _serialize_vocab(self); + }, + // __setstate__ + [](VocabStates states) -> c10::intrusive_ptr { + return _deserialize_vocab(std::move(states)); + }); + + m.def("torchtext::generate_sp_model", &generate_sp_model); + m.def("torchtext::load_sp_model", &load_sp_model); + m.def("torchtext::load_sp_model_string", &load_sp_model_string); +>>>>>>> 1c8cd01e (more std::move):torchtext/csrc/register_bindings.cpp } } // namespace torchtext From 518b87b67b9e10b81e54ddf687c6051f06f9ee69 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 21:17:04 +0800 Subject: [PATCH 11/15] unnecessary type convertion --- torchtext/csrc/vectors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index c9353782c9..63aaafaf2d 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -24,7 +24,7 @@ Vectors::Vectors(const std::vector &tokens, torch::Tensor vectors, torch::Tensor unk_tensor) : vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) { // guarding against size mismatch of tokens and indices - if (static_cast(tokens.size()) != indices.size()) { + if (tokens.size() != indices.size()) { #ifdef _MSC_VER std::cerr << "[RuntimeError] Mismatching sizes for tokens and indices. " "Size of tokens: " From e742660eee485a0d80b4571fdb31b9d6558a444f Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 21:21:16 +0800 Subject: [PATCH 12/15] use emplace_back --- torchtext/csrc/vectors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index 63aaafaf2d..4a3ca89d17 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -72,7 +72,7 @@ torch::Tensor Vectors::__getitem__(const std::string &token) { torch::Tensor Vectors::lookup_vectors(const std::vector &tokens) { std::vector vectors; for (const std::string &token : tokens) { - vectors.push_back(__getitem__(token)); + vectors.emplace_back(__getitem__(token)); } return torch::stack(vectors, 0); } From 0e89418ecf028339b25ccfe2ab874acf1eda129c Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 17 Jun 2021 21:23:03 +0800 Subject: [PATCH 13/15] use std::move --- torchtext/csrc/vectors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/csrc/vectors.cpp b/torchtext/csrc/vectors.cpp index 4a3ca89d17..dd187e6ac2 100644 --- a/torchtext/csrc/vectors.cpp +++ b/torchtext/csrc/vectors.cpp @@ -265,7 +265,7 @@ std::tuple> _load_token_and_vectors_from_file( torch::Tensor unk_tensor; if (opt_unk_tensor) { - unk_tensor = *opt_unk_tensor; + unk_tensor = std::move(*opt_unk_tensor); } else { unk_tensor = torch::zeros({vector_dim}, torch::kFloat32); } From 5de883ca137a2f5699f6ae1a0e21d6744b59d844 Mon Sep 17 00:00:00 2001 From: cyy Date: Fri, 16 Jul 2021 12:37:28 +0800 Subject: [PATCH 14/15] Revert "more std::move" This reverts commit ded11da66ee1977f94a17b0778a91b54c537f5ec. --- torchtext/csrc/register_pybindings.cpp | 125 +------------------------ 1 file changed, 2 insertions(+), 123 deletions(-) diff --git a/torchtext/csrc/register_pybindings.cpp b/torchtext/csrc/register_pybindings.cpp index 8277ba8565..80d91ee744 100644 --- a/torchtext/csrc/register_pybindings.cpp +++ b/torchtext/csrc/register_pybindings.cpp @@ -100,7 +100,7 @@ PYBIND11_MODULE(_torchtext, m) { }, // __setstate__ [](VectorsStates states) -> c10::intrusive_ptr { - return _deserialize_vectors(std::move(states)); + return _deserialize_vectors(states); })); py::class_>(m, "Vocab") @@ -149,7 +149,7 @@ PYBIND11_MODULE(_torchtext, m) { }, // __setstate__ [](VocabStates states) -> c10::intrusive_ptr { - return _deserialize_vocab(std::move(states)); + return _deserialize_vocab(states); })); // Functions @@ -159,127 +159,6 @@ PYBIND11_MODULE(_torchtext, m) { m.def("_build_vocab_from_text_file", &build_vocab_from_text_file); m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer); - m.def("_build_vocab_from_text_file_using_python_tokenizer", &_build_vocab_from_text_file_using_python_tokenizer); -} - -TORCH_LIBRARY_FRAGMENT(torchtext, m) { - m.class_("Regex") - .def(torch::init()) - .def("Sub", &Regex::Sub) - .def_pickle( - // __getstate__ - [](const c10::intrusive_ptr &self) -> std::string { - return _serialize_regex(self); - }, - // __setstate__ - [](std::string state) -> c10::intrusive_ptr { - return _deserialize_regex(std::move(state)); - }); - - m.class_("RegexTokenizer") - .def(torch::init, std::vector, - bool>()) - .def("forward", &RegexTokenizer::forward) - .def_pickle( - // __getstate__ - [](const c10::intrusive_ptr &self) - -> RegexTokenizerStates { - return _serialize_regex_tokenizer(self); - }, - // __setstate__ - [](RegexTokenizerStates states) - -> c10::intrusive_ptr { - return _deserialize_regex_tokenizer(std::move(states)); - }); - - m.class_("SentencePiece") - .def(torch::init()) - .def("Encode", &SentencePiece::Encode) - .def("EncodeAsIds", &SentencePiece::EncodeAsIds) - .def("DecodeIds", &SentencePiece::DecodeIds) - .def("EncodeAsPieces", &SentencePiece::EncodeAsPieces) - .def("DecodePieces", &SentencePiece::DecodePieces) - .def("GetPieceSize", &SentencePiece::GetPieceSize) - .def("unk_id", &SentencePiece::unk_id) - .def("PieceToId", &SentencePiece::PieceToId) - .def("IdToPiece", &SentencePiece::IdToPiece) - .def_pickle( - // The underlying content of SentencePiece contains byte string, - // and returing it as std::string cause UTF8 decoding error. - // Since TorchScript does not support byte string, we use byte Tensor - // to pass around the data. - // __getstate__ - [](const c10::intrusive_ptr &self) -> torch::Tensor { - auto *data = - static_cast(const_cast(self->content_.data())); - auto numel = static_cast(self->content_.size()); - return torch::from_blob(data, {numel}, {torch::kUInt8}).clone(); - }, - // __setstate__ - [](torch::Tensor state) -> c10::intrusive_ptr { - auto *data = static_cast(state.data_ptr()); - auto numel = state.size(0); - return c10::make_intrusive(std::string(data, numel)); - }); - - m.class_("Vectors") - .def(torch::init, std::vector, - torch::Tensor, torch::Tensor>()) - .def("__getitem__", &Vectors::__getitem__) - .def("lookup_vectors", &Vectors::lookup_vectors) - .def("__setitem__", &Vectors::__setitem__) - .def("__len__", &Vectors::__len__) - .def_pickle( - // __getstate__ - [](const c10::intrusive_ptr &self) -> VectorsStates { - return _serialize_vectors(self); - }, - // __setstate__ - [](VectorsStates states) -> c10::intrusive_ptr { - return _deserialize_vectors(std::move(states)); - }); - - m.class_("Vocab") - .def(torch::init>()) - .def("__contains__", - [](const c10::intrusive_ptr &self, const std::string &item) - -> bool { return self->__contains__(c10::string_view{item}); }) - .def("__getitem__", - [](const c10::intrusive_ptr &self, const std::string &item) - -> int64_t { return self->__getitem__(c10::string_view{item}); }) - .def("insert_token", &Vocab::insert_token) - .def("__len__", &Vocab::__len__) - .def("set_default_index", &Vocab::set_default_index) - .def("get_default_index", &Vocab::get_default_index) - .def("append_token", &Vocab::append_token) - .def("lookup_token", &Vocab::lookup_token) - .def("lookup_tokens", &Vocab::lookup_tokens) - .def("lookup_indices", - [](const c10::intrusive_ptr &self, - const std::vector &items) { - std::vector indices(items.size()); - int64_t counter = 0; - for (const auto &item : items) { - indices[counter++] = self->__getitem__(c10::string_view{item}); - } - return indices; - }) - .def("get_stoi", &Vocab::get_stoi) - .def("get_itos", &Vocab::get_itos) - .def_pickle( - // __getstate__ - [](const c10::intrusive_ptr &self) -> VocabStates { - return _serialize_vocab(self); - }, - // __setstate__ - [](VocabStates states) -> c10::intrusive_ptr { - return _deserialize_vocab(std::move(states)); - }); - - m.def("torchtext::generate_sp_model", &generate_sp_model); - m.def("torchtext::load_sp_model", &load_sp_model); - m.def("torchtext::load_sp_model_string", &load_sp_model_string); ->>>>>>> 1c8cd01e (more std::move):torchtext/csrc/register_bindings.cpp } } // namespace torchtext From 35fe71692f5ad36aa389b2faff6258f978db1f44 Mon Sep 17 00:00:00 2001 From: cyy Date: Fri, 16 Jul 2021 12:42:09 +0800 Subject: [PATCH 15/15] fix rebase --- torchtext/csrc/vocab.cpp | 42 ---------------------------------------- 1 file changed, 42 deletions(-) diff --git a/torchtext/csrc/vocab.cpp b/torchtext/csrc/vocab.cpp index ca883f7545..6f7abc8db4 100644 --- a/torchtext/csrc/vocab.cpp +++ b/torchtext/csrc/vocab.cpp @@ -335,48 +335,6 @@ Vocab _build_vocab_from_text_file(const std::string &file_path, return Vocab(std::move(tokens)); } -Vocab _build_vocab_from_text_file_using_python_tokenizer( - const std::string &file_path, const int64_t min_freq, - py::object tokenizer) { - // find number of lines - int64_t num_lines = _infer_lines(file_path); - // Read text from file and add tokens - std::ifstream fin(file_path, std::ios::in); - TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path); - - IndexDict counter; - std::string line; - for (int64_t i = 0; i < num_lines; i++) { - std::getline(fin, line); - std::vector token_list = - tokenizer(line).cast>(); - - for (auto &token : token_list) { - counter[std::move(token)] += 1; - } - } - - // create tokens-frequency pairs - std::vector> token_freq_pairs; - for (const auto &item : counter) { - if (item.second >= min_freq) { - token_freq_pairs.push_back(item); - } - } - - // sort tokens by frequency - CompareTokens compare_tokens; - std::sort(token_freq_pairs.begin(), token_freq_pairs.end(), compare_tokens); - - // Create final list of tokens - StringList tokens; - for (auto &token_freq_pair : token_freq_pairs) { - tokens.emplace_back(std::move(token_freq_pair.first)); - } - - return Vocab(std::move(tokens)); -} - VocabStates _serialize_vocab(const c10::intrusive_ptr &self) { std::vector integers; StringList strings = self->itos_;