Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.
2 changes: 1 addition & 1 deletion torchtext/csrc/sentencepiece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path) {
}

// Builds a SentencePiece processor from the serialized model proto held in
// `content`. The string is taken by value (sink parameter) so callers passing
// an rvalue incur a move, not a copy; the buffer is then moved into the
// SentencePiece instance.
c10::intrusive_ptr<SentencePiece>
load_sp_model_string(std::string content) {
  return c10::make_intrusive<SentencePiece>(std::move(content));
}

Expand Down
2 changes: 1 addition & 1 deletion torchtext/csrc/sentencepiece.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ void generate_sp_model(const std::string &filename, const int64_t &vocab_size,
const std::string &model_prefix);
c10::intrusive_ptr<SentencePiece> load_sp_model(const std::string &path);
c10::intrusive_ptr<SentencePiece>
load_sp_model_string(const std::string &content);
load_sp_model_string(std::string content);

} // namespace torchtext
16 changes: 8 additions & 8 deletions torchtext/csrc/vectors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@

namespace torchtext {

// Constructs Vectors from a prebuilt token->index map, the embedding matrix,
// and the fallback tensor used for unknown tokens. The tensors are taken by
// value and moved into the members, avoiding refcount/copy churn on rvalues.
Vectors::Vectors(const IndexMap &stoi, torch::Tensor vectors,
                 torch::Tensor unk_tensor)
    : stoi_(stoi), vectors_(std::move(vectors)),
      unk_tensor_(std::move(unk_tensor)) {}

Vectors::Vectors(const std::vector<std::string> &tokens,
const std::vector<std::int64_t> &indices,
const torch::Tensor &vectors, const torch::Tensor &unk_tensor)
torch::Tensor vectors, torch::Tensor unk_tensor)
: vectors_(std::move(vectors)), unk_tensor_(std::move(unk_tensor)) {
// guarding against size mismatch of tokens and indices
if (static_cast<int>(tokens.size()) != indices.size()) {
if (tokens.size() != indices.size()) {
#ifdef _MSC_VER
std::cerr << "[RuntimeError] Mismatching sizes for tokens and indices. "
"Size of tokens: "
Expand Down Expand Up @@ -72,7 +72,7 @@ torch::Tensor Vectors::__getitem__(const std::string &token) {
// Looks up the embedding for each token and stacks the results into a single
// (num_tokens x dim) tensor. Unknown-token handling is delegated to
// __getitem__. Fix: the span contained both the old push_back and the new
// emplace_back line, which would have appended every vector twice — exactly
// one append per token is kept. reserve() avoids repeated reallocation.
torch::Tensor Vectors::lookup_vectors(const std::vector<std::string> &tokens) {
  std::vector<torch::Tensor> vectors;
  vectors.reserve(tokens.size());
  for (const std::string &token : tokens) {
    vectors.emplace_back(__getitem__(token));
  }
  return torch::stack(vectors, 0);
}
Expand Down Expand Up @@ -206,7 +206,7 @@ _concat_vectors(std::vector<std::shared_ptr<StringList>> chunk_tokens,

// Presumably the per-task chunk size (in lines) used when splitting the
// vectors file across num_cpus workers — TODO confirm against the parallel
// loading code below.
constexpr int64_t GRAIN_SIZE = 131072;
std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
const std::string &file_path, const std::string delimiter_str,
const std::string &file_path, const std::string &delimiter_str,
int64_t num_cpus, c10::optional<torch::Tensor> opt_unk_tensor) {

TORCH_CHECK(delimiter_str.size() == 1,
Expand Down Expand Up @@ -265,7 +265,7 @@ std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(

torch::Tensor unk_tensor;
if (opt_unk_tensor) {
unk_tensor = *opt_unk_tensor;
unk_tensor = std::move(*opt_unk_tensor);
} else {
unk_tensor = torch::zeros({vector_dim}, torch::kFloat32);
}
Expand Down
10 changes: 5 additions & 5 deletions torchtext/csrc/vectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ struct Vectors : torch::CustomClassHolder {
torch::Tensor vectors_;
torch::Tensor unk_tensor_;

explicit Vectors(const IndexMap &stoi, const torch::Tensor vectors,
const torch::Tensor &unk_tensor);
explicit Vectors(const IndexMap &stoi, torch::Tensor vectors,
torch::Tensor unk_tensor);
explicit Vectors(const std::vector<std::string> &tokens,
const std::vector<std::int64_t> &indices,
const torch::Tensor &vectors,
const torch::Tensor &unk_tensor);
torch::Tensor vectors,
torch::Tensor unk_tensor);
std::unordered_map<std::string, int64_t> get_stoi();
torch::Tensor __getitem__(const std::string &token);
torch::Tensor lookup_vectors(const std::vector<std::string> &tokens);
Expand All @@ -36,7 +36,7 @@ VectorsStates _serialize_vectors(const c10::intrusive_ptr<Vectors> &self);
c10::intrusive_ptr<Vectors> _deserialize_vectors(VectorsStates states);

std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
const std::string &file_path, const std::string delimiter_str,
const std::string &file_path, const std::string &delimiter_str,
const int64_t num_cpus, c10::optional<torch::Tensor> opt_unk_tensor);

} // namespace torchtext
41 changes: 21 additions & 20 deletions torchtext/csrc/vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,20 @@
#include <vocab.h> // @manual
namespace torchtext {

// Builds a Vocab from `tokens`, assigning indices in list order. The list is
// taken by value (sink parameter) so each token can be moved into internal
// storage instead of copied. A duplicate token raises via TORCH_CHECK.
// `default_index`, if set, is stored for unknown-token lookups (lookup
// behavior itself is implemented elsewhere).
Vocab::Vocab(StringList tokens,
             const c10::optional<int64_t> &default_index)
    : stoi_(MAX_VOCAB_SIZE, -1), default_index_{default_index} {
  for (auto &token : tokens) {
    // throw error if duplicate token is found
    auto id = _find(c10::string_view{token});
    TORCH_CHECK(stoi_[id] == -1,
                "Duplicate token found in tokens list: " + token);

    // Move the token into the vocab; `token` is not read after this point.
    _add(std::move(token));
  }
}

// Delegating constructor: builds a Vocab with no default index set.
// Takes the list by value and moves it through to avoid a deep copy.
Vocab::Vocab(StringList tokens) : Vocab(std::move(tokens), {}) {}

// Number of tokens currently in the vocab (size of the index->token list).
int64_t Vocab::__len__() const { return itos_.size(); }

Expand Down Expand Up @@ -54,17 +54,17 @@ c10::optional<int64_t> Vocab::get_default_index() const {
return default_index_;
}

void Vocab::append_token(const std::string &token) {
void Vocab::append_token(std::string token) {
// throw error if token already exist in vocab
auto id = _find(c10::string_view{token.data(), token.size()});
auto id = _find(c10::string_view{token});
TORCH_CHECK(stoi_[id] == -1, "Token " + token +
" already exists in the Vocab with index: " +
std::to_string(stoi_[id]));

_add(token);
_add(std::move(token));
}

void Vocab::insert_token(const std::string &token, const int64_t &index) {
void Vocab::insert_token(std::string token, const int64_t &index) {
// throw error if index is not valid
TORCH_CHECK(index >= 0 && index <= __len__(),
"Specified index " + std::to_string(index) +
Expand All @@ -76,11 +76,11 @@ void Vocab::insert_token(const std::string &token, const int64_t &index) {

// need to offset all tokens greater than or equal index by 1
for (size_t i = index; i < __len__(); i++) {
stoi_[_find(c10::string_view{itos_[i].data(), itos_[i].size()})] = i + 1;
stoi_[_find(c10::string_view{itos_[i]})] = i + 1;
}

itos_.insert(itos_.begin() + index, token);
stoi_[_find(c10::string_view{token.data(), token.size()})] = index;
stoi_[_find(c10::string_view{token})] = index;
itos_.insert(itos_.begin() + index, std::move(token));
}

std::string Vocab::lookup_token(const int64_t &index) {
Expand Down Expand Up @@ -144,7 +144,7 @@ int64_t _infer_lines(const std::string &file_path) {

void parse_vocab_file_chunk(const std::string &file_path, size_t offset,
const int64_t start_line, const int64_t end_line,
std::shared_ptr<IndexDict> counter) {
const std::shared_ptr<IndexDict> &counter) {
std::ifstream fin(file_path, std::ios::in);
TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);

Expand All @@ -165,7 +165,7 @@ void parse_vocab_file_chunk(const std::string &file_path, size_t offset,

void parse_raw_text_file_chunk(const std::string &file_path, size_t offset,
const int64_t start_line, const int64_t end_line,
std::shared_ptr<IndexDict> counter,
const std::shared_ptr<IndexDict> &counter,
torch::jit::script::Module &module) {
std::ifstream fin(file_path, std::ios::in);
TORCH_CHECK(fin.is_open(), "Cannot open input file " + file_path);
Expand Down Expand Up @@ -225,9 +225,11 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
// create token freq pairs
std::vector<std::pair<std::string, int64_t>> token_freq_pairs;

for (std::string token : unique_tokens) {
token_freq_pairs.push_back(std::make_pair(token, tokens_freq[token]));
for (std::string &token : unique_tokens) {
auto token_freq = tokens_freq[token];
token_freq_pairs.emplace_back(std::move(token), token_freq);
}
unique_tokens.clear();

// sort tokens by freq
if (sort_tokens) {
Expand All @@ -236,9 +238,8 @@ _concat_tokens(std::vector<std::shared_ptr<IndexDict>> chunk_counters,
}

// update unique tokens with correct order
unique_tokens.clear();
for (const auto &token_freq_pair : token_freq_pairs) {
unique_tokens.push_back(token_freq_pair.first);
for (auto &token_freq_pair : token_freq_pairs) {
unique_tokens.emplace_back(std::move(token_freq_pair.first));
}

return unique_tokens;
Expand Down
14 changes: 7 additions & 7 deletions torchtext/csrc/vocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ int64_t _infer_lines(const std::string &file_path);

struct Vocab : torch::CustomClassHolder {
static const int32_t MAX_VOCAB_SIZE = 30000000;
int64_t unk_index_;
int64_t unk_index_{};
std::vector<int32_t> stoi_;
const std::string version_str_ = "0.0.2";
StringList itos_;
Expand All @@ -36,16 +36,16 @@ struct Vocab : torch::CustomClassHolder {
// TODO: [can we remove this?] we need to keep this constructor, otherwise
// torch binding gets compilation error: no matching constructor for
// initialization of 'torchtext::Vocab'
explicit Vocab(const StringList &tokens);
explicit Vocab(const StringList &tokens,
explicit Vocab(StringList tokens);
explicit Vocab(StringList tokens,
const c10::optional<int64_t> &default_index);
int64_t __len__() const;
int64_t __getitem__(const c10::string_view &token) const;
bool __contains__(const c10::string_view &token) const;
void set_default_index(c10::optional<int64_t> index);
c10::optional<int64_t> get_default_index() const;
void insert_token(const std::string &token, const int64_t &index);
void append_token(const std::string &token);
void insert_token(std::string token, const int64_t &index);
void append_token(std::string token);
std::string lookup_token(const int64_t &index);
std::vector<std::string> lookup_tokens(const std::vector<int64_t> &indices);
std::vector<int64_t>
Expand All @@ -72,10 +72,10 @@ struct Vocab : torch::CustomClassHolder {
return id;
}

void _add(const std::string &w) {
void _add(std::string w) {
uint32_t h = _find(c10::string_view{w.data(), w.size()});
if (stoi_[h] == -1) {
itos_.push_back(w);
itos_.emplace_back(std::move(w));
stoi_[h] = itos_.size() - 1;
}
}
Expand Down