@@ -136,231 +136,6 @@ StringList Vocab::get_itos() const {
136136 return itos_;
137137}
138138
139- int64_t _infer_lines (const std::string& file_path) {
140- int64_t num_lines = 0 ;
141- std::ifstream fin;
142- fin.open (file_path, std::ios::in);
143- TORCH_CHECK (fin.is_open (), " Cannot open input file " + file_path);
144-
145- while (fin.ignore (std::numeric_limits<std::streamsize>::max (), ' \n ' )) {
146- num_lines++;
147- }
148- return num_lines;
149- }
150-
151- void parse_vocab_file_chunk (
152- const std::string& file_path,
153- size_t offset,
154- const int64_t start_line,
155- const int64_t end_line,
156- const std::shared_ptr<IndexDict>& counter) {
157- std::ifstream fin (file_path, std::ios::in);
158- TORCH_CHECK (fin.is_open (), " Cannot open input file " + file_path);
159-
160- fin.seekg (offset);
161-
162- for (int64_t i = start_line; i < end_line; i++) {
163- std::string token;
164- fin >> token;
165- fin >> std::ws;
166-
167- if ((*counter).find (token) == (*counter).end ()) {
168- (*counter)[token] = 1 ;
169- } else {
170- (*counter)[token] += 1 ;
171- }
172- }
173- }
174-
175- void parse_raw_text_file_chunk (
176- const std::string& file_path,
177- size_t offset,
178- const int64_t start_line,
179- const int64_t end_line,
180- const std::shared_ptr<IndexDict>& counter,
181- torch::jit::script::Module& module ) {
182- std::ifstream fin (file_path, std::ios::in);
183- TORCH_CHECK (fin.is_open (), " Cannot open input file " + file_path);
184-
185- fin.seekg (offset);
186-
187- std::string line;
188- for (int64_t i = start_line; i < end_line; i++) {
189- std::getline (fin, line);
190- auto token_list =
191- module .forward (std::vector<c10::IValue>({c10::IValue (line)})).toList ();
192-
193- for (size_t i = 0 ; i < token_list.size (); i++) {
194- c10::IValue token_ref = token_list.get (i);
195- std::string token = token_ref.toStringRef ();
196-
197- if ((*counter).find (token) == (*counter).end ()) {
198- (*counter)[token] = 1 ;
199- } else {
200- (*counter)[token] += 1 ;
201- }
202- }
203- }
204- }
205-
206- StringList _concat_tokens (
207- std::vector<std::shared_ptr<IndexDict>> chunk_counters,
208- const int64_t min_freq,
209- const int64_t num_lines,
210- const bool sort_tokens) {
211- TORCH_CHECK (
212- chunk_counters.size () > 0 ,
213- " There must be at least 1 chunk to concatenate!" );
214-
215- IndexDict tokens_freq;
216- StringList unique_tokens;
217- unique_tokens.reserve (num_lines);
218-
219- // concatenate all counters
220- for (size_t i = 0 ; i < chunk_counters.size (); i++) {
221- auto & cur_counter = *chunk_counters[i];
222- for (const auto & item : cur_counter) {
223- int64_t cur_token_freq = item.second ;
224- if (tokens_freq.find (item.first ) != tokens_freq.end ()) {
225- tokens_freq[item.first ] += cur_token_freq;
226- } else {
227- tokens_freq[item.first ] = cur_token_freq;
228- }
229-
230- // add to tokens list only if all of the conditions are met:
231- // 1. token is not empty
232- // 2. we exceed min_freq for the first time
233- if (item.first .length () &&
234- tokens_freq[item.first ] - cur_token_freq < min_freq &&
235- tokens_freq[item.first ] >= min_freq) {
236- unique_tokens.push_back (item.first );
237- }
238- }
239- }
240-
241- // create token freq pairs
242- std::vector<std::pair<std::string, int64_t >> token_freq_pairs;
243-
244- for (std::string& token : unique_tokens) {
245- auto token_freq = tokens_freq[token];
246- token_freq_pairs.emplace_back (std::move (token), token_freq);
247- }
248- unique_tokens.clear ();
249-
250- // sort tokens by freq
251- if (sort_tokens) {
252- CompareTokens compare_tokens;
253- std::sort (token_freq_pairs.begin (), token_freq_pairs.end (), compare_tokens);
254- }
255-
256- // update unique tokens with correct order
257- for (auto & token_freq_pair : token_freq_pairs) {
258- unique_tokens.emplace_back (std::move (token_freq_pair.first ));
259- }
260-
261- return unique_tokens;
262- }
263-
// Minimum number of lines per worker task; launching a task on fewer
// lines than this likely costs more in overhead than it saves.
constexpr int64_t GRAIN_SIZE = 13107;

// Builds a Vocab from a file containing one token per line.
//
// The file is split into line-chunks of at least GRAIN_SIZE lines, one
// at::launch task per chunk tallies frequencies into its own private
// counter (no shared mutable state during parsing), and the counters are
// merged by _concat_tokens once all tasks finish.
//
// file_path: path to the vocab file (one token per line).
// min_freq:  minimum total frequency for a token to be kept.
// num_cpus:  target number of parallel chunks (before GRAIN_SIZE clamp).
Vocab _load_vocab_from_file(
    const std::string& file_path,
    const int64_t min_freq,
    const int64_t num_cpus) {
  int64_t num_lines = _infer_lines(file_path);
  int64_t chunk_size = impl::divup(num_lines, num_cpus);
  // Launching a thread on less lines than this likely has too much overhead.
  // TODO: Add explicit test beyond grain size to cover multithreading
  chunk_size = std::max(chunk_size, GRAIN_SIZE);

  // offsets[j] is the byte position of chunk j's first line.
  std::vector<size_t> offsets;
  impl::infer_offsets(file_path, num_lines, chunk_size, offsets);

  std::vector<std::shared_ptr<IndexDict>> chunk_counters;

  // thread_count tracks in-flight tasks; the mutex/condvar pair lets the
  // main thread block until it drops back to zero.
  std::mutex m;
  std::condition_variable cv;
  std::atomic<int> thread_count(0);

  // create threads
  int64_t j = 0;
  for (int64_t i = 0; i < num_lines; i += chunk_size) {
    auto counter_ptr = std::make_shared<IndexDict>();

    thread_count++;
    // NOTE(review): offsets, m, cv and thread_count are captured by
    // reference; they outlive the tasks only because this function blocks
    // on cv.wait below before returning.
    at::launch([&, file_path, num_lines, chunk_size, j, i, counter_ptr]() {
      parse_vocab_file_chunk(
          file_path,
          offsets[j],
          i,
          std::min(num_lines, i + chunk_size),
          counter_ptr);
      // Decrement under the lock so the waiting thread cannot miss the
      // notification between its predicate check and its sleep.
      std::lock_guard<std::mutex> lk(m);
      thread_count--;
      cv.notify_all();
    });
    chunk_counters.push_back(counter_ptr);
    j++;
  }

  // block until all threads finish execution
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&thread_count] { return thread_count == 0; });

  // sort_tokens=false: tokens keep the order in which their cumulative
  // count first reached min_freq (i.e. essentially file order).
  StringList tokens =
      _concat_tokens(chunk_counters, min_freq, num_lines, false);

  return Vocab(std::move(tokens));
}
314-
// Builds a Vocab from a raw text file, tokenizing each line with the
// given TorchScript `tokenizer` module.
//
// Same chunked fan-out/fan-in scheme as _load_vocab_from_file: one
// at::launch task per line-chunk with a private counter, then a merge.
//
// file_path: path to the raw text file.
// min_freq:  minimum total frequency for a token to be kept.
// num_cpus:  target number of parallel chunks (before GRAIN_SIZE clamp).
// tokenizer: scripted module whose forward(line) returns a token list.
Vocab _build_vocab_from_text_file(
    const std::string& file_path,
    const int64_t min_freq,
    const int64_t num_cpus,
    torch::jit::script::Module tokenizer) {
  int64_t num_lines = _infer_lines(file_path);
  int64_t chunk_size = impl::divup(num_lines, num_cpus);
  // Launching a thread on less lines than this likely has too much overhead.
  chunk_size = std::max(chunk_size, GRAIN_SIZE);

  // offsets[j] is the byte position of chunk j's first line.
  std::vector<size_t> offsets;
  impl::infer_offsets(file_path, num_lines, chunk_size, offsets);

  std::vector<std::shared_ptr<IndexDict>> chunk_counters;

  // thread_count tracks in-flight tasks; the mutex/condvar pair lets the
  // main thread block until it drops back to zero.
  std::mutex m;
  std::condition_variable cv;
  std::atomic<int> thread_count(0);

  // create threads
  int64_t j = 0;
  for (int64_t i = 0; i < num_lines; i += chunk_size) {
    auto counter_ptr = std::make_shared<IndexDict>();
    thread_count++;
    // NOTE(review): `tokenizer` (a by-value parameter) is captured by
    // reference and invoked from multiple tasks concurrently — this
    // assumes Module::forward is safe to call from several threads at
    // once; confirm against the torch::jit threading guarantees. The
    // reference stays valid because this function blocks on cv.wait
    // below before returning.
    at::launch([&, file_path, num_lines, chunk_size, j, i, counter_ptr]() {
      parse_raw_text_file_chunk(
          file_path,
          offsets[j],
          i,
          std::min(num_lines, i + chunk_size),
          counter_ptr,
          tokenizer);
      // Decrement under the lock so the waiter cannot miss the notify.
      std::lock_guard<std::mutex> lk(m);
      thread_count--;
      cv.notify_all();
    });
    chunk_counters.push_back(counter_ptr);
    j++;
  }

  // block until all threads finish execution
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&thread_count] { return thread_count == 0; });

  // sort_tokens=true: final vocab is ordered by token frequency.
  StringList tokens = _concat_tokens(chunk_counters, min_freq, num_lines, true);

  return Vocab(std::move(tokens));
}
363-
364139VocabStates _serialize_vocab (const c10::intrusive_ptr<Vocab>& self) {
365140 std::vector<int64_t > integers;
366141 StringList strings = self->itos_ ;
0 commit comments