
Commit cf94d30

Add libtorchtext cpp example (#1817)

* First attempt at adding examples
* Working tokenizer example
* Fixes to readme
* Formatting fixes
* Added instructions to download artifacts
* Resolve PR comments

1 parent e1c7bc6 commit cf94d30

File tree

7 files changed: +148 -0 lines changed

examples/libtorchtext/.gitignore

Lines changed: 2 additions & 0 deletions

```
build
**/*.pt
```
examples/libtorchtext/CMakeLists.txt

Lines changed: 11 additions & 0 deletions

```cmake
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(libtorchtext_cpp_example)

SET(BUILD_TORCHTEXT_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding")

find_package(Torch REQUIRED)
message("libtorchtext CMakeLists: ${TORCH_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_subdirectory(../.. libtorchtext)
add_subdirectory(tokenizer)
```

examples/libtorchtext/README.md

Lines changed: 22 additions & 0 deletions

# Libtorchtext Examples

- [Tokenizer](./tokenizer)

## Build

The example applications in this directory depend on `libtorch` and `libtorchtext`. If you have a working `PyTorch`
installation, you already have `libtorch`. Please refer to
[this tutorial](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) for the use of `libtorch` and
TorchScript.

`libtorchtext` is the library of torchtext's C++ components, without the Python components. It is not currently
distributed as a prebuilt binary, so it is built alongside the example applications.

To build `libtorchtext` and the example applications, run the following commands:

```bash
chmod +x build.sh  # give the script execute permission
./build.sh
```

For the usage of each application, refer to the corresponding application directory.
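The core `libtorch` pattern these examples rely on is loading a TorchScript archive and invoking it from C++. As a
minimal, hedged sketch (not part of this commit; `tokenizer.pt` is an assumed path, produced as in the tokenizer
example below):

```cpp
#include <torch/script.h>

#include <iostream>

int main() {
  // Load a serialized TorchScript module from disk.
  torch::jit::script::Module module = torch::jit::load("tokenizer.pt");
  // Call its forward method with a single string input and print the result.
  torch::jit::IValue out = module.forward({torch::jit::IValue("hello world")});
  std::cout << out << std::endl;
  return 0;
}
```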

examples/libtorchtext/build.sh

Lines changed: 18 additions & 0 deletions

```bash
#!/usr/bin/env bash

set -eux

# Resolve the directory containing this script and build under ./build.
this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
build_dir="${this_dir}/build"

mkdir -p "${build_dir}"
cd "${build_dir}"

# Fetch third-party submodules, then configure against the libtorch that
# ships with the installed PyTorch package.
git submodule update
cmake \
    -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
    -DRE2_BUILD_TESTING:BOOL=OFF \
    -DBUILD_TESTING:BOOL=OFF \
    -DSPM_ENABLE_SHARED=OFF \
    ..
cmake --build .
```
examples/libtorchtext/tokenizer/README.md

Lines changed: 42 additions & 0 deletions

# Tokenizer

This example demonstrates how you can use torchtext's `GPT2BPETokenizer` in a C++ environment.

## Steps

### 1. Download necessary artifacts

First we download the `gpt2_bpe_vocab.bpe` and `gpt2_bpe_encoder.json` artifacts, both of which are needed to construct
the `GPT2BPETokenizer` object.

```bash
curl -O https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe
curl -O https://download.pytorch.org/models/text/gpt2_bpe_encoder.json
```

### 2. Create tokenizer TorchScript file

Next we create our tokenizer object and save it as a TorchScript object. We also print the tokenizer's output on a
sample sentence and verify that it is the same before and after saving and re-loading the tokenizer. In the next steps
we load and execute the tokenizer in our C++ application. The C++ code can be found in [`main.cpp`](./main.cpp).

```bash
tokenizer_file="tokenizer.pt"
python create_tokenizer.py --tokenizer-file "${tokenizer_file}"
```

### 3. Build the application

Please refer to [the top-level README.md](../README.md).

### 4. Run the application

Now we run the C++ application `tokenizer` with the TorchScript object we created in Step 2. The tokenizer is run with
the following sentence as input, and we verify that the output is the same as that of Step 2.

From [the top-level directory](../), run:

```bash
./build/tokenizer/tokenize "tokenizer/${tokenizer_file}"
```
examples/libtorchtext/tokenizer/create_tokenizer.py

Lines changed: 29 additions & 0 deletions

```python
from argparse import ArgumentParser

import torch
from torchtext import transforms


def main(args):
    tokenizer_file = args.tokenizer_file
    sentence = "The green grasshopper jumped over the fence"

    # create tokenizer object
    encoder_json = "gpt2_bpe_encoder.json"
    bpe_vocab = "gpt2_bpe_vocab.bpe"
    tokenizer = transforms.GPT2BPETokenizer(encoder_json_path=encoder_json, vocab_bpe_path=bpe_vocab)

    # script and save tokenizer
    tokenizer = torch.jit.script(tokenizer)
    print(tokenizer(sentence))
    torch.jit.save(tokenizer, tokenizer_file)

    # load saved tokenizer and verify outputs match
    t = torch.jit.load(tokenizer_file)
    print(t(sentence))


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--tokenizer-file", default="tokenizer.pt", type=str)
    main(parser.parse_args())
```
examples/libtorchtext/tokenizer/main.cpp

Lines changed: 24 additions & 0 deletions

```cpp
#include <torch/nn/functional/activation.h>
#include <torch/script.h>

#include <iostream>
#include <string>
#include <vector>

int main(int argc, const char* argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: tokenize <path-to-tokenizer.pt>\n";
    return -1;
  }

  std::cout << "Loading model...\n";

  // Load the TorchScript tokenizer created by create_tokenizer.py.
  torch::jit::script::Module module;
  try {
    module = torch::jit::load(argv[1]);
  } catch (const c10::Error& e) {
    std::cerr << "Error loading the tokenizer\n";
    return -1;
  }

  torch::NoGradGuard no_grad; // ensures that autograd is off
  // Run the tokenizer on the same sample sentence used in create_tokenizer.py.
  torch::jit::IValue tokens_ivalue = module.forward(
      std::vector<c10::IValue>(1, "The green grasshopper jumped over the fence"));
  std::cout << "Result: " << tokens_ivalue << std::endl;

  return 0;
}
```
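The tokenizer's `forward` returns its token list wrapped in a generic `IValue`. A hedged sketch of unpacking that into
a `std::vector<std::string>` (assuming, as with the Python transform, that the output is a TorchScript `List[str]`;
`to_tokens` is a hypothetical helper, not part of this commit):

```cpp
#include <torch/script.h>

#include <string>
#include <vector>

// Hypothetical helper: unpack an IValue holding a TorchScript List[str]
// into a std::vector<std::string>.
std::vector<std::string> to_tokens(const torch::jit::IValue& ivalue) {
  std::vector<std::string> tokens;
  for (const auto& item : ivalue.toListRef()) {
    tokens.push_back(item.toStringRef());
  }
  return tokens;
}
```

With such a helper, the `Result:` line above could print individual tokens instead of the raw `IValue`.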
