diff --git a/examples/libtorchtext/.gitignore b/examples/libtorchtext/.gitignore
new file mode 100644
index 0000000000..85e34dcc83
--- /dev/null
+++ b/examples/libtorchtext/.gitignore
@@ -0,0 +1,2 @@
+build
+**/*.pt
diff --git a/examples/libtorchtext/CMakeLists.txt b/examples/libtorchtext/CMakeLists.txt
new file mode 100644
index 0000000000..d048fa974f
--- /dev/null
+++ b/examples/libtorchtext/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(libtorchtext_cpp_example)
+
+set(BUILD_TORCHTEXT_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding")
+
+find_package(Torch REQUIRED)
+message("libtorchtext CMakeLists: ${TORCH_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+
+add_subdirectory(../.. libtorchtext)
+add_subdirectory(tokenizer)
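The `add_subdirectory(tokenizer)` line above pulls in `tokenizer/CMakeLists.txt`, which is not included in this excerpt. Below is a minimal sketch of what such a file could contain. It assumes the executable target is named `tokenize` (matching the `./build/tokenizer/tokenize` path used in the tokenizer README further down) and that a `torchtext` library target is defined by the `add_subdirectory(../.. libtorchtext)` call in the parent scope; both names are assumptions, not confirmed by this diff.

```cmake
# Hypothetical sketch of tokenizer/CMakeLists.txt (not part of this diff).
# Assumes find_package(Torch REQUIRED) already ran in the parent scope and
# that add_subdirectory(../.. libtorchtext) defined a `torchtext` target;
# the target name is a guess.
add_executable(tokenize main.cpp)
target_link_libraries(tokenize PRIVATE "${TORCH_LIBRARIES}" torchtext)
```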
diff --git a/examples/libtorchtext/README.md b/examples/libtorchtext/README.md
new file mode 100644
index 0000000000..f08512288a
--- /dev/null
+++ b/examples/libtorchtext/README.md
@@ -0,0 +1,22 @@
+# Libtorchtext Examples
+
+- [Tokenizer](./tokenizer)
+
+## Build
+
+The example applications in this directory depend on `libtorch` and `libtorchtext`. If you have a working `PyTorch`
+installation, you already have `libtorch`. Please refer to
+[this tutorial](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) for the use of `libtorch` and
+TorchScript.
+
+`libtorchtext` is the library of torchtext's C++ components, without the Python components. It is not distributed
+separately and is built alongside the applications.
+
+To build `libtorchtext` and the example applications, run the following commands:
+
+```bash
+chmod +x build.sh # give the script execute permission
+./build.sh
+```
+
+For the usage of each application, refer to the corresponding application directory.
diff --git a/examples/libtorchtext/build.sh b/examples/libtorchtext/build.sh
new file mode 100755
index 0000000000..4fff9354a9
--- /dev/null
+++ b/examples/libtorchtext/build.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -eux
+
+this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+build_dir="${this_dir}/build"
+
+mkdir -p "${build_dir}"
+cd "${build_dir}"
+
+git submodule update --init --recursive  # fetch third-party sources (e.g. re2, sentencepiece)
+cmake \
+    -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
+    -DRE2_BUILD_TESTING:BOOL=OFF \
+    -DBUILD_TESTING:BOOL=OFF \
+    -DSPM_ENABLE_SHARED=OFF \
+    ..
+cmake --build .
diff --git a/examples/libtorchtext/tokenizer/README.md b/examples/libtorchtext/tokenizer/README.md
new file mode 100644
index 0000000000..4e651145cc
--- /dev/null
+++ b/examples/libtorchtext/tokenizer/README.md
@@ -0,0 +1,42 @@
+# Tokenizer
+
+This example demonstrates how you can use torchtext's `GPT2BPETokenizer` in a C++ environment.
+
+## Steps
+
+### 1. Download necessary artifacts
+
+First we download the `gpt2_bpe_vocab.bpe` and `gpt2_bpe_encoder.json` artifacts, both of which are needed to construct
+the `GPT2BPETokenizer` object.
+
+```bash
+curl -O https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe
+curl -O https://download.pytorch.org/models/text/gpt2_bpe_encoder.json
+```
+
+### 2. Create tokenizer TorchScript file
+
+Next we create the tokenizer object and save it as a TorchScript object. We also print the tokenizer's output on a
+sample sentence and verify that the output is the same before and after saving and re-loading the tokenizer. In the
+next steps we load and execute the tokenizer in our C++ application. The C++ code is found in
+[`main.cpp`](./main.cpp).
+
+```bash
+tokenizer_file="tokenizer.pt"
+python create_tokenizer.py --tokenizer-file "${tokenizer_file}"
+```
+
+### 3. Build the application
+
+Please refer to [the top level README.md](../README.md).
+
+### 4. Run the application
+
+Now we run the C++ application `tokenize` with the TorchScript file we created in Step 2. The application runs the
+tokenizer on the same sample sentence and we verify that its output matches that of Step 2.
+
+From [the top level directory](../), run:
+
+```bash
+./build/tokenizer/tokenize "tokenizer/${tokenizer_file}"
+```
diff --git a/examples/libtorchtext/tokenizer/create_tokenizer.py b/examples/libtorchtext/tokenizer/create_tokenizer.py
new file mode 100644
index 0000000000..5f8c695b50
--- /dev/null
+++ b/examples/libtorchtext/tokenizer/create_tokenizer.py
@@ -0,0 +1,29 @@
+from argparse import ArgumentParser
+
+import torch
+from torchtext import transforms
+
+
+def main(args):
+    tokenizer_file = args.tokenizer_file
+    sentence = "The green grasshopper jumped over the fence"
+
+    # create tokenizer object
+    encoder_json = "gpt2_bpe_encoder.json"
+    bpe_vocab = "gpt2_bpe_vocab.bpe"
+    tokenizer = transforms.GPT2BPETokenizer(encoder_json_path=encoder_json, vocab_bpe_path=bpe_vocab)
+
+    # script and save tokenizer
+    tokenizer = torch.jit.script(tokenizer)
+    print(tokenizer(sentence))
+    torch.jit.save(tokenizer, tokenizer_file)
+
+    # load saved tokenizer and verify outputs match
+    t = torch.jit.load(tokenizer_file)
+    print(t(sentence))
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--tokenizer-file", default="tokenizer.pt", type=str)
+    main(parser.parse_args())
diff --git a/examples/libtorchtext/tokenizer/main.cpp b/examples/libtorchtext/tokenizer/main.cpp
new file mode 100644
index 0000000000..7d3afe26a9
--- /dev/null
+++ b/examples/libtorchtext/tokenizer/main.cpp
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <torch/script.h>
+
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cerr << "Usage: tokenize <path-to-scripted-tokenizer>\n";
+    return -1;
+  }
+
+  std::cout << "Loading model...\n";
+
+  torch::jit::script::Module module;
+  try {
+    module = torch::jit::load(argv[1]);
+  } catch (const c10::Error& e) {
+    std::cerr << "Error loading the tokenizer module: " << e.what() << '\n';
+    return -1;
+  }
+
+  torch::NoGradGuard no_grad; // ensures that autograd is off
+  torch::jit::IValue tokens_ivalue = module.forward(
+      std::vector<torch::jit::IValue>(1, "The green grasshopper jumped over the fence"));
+  std::cout << "Result: " << tokens_ivalue << std::endl;
+
+  return 0;
+}
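Taken together, the two READMEs describe the following end-to-end sequence. This is a sketch assembled from the commands above; it assumes it is run from a checkout's `examples/libtorchtext/` directory with a Python environment in which `torch` and `torchtext` are importable.

```bash
# End-to-end walkthrough assembled from the READMEs above;
# assumes the current directory is examples/libtorchtext/.
cd tokenizer
curl -O https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe
curl -O https://download.pytorch.org/models/text/gpt2_bpe_encoder.json
python create_tokenizer.py --tokenizer-file tokenizer.pt   # prints the tokens twice
cd ..
./build.sh                                          # builds libtorchtext and the example apps
./build/tokenizer/tokenize tokenizer/tokenizer.pt   # should print the same tokens from C++
```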