From eb9d57bb3c03b38bb86cf59260d3d477b8c21dae Mon Sep 17 00:00:00 2001 From: Nayef Ahmed Date: Tue, 21 Feb 2023 12:47:06 -0800 Subject: [PATCH] Fix UTF-8 decoding error in GPT2BPETokenizer `decode` method Summary: - pybind11 throws an error when converting a C++ `std::string` that contains incomplete UTF-8 byte sequences, because the default UTF-8 conversion uses `"strict"` error handling ([ref](https://docs.python.org/3/library/codecs.html#error-handlers)) - To resolve user issues (see [post](https://fb.workplace.com/groups/pytorchtext/permalink/899318121386487/)), we set the error handling to `"ignore"`, which skips the malformed bytes and continues decoding the rest of the string Differential Revision: D43361716 fbshipit-source-id: 4ac488e4b4b894c8049728941a2ee36b1799258a --- torchtext/csrc/register_pybindings.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torchtext/csrc/register_pybindings.cpp b/torchtext/csrc/register_pybindings.cpp index 5f0a6d0483..afa4708cdd 100644 --- a/torchtext/csrc/register_pybindings.cpp +++ b/torchtext/csrc/register_pybindings.cpp @@ -179,7 +179,16 @@ PYBIND11_MODULE(_torchtext, m) { .def_property_readonly("byte_encoder_", &GPT2BPEEncoder::GetByteEncoder) .def("encode", &GPT2BPEEncoder::Encode) .def("tokenize", &GPT2BPEEncoder::Tokenize) - .def("decode", &GPT2BPEEncoder::Decode) + .def( + "decode", + [](const c10::intrusive_ptr& self, + const std::vector& tokens) { + std::string s = self->Decode(tokens); + PyObject* py_obj = + PyUnicode_DecodeUTF8(s.data(), s.length(), "ignore"); + py::str py_s = py::reinterpret_steal(py_obj); + return py_s; + }) .def( "add_special_tokens", [](const c10::intrusive_ptr& self,