pytorch · zhangguanheng66 · Oct 9, 2020 · Oct 9, 2020 · Oct 9, 2020 · Oct 9, 2020
diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py
@@ -17,21 +17,26 @@ def tearDown(self):
         torch._C._jit_clear_class_registry()
         torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
 
-    def test_has_unk(self):
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
+    def test_has_no_unk(self):
         c = OrderedDict()
         v = vocab(c)
+        with self.assertRaisesRegex(RuntimeError, 'bad optional access'):
+            v.get_default_index()
 
         # check if unk is mapped to the first index
-        self.assertEqual(v['not_in_it'], 0)
-        self.assertEqual(v['<unk>'], 0)
-
-    def test_new_unk(self):
-        c = OrderedDict()
-        v = vocab(c, unk_token="<new_unk>")
+        with self.assertRaises(RuntimeError):
+            v['not_in_it']
+        with self.assertRaises(RuntimeError):
+            v['<unk>']
 
-        # check if new_unk is mapped to the first index
-        self.assertEqual(v['<new_unk>'], 0)
+        v.insert_token('not_in_it', 0)
+        v.set_default_index(0)
+        self.assertEqual(v.get_default_index(), 0)
         self.assertEqual(v['not_in_it'], 0)
+        self.assertEqual(v['<unk>'], 0)
 
     def test_vocab_get_item(self):
         token_to_freq = {'<unk>': 2, 'a': 2, 'b': 2}
@@ -43,35 +48,81 @@ def test_vocab_get_item(self):
         self.assertEqual(v['a'], 1)
         self.assertEqual(v['b'], 2)
 
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
+    def test_vocab_set_item(self):
+        token_to_freq = {'<unk>': 2, 'a': 2, 'b': 2}
+        sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
+        c = OrderedDict(sorted_by_freq_tuples)
+        v = vocab(c, min_freq=2)
+
+        v.set_default_index(0)
+        with self.assertRaises(RuntimeError):
+            v['b'] = 1
+        del v['b']
+        self.assertEqual(v['<unk>'], 0)
+        self.assertEqual(v['a'], 1)
+        self.assertEqual(v['not_in_it'], 0)
+        self.assertEqual(v['b'], 0)
+
+        v['b'] = 1
+        self.assertEqual(v['<unk>'], 0)
+        self.assertEqual(v['b'], 1)
+        self.assertEqual(v['not_in_it'], 0)
+        self.assertEqual(v['a'], 0)
+
     def test_vocab_insert_token(self):
         c = OrderedDict({'<unk>': 2, 'a': 2})
 
         # add item to end
         v = vocab(c)
+        v.set_default_index(0)
         v.insert_token('b', 2)
 
         expected_itos = ['<unk>', 'a', 'b']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
+        self.assertEqual(v.get_default_index(), 0)
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
 
         # add item to middle
         v = vocab(c)
+        v.set_default_index(0)
         v.insert_token('b', 0)
 
         expected_itos = ['b', '<unk>', 'a']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
+        self.assertEqual(v.get_default_index(), 1)
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
 
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
+    def test_insert_existing_token(self):
+        c = OrderedDict({'a': 2, 'b': 2, 'c': 2})
+
+        # add item to end
+        v = vocab(c)
+        v.insert_token('<unk>', 2)
+        v.set_default_index(2)
+
+        with self.assertRaises(RuntimeError):
+            # Test proper error raised when setting a token out of bounds
+            v.insert_token('<unk>', 1)
+
+        v.insert_token('d', 1)
+        self.assertEqual(v['not_in_it'], 3)
+
     def test_vocab_append_token(self):
         c = OrderedDict({'a': 2})
         v = vocab(c)
         v.append_token('b')
 
-        expected_itos = ['<unk>', 'a', 'b']
+        expected_itos = ['a', 'b']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
         self.assertEqual(v.get_itos(), expected_itos)
@@ -83,7 +134,7 @@ def test_vocab_len(self):
         c = OrderedDict(sorted_by_freq_tuples)
         v = vocab(c)
 
-        self.assertEqual(len(v), 4)
+        self.assertEqual(len(v), 3)
 
     def test_vocab_basic(self):
         token_to_freq = {'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}
@@ -92,12 +143,15 @@ def test_vocab_basic(self):
         c = OrderedDict(sorted_by_freq_tuples)
         v = vocab(c, min_freq=3)
 
-        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
+        expected_itos = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
 
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
     def test_vocab_jit(self):
         token_to_freq = {'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}
         sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
@@ -106,7 +160,7 @@ def test_vocab_jit(self):
         v = vocab(c, min_freq=3)
         jit_v = torch.jit.script(v)
 
-        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
+        expected_itos = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
         assert not v.is_jitable
@@ -117,6 +171,9 @@ def test_vocab_jit(self):
         self.assertEqual(jit_v.get_itos(), expected_itos)
         self.assertEqual(dict(jit_v.get_stoi()), expected_stoi)
 
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
     def test_vocab_forward(self):
         token_to_freq = {'a': 2, 'b': 2, 'c': 2}
         sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
@@ -126,7 +183,7 @@ def test_vocab_forward(self):
         jit_v = torch.jit.script(v)
 
         tokens = ['b', 'a', 'c']
-        expected_indices = [2, 1, 3]
+        expected_indices = [1, 0, 2]
 
         self.assertEqual(v(tokens), expected_indices)
         self.assertEqual(jit_v(tokens), expected_indices)
@@ -137,15 +194,15 @@ def test_vocab_lookup_token(self):
         c = OrderedDict(sorted_by_freq_tuples)
         v = vocab(c)
 
-        self.assertEqual(v.lookup_token(1), 'a')
+        self.assertEqual(v.lookup_token(0), 'a')
 
     def test_vocab_lookup_tokens(self):
         token_to_freq = {'a': 2, 'b': 2, 'c': 2}
         sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
         c = OrderedDict(sorted_by_freq_tuples)
         v = vocab(c)
 
-        indices = [2, 1, 3]
+        indices = [1, 0, 2]
         expected_tokens = ['b', 'a', 'c']
 
         self.assertEqual(v.lookup_tokens(indices), expected_tokens)
@@ -157,7 +214,7 @@ def test_vocab_lookup_indices(self):
         v = vocab(c)
 
         tokens = ['b', 'a', 'c']
-        expected_indices = [2, 1, 3]
+        expected_indices = [1, 0, 2]
 
         self.assertEqual(v.lookup_indices(tokens), expected_indices)
 
@@ -179,23 +236,27 @@ def test_errors_vocab_cpp(self):
             v = vocab(c)
             v.lookup_token(100)
 
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
     def test_errors_vocab_python(self):
         token_to_freq = {'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}
         sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
         c = OrderedDict(sorted_by_freq_tuples)
+        v = vocab(c)
 
-        with self.assertRaises(ValueError):
+        with self.assertRaises(RuntimeError):
             # Test proper error raised when setting unk token to None
-            vocab(c, unk_token=None)
+            v(['not_in_vocab'])
 
     def test_vocab_load_and_save(self):
         token_to_freq = {'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}
         sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
 
         c = OrderedDict(sorted_by_freq_tuples)
         v = vocab(c, min_freq=3)
-
-        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
+        v.set_default_index(1)
+        expected_itos = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
 
         self.assertEqual(v.get_itos(), expected_itos)
@@ -221,7 +282,7 @@ def test_build_vocab_iterator(self):
         iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
         v = build_vocab_from_iterator(iterator)
-        expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
+        expected_itos = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
diff --git a/test/experimental/test_with_asset.py b/test/experimental/test_with_asset.py
@@ -15,11 +15,11 @@
     load_vocab_from_file,
     build_vocab_from_text_file,
 )
+import unittest
+import platform
 import shutil
 import tempfile
 import os
-import unittest
-import platform
 from torchtext.experimental.vectors import (
     GloVe,
     build_vectors,
@@ -75,6 +75,9 @@ def test_wikitext103(self):
 
 
 class TestTransformsWithAsset(TorchtextTestCase):
+    # we separate out these errors because Windows runs into seg faults when propagating
+    # exceptions from C++ using pybind11
+    @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
     def test_vocab_transform(self):
         asset_name = 'vocab_test2.txt'
         asset_path = get_asset_path(asset_name)
@@ -180,7 +183,8 @@ def test_vocab_from_file(self):
         asset_name = 'vocab_test.txt'
         asset_path = get_asset_path(asset_name)
         with open(asset_path, 'r') as f:
-            v = load_vocab_from_file(f, unk_token='<new_unk>')
+            v = load_vocab_from_file(f)
+            v.insert_token('<new_unk>', 0)
             expected_itos = ['<new_unk>', 'b', 'a', 'c']
             expected_stoi = {x: index for index, x in enumerate(expected_itos)}
             self.assertEqual(v.get_itos(), expected_itos)
@@ -192,8 +196,8 @@ def test_vocab_from_raw_text_file(self):
         with open(asset_path, 'r') as f:
             tokenizer = basic_english_normalize()
             jit_tokenizer = torch.jit.script(tokenizer)
-            v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
+            v = build_vocab_from_text_file(f, jit_tokenizer)
+            expected_itos = ["'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
                              'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
                              'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
                              'unions', 'with', 'workers']

diff --git a/torchtext/csrc/register_bindings.cpp b/torchtext/csrc/register_bindings.cpp
@@ -15,12 +15,11 @@ namespace py = pybind11;
 
 namespace {
 Vocab build_vocab_from_text_file(const std::string &file_path,
-                                 const std::string &unk_token,
                                  const int64_t min_freq,
                                  const int64_t num_cpus,
                                  py::object fn) {
   torch::jit::script::Module module(*torch::jit::as_module(fn));
-  return _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, module);
+  return _build_vocab_from_text_file(file_path, min_freq, num_cpus, module);
 }
 } // namespace
 
@@ -100,12 +99,15 @@ PYBIND11_MODULE(_torchtext, m) {
           }));
 
   py::class_<Vocab, c10::intrusive_ptr<Vocab>>(m, "Vocab")
-      .def(py::init<std::vector<std::string>, std::string>())
+      .def(py::init<std::vector<std::string>>())
       .def_readonly("itos_", &Vocab::itos_)
-      .def_readonly("unk_token_", &Vocab::unk_token_)
       .def("__getitem__", &Vocab::__getitem__)
+      .def("__setitem__", &Vocab::__setitem__)
+      .def("__delitem__", &Vocab::__delitem__)
       .def("__len__", &Vocab::__len__)
       .def("insert_token", &Vocab::insert_token)
+      .def("set_default_index", &Vocab::set_default_index)
+      .def("get_default_index", &Vocab::get_default_index)
       .def("append_token", &Vocab::append_token)
       .def("lookup_token", &Vocab::lookup_token)
       .def("lookup_tokens", &Vocab::lookup_tokens)
@@ -202,10 +204,14 @@ TORCH_LIBRARY_FRAGMENT(torchtext, m) {
         });
 
   m.class_<Vocab>("Vocab")
-    .def(torch::init<StringList, std::string>())
+    .def(torch::init<StringList>())
     .def("__getitem__", &Vocab::__getitem__)
+    .def("__setitem__", &Vocab::__setitem__)
+    .def("__delitem__", &Vocab::__delitem__)
     .def("__len__", &Vocab::__len__)
     .def("insert_token", &Vocab::insert_token)
+    .def("set_default_index", &Vocab::set_default_index)
+    .def("get_default_index", &Vocab::get_default_index)
     .def("append_token", &Vocab::append_token)
     .def("lookup_token", &Vocab::lookup_token)
     .def("lookup_tokens", &Vocab::lookup_tokens)