pytorch
diff --git a/‎README.rst‎
Lines changed: 2 additions & 2 deletions b/‎README.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/text_classification/iterable_train.py‎
Lines changed: 4 additions & 4 deletions b/‎examples/text_classification/iterable_train.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎examples/text_classification/model.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/text_classification/model.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/text_classification/predict.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/text_classification/predict.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/text_classification/train.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/text_classification/train.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packaging/torchtext/meta.yaml‎
Lines changed: 1 addition & 0 deletions b/‎packaging/torchtext/meta.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎test/data/test_batch.py‎
Lines changed: 2 additions & 2 deletions b/‎test/data/test_batch.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎test/data/test_builtin_datasets.py‎
Lines changed: 34 additions & 0 deletions b/‎test/data/test_builtin_datasets.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎test/data/test_functional.py‎
Lines changed: 49 additions & 12 deletions b/‎test/data/test_functional.py‎
Lines changed: 49 additions & 12 deletions
diff --git a/‎test/experimental/test_transforms.py‎
Lines changed: 24 additions & 0 deletions b/‎test/experimental/test_transforms.py‎
Lines changed: 24 additions & 0 deletions
@@ -19,7 +19,7 @@ Note: we are currently re-designing the torchtext library to make it more compat
 
     pip install --pre torch torchtext -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html  
 
-For more detail instructions, please refer to `Install PyTorch <https://pytorch.org/get-started/locally/>`_. It should be noted that the new building blocks are still under development, and the APIs have not been solidified.
+For more detailed instructions, please refer to `Install PyTorch <https://pytorch.org/get-started/locally/>`_. It should be noted that the new building blocks are still under development, and the APIs have not been solidified.
 
 Installation
 ============
@@ -81,7 +81,7 @@ To build torchtext from source, you need ``git``, ``CMake`` and C++11 compiler s
 **Note**
 
 When building from source, make sure that you have the same C++ compiler as the one used to build PyTorch. A simple way is to build PyTorch from source and use the same environment to build torchtext.
-If you are using nightly build of PyTorch, checkout the environment it was built `here (conda) <https://github.com/pytorch/builder/tree/master/conda>`_ and `here (pip) <https://github.com/pytorch/builder/tree/master/manywheel>`_.
+If you are using the nightly build of PyTorch, check out the environment it was built with `conda (here) <https://github.com/pytorch/builder/tree/master/conda>`_ and `pip (here) <https://github.com/pytorch/builder/tree/master/manywheel>`_.
 
 Documentation
 =============
 
@@ -60,7 +60,7 @@ def train_and_valid(lr_, num_epoch, train_data_, valid_data_):
     r"""
     Here we use SGD optimizer to train the model.
 
-    Arguments:
+    Args:
         lr_: learning rate
         num_epoch: the number of epochs for training the model
         train_data_: the data used to train the model
@@ -108,7 +108,7 @@ def train_and_valid(lr_, num_epoch, train_data_, valid_data_):
 
 def test(data_):
     r"""
-    Arguments:
+    Args:
         data_: the data used to test the model
     """
     data = DataLoader(
@@ -137,7 +137,7 @@ def get_csv_iterator(data_path, ngrams, vocab, start=0, num_lines=None):
     Generate an iterator to read CSV file.
     The yield values are an integer for the label and a tensor for the text part.
 
-    Arguments:
+    Args:
         data_path: a path for the data file.
         ngrams: the number used for ngrams.
         vocab: a vocab object saving the string-to-index information
@@ -171,7 +171,7 @@ class Dataset(torch.utils.data.IterableDataset):
     An iterable dataset to save the data. This dataset supports multi-processing
     to load the data.
 
-    Arguments:
+    Args:
         iterator: the iterator to read data.
         num_lines: the number of lines read by the individual iterator.
     """
 
@@ -31,7 +31,7 @@ def init_weights(self):
 
     def forward(self, text, offsets):
         r"""
-        Arguments:
+        Args:
             text: 1-D tensor representing a bag of text tensors
             offsets: a list of offsets to delimit the 1-D text tensor
                 into the individual sequences.
 
@@ -11,7 +11,7 @@ def predict(text, model, dictionary, ngrams):
     The input text is numericalized with the vocab and then sent to
     the model for inference.
 
-    Arguments:
+    Args:
         text: a sample text string
         model: the trained model
         dictionary: a vocab object for the information of string-to-index
 
@@ -56,7 +56,7 @@ def train_and_valid(lr_, sub_train_, sub_valid_):
     We use a SGD optimizer to train the model here and the learning rate
     decreases linearly with the progress of the training process.
 
-    Arguments:
+    Args:
         lr_: learning rate
         sub_train_: the data used to train the model
         sub_valid_: the data used for validation
@@ -94,7 +94,7 @@ def train_and_valid(lr_, sub_train_, sub_valid_):
 
 def test(data_):
     r"""
-    Arguments:
+    Args:
         data_: the data used to test the model
     """
     data = DataLoader(data_, batch_size=batch_size, collate_fn=generate_batch)
 
@@ -38,6 +38,7 @@ test:
 
   requires:
     - pytest
+    - cpuonly
 
 about:
   home: https://github.com/pytorch/text
 
@@ -37,7 +37,7 @@ def test_batch_iter(self):
         batch = next(iter(itr))
         (x1, x2), y = batch
         x = (x1, x2)[fld_order.index("float")]
-        self.assertEquals(y.data[0], 1)
-        self.assertEquals(y.data[1], 12)
+        self.assertEqual(y.data[0], 1)
+        self.assertEqual(y.data[1], 12)
         self.assertAlmostEqual(x.data[0], 0.1, places=4)
         self.assertAlmostEqual(x.data[1], 0.5, places=4)
@@ -162,6 +162,40 @@ def test_imdb(self):
         self._helper_test_func(len(test_iter), 25000, next(iter(test_iter))[1][:25], 'I love sci-fi and am will')
         del train_iter, test_iter
 
+    def test_iwslt(self):
+        from torchtext.experimental.datasets import IWSLT
+
+        train_dataset, valid_dataset, test_dataset = IWSLT()
+
+        self.assertEqual(len(train_dataset), 196884)
+        self.assertEqual(len(valid_dataset), 993)
+        self.assertEqual(len(test_dataset), 1305)
+
+        de_vocab, en_vocab = train_dataset.get_vocab()
+
+        def assert_nth_pair_is_equal(n, expected_sentence_pair):
+            de_sentence = [de_vocab.itos[index] for index in train_dataset[n][0]]
+            en_sentence = [en_vocab.itos[index] for index in train_dataset[n][1]]
+            expected_de_sentence, expected_en_sentence = expected_sentence_pair
+
+            self.assertEqual(de_sentence, expected_de_sentence)
+            self.assertEqual(en_sentence, expected_en_sentence)
+
+        assert_nth_pair_is_equal(0, (['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange',
+                                      '.', 'Ich', 'bin', 'Dave', 'Gallo', '.', '\n'],
+                                     ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange',
+                                      '.', 'I', "'m", 'Dave', 'Gallo', '.', '\n']))
+        assert_nth_pair_is_equal(10, (['Die', 'meisten', 'Tiere', 'leben', 'in',
+                                       'den', 'Ozeanen', '.', '\n'],
+                                      ['Most', 'of', 'the', 'animals', 'are', 'in',
+                                       'the', 'oceans', '.', '\n']))
+        assert_nth_pair_is_equal(20, (['Es', 'ist', 'einer', 'meiner', 'Lieblinge', ',', 'weil', 'es',
+                                       'alle', 'möglichen', 'Funktionsteile', 'hat', '.', '\n'],
+                                      ['It', "'s", 'one', 'of', 'my', 'favorites', ',', 'because', 'it', "'s",
+                                       'got', 'all', 'sorts', 'of', 'working', 'parts', '.', '\n']))
+        datafile = os.path.join(self.project_root, ".data", "2016-01.tgz")
+        conditional_remove(datafile)
+
     def test_multi30k(self):
         from torchtext.experimental.datasets import Multi30k
         # smoke test to ensure multi30k works properly
 
@@ -107,13 +107,24 @@ def test_BasicEnglishNormalize(self):
         self.assertEqual(eager_tokens, ref_results)
         self.assertEqual(experimental_eager_tokens, ref_results)
 
-        # test load and save
-        save_path = os.path.join(self.test_dir, 'basic_english_normalize.pt')
-        torch.save(basic_eng_norm.to_ivalue(), save_path)
-        loaded_basic_eng_norm = torch.load(save_path)
+    def test_basicEnglishNormalize_load_and_save(self):
+        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
+        ref_results = ["'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
+                       'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?']
 
-        loaded_eager_tokens = loaded_basic_eng_norm(test_sample)
-        self.assertEqual(loaded_eager_tokens, ref_results)
+        with self.subTest('pybind'):
+            save_path = os.path.join(self.test_dir, 'ben_pybind.pt')
+            ben = basic_english_normalize()
+            torch.save(ben, save_path)
+            loaded_ben = torch.load(save_path)
+            self.assertEqual(loaded_ben(test_sample), ref_results)
+
+        with self.subTest('torchscript'):
+            save_path = os.path.join(self.test_dir, 'ben_torchscrip.pt')
+            ben = basic_english_normalize().to_ivalue()
+            torch.save(ben, save_path)
+            loaded_ben = torch.load(save_path)
+            self.assertEqual(loaded_ben(test_sample), ref_results)
 
     # TODO(Nayef211): remove decorator once https://github.com/pytorch/pytorch/issues/38207 is closed
     @unittest.skipIf(platform.system() == "Windows", "Test is known to fail on Windows.")
@@ -147,13 +158,39 @@ def test_RegexTokenizer(self):
         self.assertEqual(eager_tokens, ref_results)
         self.assertEqual(jit_tokens, ref_results)
 
-        # test load and save
-        save_path = os.path.join(self.test_dir, 'regex.pt')
-        torch.save(r_tokenizer.to_ivalue(), save_path)
-        loaded_r_tokenizer = torch.load(save_path)
+    def test_load_and_save(self):
+        test_sample = '\'".<br />,()!?;:   Basic Regex Tokenization for a Line of Text   \'".<br />,()!?;:'
+        ref_results = ["'", '.', ',', '(', ')', '!', '?', 'Basic', 'Regex', 'Tokenization',
+                       'for', 'a', 'Line', 'of', 'Text', "'", '.', ',', '(', ')', '!', '?']
+        patterns_list = [
+            (r'\'', ' \'  '),
+            (r'\"', ''),
+            (r'\.', ' . '),
+            (r'<br \/>', ' '),
+            (r',', ' , '),
+            (r'\(', ' ( '),
+            (r'\)', ' ) '),
+            (r'\!', ' ! '),
+            (r'\?', ' ? '),
+            (r'\;', ' '),
+            (r'\:', ' '),
+            (r'\s+', ' ')]
 
-        loaded_eager_tokens = loaded_r_tokenizer(test_sample)
-        self.assertEqual(loaded_eager_tokens, ref_results)
+        with self.subTest('pybind'):
+            save_path = os.path.join(self.test_dir, 'regex_pybind.pt')
+            tokenizer = regex_tokenizer(patterns_list)
+            torch.save(tokenizer, save_path)
+            loaded_tokenizer = torch.load(save_path)
+            results = loaded_tokenizer(test_sample)
+            self.assertEqual(results, ref_results)
+
+        with self.subTest('torchscript'):
+            save_path = os.path.join(self.test_dir, 'regex_torchscript.pt')
+            tokenizer = regex_tokenizer(patterns_list).to_ivalue()
+            torch.save(tokenizer, save_path)
+            loaded_tokenizer = torch.load(save_path)
+            results = loaded_tokenizer(test_sample)
+            self.assertEqual(results, ref_results)
 
     def test_custom_replace(self):
         custom_replace_transform = custom_replace([(r'S', 's'), (r'\s+', ' ')])
 
@@ -54,3 +54,27 @@ def test_vector_transform(self):
                                                         [-0.32423, -0.098845, -0.0073467]])
             self.assertEqual(vector_transform(['the', 'world'])[:, 0:3], expected_fasttext_simple_en)
             self.assertEqual(jit_vector_transform(['the', 'world'])[:, 0:3], expected_fasttext_simple_en)
+
+    def test_sentencepiece_load_and_save(self):
+        model_path = get_asset_path('spm_example.model')
+        input = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
+        expected = [
+            '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is',
+            '▁an', '▁un', 'super', 'vis', 'ed', '▁text',
+            '▁to', 'ken', 'izer', '▁and',
+            '▁de', 'to', 'ken', 'izer',
+        ]
+
+        with self.subTest('pybind'):
+            save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
+            spm = sentencepiece_tokenizer((model_path))
+            torch.save(spm, save_path)
+            loaded_spm = torch.load(save_path)
+            self.assertEqual(expected, loaded_spm(input))
+
+        with self.subTest('torchscript'):
+            save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
+            spm = sentencepiece_tokenizer((model_path)).to_ivalue()
+            torch.save(spm, save_path)
+            loaded_spm = torch.load(save_path)
+            self.assertEqual(expected, loaded_spm(input))