diff --git a/requirements.txt b/requirements.txt
index 6cea843d6a..cc0a18140f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ requests
 # Optional NLP tools
 nltk
 spacy
+sacremoses
 git+git://github.com/jekbradbury/revtok.git
 
 # Documentation
diff --git a/test/data/test_utils.py b/test/data/test_utils.py
index 362cc10ae8..40bcb01633 100644
--- a/test/data/test_utils.py
+++ b/test/data/test_utils.py
@@ -16,7 +16,7 @@ def test_get_tokenizer(self):
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
-        # Test Moses option. Test strings taken from NLTK doctests.
+        # Test Moses option.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         moses_tokenizer = data.get_tokenizer("moses")
         assert moses_tokenizer(test_str) == [
@@ -26,6 +26,13 @@ def test_get_tokenizer(self):
         # Nonbreaking prefixes should tokenize the final period.
         assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
 
+        # Test Toktok option. Test strings taken from NLTK doctests.
+        # Note that internally, MosesTokenizer converts to unicode if applicable
+        toktok_tokenizer = data.get_tokenizer("toktok")
+        assert toktok_tokenizer(test_str) == [
+            "A", "string", ",", "particularly", "one", "with", "slightly",
+            "complex", "punctuation", "."]
+
         # Test that errors are raised for invalid input arguments.
         with self.assertRaises(ValueError):
             data.get_tokenizer(1)
diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
index 97a27e5a2e..111f897ec3 100644
--- a/torchtext/data/utils.py
+++ b/torchtext/data/utils.py
@@ -21,16 +21,22 @@ def get_tokenizer(tokenizer):
             raise
     elif tokenizer == "moses":
         try:
-            from nltk.tokenize.moses import MosesTokenizer
+            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
             return moses_tokenizer.tokenize
         except ImportError:
-            print("Please install NLTK. "
-                  "See the docs at http://nltk.org for more information.")
+            print("Please install SacreMoses. "
+                  "See the docs at https://github.com/alvations/sacremoses "
+                  "for more information.")
             raise
-        except LookupError:
-            print("Please install the necessary NLTK corpora. "
-                  "See the docs at http://nltk.org for more information.")
+    elif tokenizer == "toktok":
+        try:
+            from nltk.tokenize.toktok import ToktokTokenizer
+            toktok = ToktokTokenizer()
+            return toktok.tokenize
+        except ImportError:
+            print("Please install NLTK. "
+                  "See the docs at https://nltk.org for more information.")
             raise
     elif tokenizer == 'revtok':
         try: