diff --git a/requirements.txt b/requirements.txt index 6cea843d6a..b9237a2275 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ requests nltk spacy git+git://github.com/jekbradbury/revtok.git +sacremoses # Documentation Sphinx diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 97a27e5a2e..111f897ec3 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -21,16 +21,22 @@ def get_tokenizer(tokenizer): raise elif tokenizer == "moses": try: - from nltk.tokenize.moses import MosesTokenizer + from sacremoses import MosesTokenizer moses_tokenizer = MosesTokenizer() return moses_tokenizer.tokenize except ImportError: - print("Please install NLTK. " - "See the docs at http://nltk.org for more information.") + print("Please install SacreMoses. " + "See the docs at https://github.com/alvations/sacremoses " + "for more information.") raise - except LookupError: - print("Please install the necessary NLTK corpora. " - "See the docs at http://nltk.org for more information.") + elif tokenizer == "toktok": + try: + from nltk.tokenize.toktok import ToktokTokenizer + toktok = ToktokTokenizer() + return toktok.tokenize + except ImportError: + print("Please install NLTK. " + "See the docs at https://nltk.org for more information.") raise elif tokenizer == 'revtok': try: