Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requests
# Optional NLP tools
nltk
spacy
sacremoses
git+https://github.com/jekbradbury/revtok.git

# Documentation
Expand Down
9 changes: 8 additions & 1 deletion test/data/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_get_tokenizer(self):
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

# Test Moses option. Test strings taken from NLTK doctests.
# Test Moses option.
# Note that internally, MosesTokenizer converts to unicode if applicable
moses_tokenizer = data.get_tokenizer("moses")
assert moses_tokenizer(test_str) == [
Expand All @@ -26,6 +26,13 @@ def test_get_tokenizer(self):
# Nonbreaking prefixes should tokenize the final period.
assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]

# Test Toktok option. Test strings taken from NLTK doctests.
# Note that internally, ToktokTokenizer converts to unicode if applicable
toktok_tokenizer = data.get_tokenizer("toktok")
assert toktok_tokenizer(test_str) == [
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

# Test that errors are raised for invalid input arguments.
with self.assertRaises(ValueError):
data.get_tokenizer(1)
Expand Down
18 changes: 12 additions & 6 deletions torchtext/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,22 @@ def get_tokenizer(tokenizer):
raise
elif tokenizer == "moses":
try:
from nltk.tokenize.moses import MosesTokenizer
from sacremoses import MosesTokenizer
moses_tokenizer = MosesTokenizer()
return moses_tokenizer.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at http://nltk.org for more information.")
print("Please install SacreMoses. "
"See the docs at https://github.com/alvations/sacremoses "
"for more information.")
raise
except LookupError:
print("Please install the necessary NLTK corpora. "
"See the docs at http://nltk.org for more information.")
elif tokenizer == "toktok":
try:
from nltk.tokenize.toktok import ToktokTokenizer
toktok = ToktokTokenizer()
return toktok.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at https://nltk.org for more information.")
raise
elif tokenizer == 'revtok':
try:
Expand Down