From 922221c9862123b9ff1aa1b4bbcf33809763c671 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 4 Feb 2021 11:35:47 -0800 Subject: [PATCH 1/3] Replace model with full name when spacy load is used Model name shortcuts no longer work in spaCy 3.0 or later --- torchtext/data/utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 045c2646fb..ba4995e102 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -112,7 +112,27 @@ def get_tokenizer(tokenizer, language='en'): if tokenizer == "spacy": try: import spacy - spacy = spacy.load(language) + try: + spacy = spacy.load(language) + except IOError: + # Model shortcuts no longer work in spaCy 3.0+, try using fullnames + # List is from https://github.com/explosion/spaCy/blob/b903de3fcb56df2f7247e5b6cfa6b66f4ff02b62/spacy/errors.py#L789 + OLD_MODEL_SHORTCUTS = { + 'en': 'en_core_web_sm', + 'de': 'de_core_news_sm', + 'es': 'es_core_news_sm', + 'pt': 'pt_core_news_sm', + 'fr': 'fr_core_news_sm', + 'it': 'it_core_news_sm', + 'nl': 'nl_core_news_sm', + 'el': 'el_core_news_sm', + 'nb': 'nb_core_news_sm', + 'lt': 'lt_core_news_sm', + 'xx': 'xx_ent_wiki_sm' + } + if language not in OLD_MODEL_SHORTCUTS: + raise + spacy = spacy.load(OLD_MODEL_SHORTCUTS[language]) return partial(_spacy_tokenize, spacy=spacy) except ImportError: print("Please install SpaCy. 
" From d8b9a03eaed09c005fc4519c497412833f265df8 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 4 Feb 2021 11:48:09 -0800 Subject: [PATCH 2/3] Add warning --- torchtext/data/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index ba4995e102..531f076193 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -132,6 +132,8 @@ def get_tokenizer(tokenizer, language='en'): } if language not in OLD_MODEL_SHORTCUTS: raise + import warnings + warnings.warn(f'Spacy model "{language}" could not be loaded, trying "{OLD_MODEL_SHORTCUTS[language]}" instead') spacy = spacy.load(OLD_MODEL_SHORTCUTS[language]) return partial(_spacy_tokenize, spacy=spacy) except ImportError: From c9010a19ee5464f44400e19a20f76be5e84f02c1 Mon Sep 17 00:00:00 2001 From: cpuhrsch Date: Thu, 4 Feb 2021 14:52:27 -0500 Subject: [PATCH 3/3] Update torchtext/data/utils.py Co-authored-by: Nikita Shulga --- torchtext/data/utils.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 531f076193..12f653290b 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -117,19 +117,7 @@ def get_tokenizer(tokenizer, language='en'): except IOError: # Model shortcuts no longer work in spaCy 3.0+, try using fullnames # List is from https://github.com/explosion/spaCy/blob/b903de3fcb56df2f7247e5b6cfa6b66f4ff02b62/spacy/errors.py#L789 - OLD_MODEL_SHORTCUTS = { - 'en': 'en_core_web_sm', - 'de': 'de_core_news_sm', - 'es': 'es_core_news_sm', - 'pt': 'pt_core_news_sm', - 'fr': 'fr_core_news_sm', - 'it': 'it_core_news_sm', - 'nl': 'nl_core_news_sm', - 'el': 'el_core_news_sm', - 'nb': 'nb_core_news_sm', - 'lt': 'lt_core_news_sm', - 'xx': 'xx_ent_wiki_sm' - } + OLD_MODEL_SHORTCUTS = spacy.errors.OLD_MODEL_SHORTCUTS if hasattr(spacy.errors, 'OLD_MODEL_SHORTCUTS') else {} if language not in OLD_MODEL_SHORTCUTS: raise import warnings