From cc4d12a0de70667a001fc5d7814528365430800c Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 09:04:20 -0800 Subject: [PATCH 01/13] switch spacy shortcut to en_core_web_sm --- torchtext/data/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 045c2646fb..f74a9326fd 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -72,7 +72,7 @@ def _basic_english_normalize(line): return line.split() -def get_tokenizer(tokenizer, language='en'): +def get_tokenizer(tokenizer, language='en_core_web_sm'): r""" Generate tokenizer function for a string sentence. @@ -101,8 +101,6 @@ def get_tokenizer(tokenizer, language='en'): return _split_tokenizer if tokenizer == "basic_english": - if language != 'en': - raise ValueError("Basic normalization is only available for Enlish(en)") return _basic_english_normalize # simply return if a function is passed From c365fe9388e94a995f6c2c26ddfff583172da063 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 09:26:41 -0800 Subject: [PATCH 02/13] checkpoint --- test/translation.py | 4 ++-- torchtext/data/field.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/translation.py b/test/translation.py index eb3c47a349..89a6f9017f 100644 --- a/test/translation.py +++ b/test/translation.py @@ -4,8 +4,8 @@ import re import spacy -spacy_de = spacy.load('de') -spacy_en = spacy.load('en') +spacy_de = spacy.load('de_core_news_sm') +spacy_en = spacy.load('en_core_web_sm') url = re.compile('(.*)') diff --git a/torchtext/data/field.py b/torchtext/data/field.py index 95be1b85f2..ff3a501b3e 100644 --- a/torchtext/data/field.py +++ b/torchtext/data/field.py @@ -492,7 +492,7 @@ class NestedField(Field): def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None, fix_length=None, dtype=torch.long, preprocessing=None, - postprocessing=None, tokenize=None, tokenizer_language='en', + postprocessing=None, tokenize=None, tokenizer_language='en_core_web_sm', include_lengths=False, pad_token='', pad_first=False, truncate_first=False): warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning) From 1d5d354d325ae180904330dc8d7d951d5b8f89fd Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 09:35:54 -0800 Subject: [PATCH 03/13] update circleci --- .circleci/unittest/linux/scripts/setup_env.sh | 4 +++- .circleci/unittest/windows/scripts/setup_env.sh | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh index d4f5457906..02ed10d938 100755 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -45,4 +45,6 @@ fi # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en +python -m spacy download en_core_web_md +printf "* Downloading SpaCy German models\n" +python -m spacy download de_core_news_md diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh index cd44b436dc..eb5404f261 100644 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ b/.circleci/unittest/windows/scripts/setup_env.sh @@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en +python -m spacy download en_core_web_md +printf "* Downloading SpaCy German models\n" +python -m spacy download de_core_news_md From ba4dcb87f727f99223e1ac326591da7f34726128 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 18:54:52 -0800 Subject: [PATCH 04/13] checkpoint --- torchtext/data/field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/data/field.py b/torchtext/data/field.py index ff3a501b3e..f1a897d047 100644 --- a/torchtext/data/field.py +++ b/torchtext/data/field.py @@ -143,7 +143,7 @@ class Field(RawField): def __init__(self, sequential=True, use_vocab=True, init_token=None, eos_token=None, fix_length=None, dtype=torch.long, preprocessing=None, postprocessing=None, lower=False, - tokenize=None, tokenizer_language='en', include_lengths=False, + tokenize=None, tokenizer_language='en_core_web_sm', include_lengths=False, batch_first=False, pad_token="", unk_token="", pad_first=False, truncate_first=False, stop_words=None, is_target=False): From 846f0e5a385df97b81f7c3995952bc69a8d93d1a Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 19:01:26 -0800 Subject: [PATCH 05/13] update circleci environment.ymml --- .circleci/unittest/linux/scripts/environment.yml | 4 ++-- .circleci/unittest/windows/scripts/environment.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index b60f219e16..282877c207 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -17,5 +17,5 @@ dependencies: - sphinx - sphinx-rtd-theme - tqdm - - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5 - - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 + - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0 + - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0 diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml index 28790d2585..2d7c790b91 100644 --- a/.circleci/unittest/windows/scripts/environment.yml +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -19,5 +19,5 @@ dependencies: - tqdm - certifi - future - - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5 - - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 + - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0 + - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0 From 1158243684fd368a8877ade65053a98ee669e8ff Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 19:30:52 -0800 Subject: [PATCH 06/13] checkpoint --- .circleci/unittest/linux/scripts/setup_env.sh | 4 ++-- .circleci/unittest/windows/scripts/setup_env.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh index 02ed10d938..f25d78a447 100755 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -45,6 +45,6 @@ fi # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en_core_web_md +python -m spacy download en_core_web_sd printf "* Downloading SpaCy German models\n" -python -m spacy download de_core_news_md +python -m spacy download de_core_news_sd diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh index eb5404f261..8b4284c3ab 100644 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ b/.circleci/unittest/windows/scripts/setup_env.sh @@ -39,6 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en_core_web_md +python -m spacy download en_core_web_sd printf "* Downloading SpaCy German models\n" -python -m spacy download de_core_news_md +python -m spacy download de_core_news_sd From eab0524d972e2a584bb4c3c259c1e593cf72d546 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 19:51:07 -0800 Subject: [PATCH 07/13] BC breaking in spacy 3.0 --- test/data/test_builtin_datasets.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index fcef4f1507..0688634af0 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -200,14 +200,21 @@ def test_multi30k(self): from torchtext.experimental.datasets import Multi30k # smoke test to ensure multi30k works properly train_dataset, valid_dataset, test_dataset = Multi30k() + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(train_dataset), 29000, train_dataset[20], - ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2], [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3])) + self._helper_test_func(len(valid_dataset), 1014, valid_dataset[30], ([4, 179, 26, 85, 1005, 57, 19, 154, 3, 2], [5, 24, 32, 81, 47, 1348, 6, 2, 119, 4, 3])) + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(test_dataset), 1000, test_dataset[40], - ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2], + # ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2], + ([4, 26, 6, 12, 3913, 1537, 21, 64, 3, 2], [5, 32, 20, 2, 747, 345, 1915, 6, 46, 4, 3])) de_vocab, en_vocab = train_dataset.get_vocab() @@ -234,8 +241,11 @@ def test_multi30k(self): 'A group of men are loading cotton onto a truck\n'])) del train_iter, valid_iter train_dataset, = Multi30k(data_select=('train')) + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(train_dataset), 29000, train_dataset[20], - ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2], [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3])) datafile = os.path.join(self.project_root, ".data", "train*") From a4080430b2059b87bdd2ba9a7861081d8550a3fa Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 2 Feb 2021 19:56:57 -0800 Subject: [PATCH 08/13] flake8 --- .circleci/unittest/linux/scripts/setup_env.sh | 4 ++-- .circleci/unittest/windows/scripts/setup_env.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh index f25d78a447..eb075b1eb1 100755 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -45,6 +45,6 @@ fi # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en_core_web_sd +python -m spacy download en_core_web_sm printf "* Downloading SpaCy German models\n" -python -m spacy download de_core_news_sd +python -m spacy download de_core_news_sm diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh index 8b4284c3ab..ea99130c5e 100644 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ b/.circleci/unittest/windows/scripts/setup_env.sh @@ -39,6 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en_core_web_sd +python -m spacy download en_core_web_sm printf "* Downloading SpaCy German models\n" -python -m spacy download de_core_news_sd +python -m spacy download de_core_news_sm From 701790e384d9b48392e0d8706db71216957d8b79 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 3 Feb 2021 07:23:01 -0800 Subject: [PATCH 09/13] checkpoint --- test/data/test_builtin_datasets.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 0688634af0..1822d82c0a 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -222,7 +222,9 @@ def test_multi30k(self): de_vocab[token] for token in 'Zwei Männer verpacken Donuts in Kunststofffolie'.split() ] - self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241]) + # This change is due to the BC breaking in spacy 3.0 + # self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241]) + self.assertEqual(de_tokens_ids, [20, 30, 18714, 4447, 6, 6239]) en_tokens_ids = [ en_vocab[token] for token in From 2c6a031e15be75d12cc9e171073d4ff1f5b12a44 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 3 Feb 2021 10:16:36 -0800 Subject: [PATCH 10/13] checkpoint --- torchtext/data/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index f74a9326fd..c290ad2e23 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -72,7 +72,7 @@ def _basic_english_normalize(line): return line.split() -def get_tokenizer(tokenizer, language='en_core_web_sm'): +def get_tokenizer(tokenizer, language='en'): r""" Generate tokenizer function for a string sentence. @@ -101,6 +101,8 @@ def get_tokenizer(tokenizer, language='en_core_web_sm'): return _split_tokenizer if tokenizer == "basic_english": + if language != 'en': + raise ValueError("Basic normalization is only available for Enlish(en)") return _basic_english_normalize # simply return if a function is passed @@ -108,6 +110,8 @@ def get_tokenizer(tokenizer, language='en_core_web_sm'): return tokenizer if tokenizer == "spacy": + if language == 'en': + raise RuntimeError("The en package has been deprecated in Spacy 3.0 release. Please switch to en_core_web_sm.") try: import spacy spacy = spacy.load(language) From 81c12c47391cf652c217a6342b33265bdf82c8d4 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 3 Feb 2021 13:37:26 -0800 Subject: [PATCH 11/13] checkpoint --- test/test_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_build.py b/test/test_build.py index d61e844280..2e29392562 100644 --- a/test/test_build.py +++ b/test/test_build.py @@ -107,7 +107,7 @@ class TestDataUtils(TorchtextTestCase): def test_get_tokenizer_spacy(self): # Test SpaCy option, and verify it properly handles punctuation. - assert torchtext.data.get_tokenizer("spacy")(str(self.TEST_STR)) == [ + assert torchtext.data.get_tokenizer("spacy", language='en_core_web_sm')(str(self.TEST_STR)) == [ "A", "string", ",", "particularly", "one", "with", "slightly", "complex", "punctuation", "."] From b5e20c1ada1d9b6c4faaeddc2104bb66adb9bcb2 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 3 Feb 2021 16:07:28 -0800 Subject: [PATCH 12/13] checkpoint --- torchtext/data/field.py | 4 ++-- torchtext/data/utils.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/torchtext/data/field.py b/torchtext/data/field.py index f1a897d047..95be1b85f2 100644 --- a/torchtext/data/field.py +++ b/torchtext/data/field.py @@ -143,7 +143,7 @@ class Field(RawField): def __init__(self, sequential=True, use_vocab=True, init_token=None, eos_token=None, fix_length=None, dtype=torch.long, preprocessing=None, postprocessing=None, lower=False, - tokenize=None, tokenizer_language='en_core_web_sm', include_lengths=False, + tokenize=None, tokenizer_language='en', include_lengths=False, batch_first=False, pad_token="", unk_token="", pad_first=False, truncate_first=False, stop_words=None, is_target=False): @@ -492,7 +492,7 @@ class NestedField(Field): def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None, fix_length=None, dtype=torch.long, preprocessing=None, - postprocessing=None, tokenize=None, tokenizer_language='en_core_web_sm', + postprocessing=None, tokenize=None, tokenizer_language='en', include_lengths=False, pad_token='', pad_first=False, truncate_first=False): warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index c290ad2e23..86ab8b4cdf 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -2,7 +2,7 @@ from contextlib import contextmanager from copy import deepcopy import re - +import warnings from functools import partial @@ -111,7 +111,11 @@ def get_tokenizer(tokenizer, language='en'): if tokenizer == "spacy": if language == 'en': - raise RuntimeError("The en package has been deprecated in Spacy 3.0 release. Please switch to en_core_web_sm.") + warnings.warn("The en package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to en_core_web_sm.", UserWarning) + language = 'en_core_web_sm' + elif language == 'de': + warnings.warn("The de package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to de_core_news_sm.", UserWarning) + language = 'de_core_news_sm' try: import spacy spacy = spacy.load(language) From 5528794d6ff60e7a0e7ae169b1c9bcfe8eae9a75 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 4 Feb 2021 13:31:23 -0800 Subject: [PATCH 13/13] checkpoint --- torchtext/data/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 28cda46d2c..240a3799ff 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -2,7 +2,6 @@ from contextlib import contextmanager from copy import deepcopy import re -import warnings from functools import partial @@ -110,12 +109,6 @@ def get_tokenizer(tokenizer, language='en'): return tokenizer if tokenizer == "spacy": - if language == 'en': - warnings.warn("The en package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to en_core_web_sm.", UserWarning) - language = 'en_core_web_sm' - elif language == 'de': - warnings.warn("The de package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to de_core_news_sm.", UserWarning) - language = 'de_core_news_sm' try: import spacy try: