pytorch · zhangguanheng66 · Feb 5, 2021 · Feb 2, 2021 · Feb 2, 2021 · Feb 2, 2021
diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
@@ -17,5 +17,5 @@ dependencies:
     - sphinx
     - sphinx-rtd-theme
     - tqdm
-    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
-    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
+    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
+    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0
diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -45,4 +45,6 @@ fi
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en
+python -m spacy download en_core_web_sm
+printf "* Downloading SpaCy German models\n"
+python -m spacy download de_core_news_sm 
diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml
@@ -19,5 +19,5 @@ dependencies:
     - tqdm
     - certifi
     - future
-    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
-    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
+    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
+    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0
diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh
@@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en
+python -m spacy download en_core_web_sm
+printf "* Downloading SpaCy German models\n"
+python -m spacy download de_core_news_sm 
diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
@@ -206,22 +206,31 @@ def test_multi30k(self):
         from torchtext.experimental.datasets import Multi30k
         # smoke test to ensure multi30k works properly
         train_dataset, valid_dataset, test_dataset = Multi30k()
+
+        # Expected values updated for backward-incompatible changes in spaCy 3.0
         self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
-                               ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
                                 [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
+
         self._helper_test_func(len(valid_dataset), 1014, valid_dataset[30],
                                ([4, 179, 26, 85, 1005, 57, 19, 154, 3, 2],
                                 [5, 24, 32, 81, 47, 1348, 6, 2, 119, 4, 3]))
+
+        # Expected values updated for backward-incompatible changes in spaCy 3.0
         self._helper_test_func(len(test_dataset), 1000, test_dataset[40],
-                               ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
+                               # ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
+                               ([4, 26, 6, 12, 3913, 1537, 21, 64, 3, 2],
                                 [5, 32, 20, 2, 747, 345, 1915, 6, 46, 4, 3]))
 
         de_vocab, en_vocab = train_dataset.get_vocab()
         de_tokens_ids = [
             de_vocab[token] for token in
             'Zwei Männer verpacken Donuts in Kunststofffolie'.split()
         ]
-        self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
+        # Expected values updated for backward-incompatible changes in spaCy 3.0
+        # self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
+        self.assertEqual(de_tokens_ids, [20, 30, 18714, 4447, 6, 6239])
 
         en_tokens_ids = [
             en_vocab[token] for token in
@@ -240,8 +249,11 @@ def test_multi30k(self):
                                          'A group of men are loading cotton onto a truck\n']))
         del train_iter, valid_iter
         train_dataset, = Multi30k(data_select=('train'))
+
+        # Expected values updated for backward-incompatible changes in spaCy 3.0
         self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
-                               ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
                                 [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
 
         datafile = os.path.join(self.project_root, ".data", "train*")

diff --git a/test/test_build.py b/test/test_build.py
@@ -107,7 +107,7 @@ class TestDataUtils(TorchtextTestCase):
 
     def test_get_tokenizer_spacy(self):
         # Test SpaCy option, and verify it properly handles punctuation.
-        assert torchtext.data.get_tokenizer("spacy")(str(self.TEST_STR)) == [
+        assert torchtext.data.get_tokenizer("spacy", language='en_core_web_sm')(str(self.TEST_STR)) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 

diff --git a/test/translation.py b/test/translation.py
@@ -4,8 +4,8 @@
 import re
 import spacy
 
-spacy_de = spacy.load('de')
-spacy_en = spacy.load('en')
+spacy_de = spacy.load('de_core_news_sm')
+spacy_en = spacy.load('en_core_web_sm')
 
 url = re.compile('(<url>.*</url>)')
 

diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
@@ -2,7 +2,6 @@
 from contextlib import contextmanager
 from copy import deepcopy
 import re
-
 from functools import partial