From cc4d12a0de70667a001fc5d7814528365430800c Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 09:04:20 -0800
Subject: [PATCH 01/13] switch spacy shortcut to en_core_web_sm

---
 torchtext/data/utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
index 045c2646fb..f74a9326fd 100644
--- a/torchtext/data/utils.py
+++ b/torchtext/data/utils.py
@@ -72,7 +72,7 @@ def _basic_english_normalize(line):
     return line.split()
 
 
-def get_tokenizer(tokenizer, language='en'):
+def get_tokenizer(tokenizer, language='en_core_web_sm'):
     r"""
     Generate tokenizer function for a string sentence.
 
@@ -101,8 +101,6 @@ def get_tokenizer(tokenizer, language='en'):
         return _split_tokenizer
 
     if tokenizer == "basic_english":
-        if language != 'en':
-            raise ValueError("Basic normalization is only available for Enlish(en)")
         return _basic_english_normalize
 
     # simply return if a function is passed

From c365fe9388e94a995f6c2c26ddfff583172da063 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 09:26:41 -0800
Subject: [PATCH 02/13] checkpoint

---
 test/translation.py     | 4 ++--
 torchtext/data/field.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/translation.py b/test/translation.py
index eb3c47a349..89a6f9017f 100644
--- a/test/translation.py
+++ b/test/translation.py
@@ -4,8 +4,8 @@
 import re
 import spacy
 
-spacy_de = spacy.load('de')
-spacy_en = spacy.load('en')
+spacy_de = spacy.load('de_core_news_sm')
+spacy_en = spacy.load('en_core_web_sm')
 
 url = re.compile('(<url>.*</url>)')
 
diff --git a/torchtext/data/field.py b/torchtext/data/field.py
index 95be1b85f2..ff3a501b3e 100644
--- a/torchtext/data/field.py
+++ b/torchtext/data/field.py
@@ -492,7 +492,7 @@ class NestedField(Field):
 
     def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None,
                  fix_length=None, dtype=torch.long, preprocessing=None,
-                 postprocessing=None, tokenize=None, tokenizer_language='en',
+                 postprocessing=None, tokenize=None, tokenizer_language='en_core_web_sm',
                  include_lengths=False, pad_token='<pad>',
                  pad_first=False, truncate_first=False):
         warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)

From 1d5d354d325ae180904330dc8d7d951d5b8f89fd Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 09:35:54 -0800
Subject: [PATCH 03/13] update circleci

---
 .circleci/unittest/linux/scripts/setup_env.sh   | 4 +++-
 .circleci/unittest/windows/scripts/setup_env.sh | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index d4f5457906..02ed10d938 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -45,4 +45,6 @@ fi
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en
+python -m spacy download en_core_web_md 
+printf "* Downloading SpaCy German models\n"
+python -m spacy download de_core_news_md 
diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh
index cd44b436dc..eb5404f261 100644
--- a/.circleci/unittest/windows/scripts/setup_env.sh
+++ b/.circleci/unittest/windows/scripts/setup_env.sh
@@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en
+python -m spacy download en_core_web_md 
+printf "* Downloading SpaCy German models\n"
+python -m spacy download de_core_news_md 

From ba4dcb87f727f99223e1ac326591da7f34726128 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 18:54:52 -0800
Subject: [PATCH 04/13] checkpoint

---
 torchtext/data/field.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtext/data/field.py b/torchtext/data/field.py
index ff3a501b3e..f1a897d047 100644
--- a/torchtext/data/field.py
+++ b/torchtext/data/field.py
@@ -143,7 +143,7 @@ class Field(RawField):
     def __init__(self, sequential=True, use_vocab=True, init_token=None,
                  eos_token=None, fix_length=None, dtype=torch.long,
                  preprocessing=None, postprocessing=None, lower=False,
-                 tokenize=None, tokenizer_language='en', include_lengths=False,
+                 tokenize=None, tokenizer_language='en_core_web_sm', include_lengths=False,
                  batch_first=False, pad_token="<pad>", unk_token="<unk>",
                  pad_first=False, truncate_first=False, stop_words=None,
                  is_target=False):

From 846f0e5a385df97b81f7c3995952bc69a8d93d1a Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 19:01:26 -0800
Subject: [PATCH 05/13] update circleci environment.yml

---
 .circleci/unittest/linux/scripts/environment.yml   | 4 ++--
 .circleci/unittest/windows/scripts/environment.yml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
index b60f219e16..282877c207 100644
--- a/.circleci/unittest/linux/scripts/environment.yml
+++ b/.circleci/unittest/linux/scripts/environment.yml
@@ -17,5 +17,5 @@ dependencies:
     - sphinx
     - sphinx-rtd-theme
     - tqdm
-    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
-    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
+    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
+    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0
diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml
index 28790d2585..2d7c790b91 100644
--- a/.circleci/unittest/windows/scripts/environment.yml
+++ b/.circleci/unittest/windows/scripts/environment.yml
@@ -19,5 +19,5 @@ dependencies:
     - tqdm
     - certifi
     - future
-    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
-    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
+    - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
+    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0

From 1158243684fd368a8877ade65053a98ee669e8ff Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 19:30:52 -0800
Subject: [PATCH 06/13] checkpoint

---
 .circleci/unittest/linux/scripts/setup_env.sh   | 4 ++--
 .circleci/unittest/windows/scripts/setup_env.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index 02ed10d938..f25d78a447 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -45,6 +45,6 @@ fi
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en_core_web_md 
+python -m spacy download en_core_web_sd 
 printf "* Downloading SpaCy German models\n"
-python -m spacy download de_core_news_md 
+python -m spacy download de_core_news_sd 
diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh
index eb5404f261..8b4284c3ab 100644
--- a/.circleci/unittest/windows/scripts/setup_env.sh
+++ b/.circleci/unittest/windows/scripts/setup_env.sh
@@ -39,6 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en_core_web_md 
+python -m spacy download en_core_web_sd 
 printf "* Downloading SpaCy German models\n"
-python -m spacy download de_core_news_md 
+python -m spacy download de_core_news_sd 

From eab0524d972e2a584bb4c3c259c1e593cf72d546 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 19:51:07 -0800
Subject: [PATCH 07/13] update expected Multi30k token IDs for the BC-breaking spaCy 3.0 release

---
 test/data/test_builtin_datasets.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index fcef4f1507..0688634af0 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -200,14 +200,21 @@ def test_multi30k(self):
         from torchtext.experimental.datasets import Multi30k
         # smoke test to ensure multi30k works properly
         train_dataset, valid_dataset, test_dataset = Multi30k()
+
+        # Expected token IDs changed because spaCy 3.0 is not backward compatible.
         self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
-                               ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
                                 [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
+
         self._helper_test_func(len(valid_dataset), 1014, valid_dataset[30],
                                ([4, 179, 26, 85, 1005, 57, 19, 154, 3, 2],
                                 [5, 24, 32, 81, 47, 1348, 6, 2, 119, 4, 3]))
+
+        # Expected token IDs changed because spaCy 3.0 is not backward compatible.
         self._helper_test_func(len(test_dataset), 1000, test_dataset[40],
-                               ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
+                               # ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
+                               ([4, 26, 6, 12, 3913, 1537, 21, 64, 3, 2],
                                 [5, 32, 20, 2, 747, 345, 1915, 6, 46, 4, 3]))
 
         de_vocab, en_vocab = train_dataset.get_vocab()
@@ -234,8 +241,11 @@ def test_multi30k(self):
                                          'A group of men are loading cotton onto a truck\n']))
         del train_iter, valid_iter
         train_dataset, = Multi30k(data_select=('train'))
+
+        # Expected token IDs changed because spaCy 3.0 is not backward compatible.
         self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
-                               ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
+                               ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
                                 [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
 
         datafile = os.path.join(self.project_root, ".data", "train*")

From a4080430b2059b87bdd2ba9a7861081d8550a3fa Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Tue, 2 Feb 2021 19:56:57 -0800
Subject: [PATCH 08/13] fix spaCy model names in circleci setup scripts

---
 .circleci/unittest/linux/scripts/setup_env.sh   | 4 ++--
 .circleci/unittest/windows/scripts/setup_env.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh
index f25d78a447..eb075b1eb1 100755
--- a/.circleci/unittest/linux/scripts/setup_env.sh
+++ b/.circleci/unittest/linux/scripts/setup_env.sh
@@ -45,6 +45,6 @@ fi
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en_core_web_sd 
+python -m spacy download en_core_web_sm
 printf "* Downloading SpaCy German models\n"
-python -m spacy download de_core_news_sd 
+python -m spacy download de_core_news_sm 
diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh
index 8b4284c3ab..ea99130c5e 100644
--- a/.circleci/unittest/windows/scripts/setup_env.sh
+++ b/.circleci/unittest/windows/scripts/setup_env.sh
@@ -39,6 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
 
 # 4. Download
 printf "* Downloading SpaCy English models\n"
-python -m spacy download en_core_web_sd 
+python -m spacy download en_core_web_sm
 printf "* Downloading SpaCy German models\n"
-python -m spacy download de_core_news_sd 
+python -m spacy download de_core_news_sm 

From 701790e384d9b48392e0d8706db71216957d8b79 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Wed, 3 Feb 2021 07:23:01 -0800
Subject: [PATCH 09/13] checkpoint

---
 test/data/test_builtin_datasets.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index 0688634af0..1822d82c0a 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -222,7 +222,9 @@ def test_multi30k(self):
             de_vocab[token] for token in
             'Zwei Männer verpacken Donuts in Kunststofffolie'.split()
         ]
-        self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
+        # Expected token IDs changed because spaCy 3.0 is not backward compatible.
+        # self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
+        self.assertEqual(de_tokens_ids, [20, 30, 18714, 4447, 6, 6239])
 
         en_tokens_ids = [
             en_vocab[token] for token in

From 2c6a031e15be75d12cc9e171073d4ff1f5b12a44 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Wed, 3 Feb 2021 10:16:36 -0800
Subject: [PATCH 10/13] checkpoint

---
 torchtext/data/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
index f74a9326fd..c290ad2e23 100644
--- a/torchtext/data/utils.py
+++ b/torchtext/data/utils.py
@@ -72,7 +72,7 @@ def _basic_english_normalize(line):
     return line.split()
 
 
-def get_tokenizer(tokenizer, language='en_core_web_sm'):
+def get_tokenizer(tokenizer, language='en'):
     r"""
     Generate tokenizer function for a string sentence.
 
@@ -101,6 +101,8 @@ def get_tokenizer(tokenizer, language='en_core_web_sm'):
         return _split_tokenizer
 
     if tokenizer == "basic_english":
+        if language != 'en':
+            raise ValueError("Basic normalization is only available for Enlish(en)")
         return _basic_english_normalize
 
     # simply return if a function is passed
@@ -108,6 +110,8 @@ def get_tokenizer(tokenizer, language='en_core_web_sm'):
         return tokenizer
 
     if tokenizer == "spacy":
+        if language == 'en':
+            raise RuntimeError("The en package has been deprecated in Spacy 3.0 release. Please switch to en_core_web_sm.")
         try:
             import spacy
             spacy = spacy.load(language)

From 81c12c47391cf652c217a6342b33265bdf82c8d4 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Wed, 3 Feb 2021 13:37:26 -0800
Subject: [PATCH 11/13] checkpoint

---
 test/test_build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_build.py b/test/test_build.py
index d61e844280..2e29392562 100644
--- a/test/test_build.py
+++ b/test/test_build.py
@@ -107,7 +107,7 @@ class TestDataUtils(TorchtextTestCase):
 
     def test_get_tokenizer_spacy(self):
         # Test SpaCy option, and verify it properly handles punctuation.
-        assert torchtext.data.get_tokenizer("spacy")(str(self.TEST_STR)) == [
+        assert torchtext.data.get_tokenizer("spacy", language='en_core_web_sm')(str(self.TEST_STR)) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 

From b5e20c1ada1d9b6c4faaeddc2104bb66adb9bcb2 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Wed, 3 Feb 2021 16:07:28 -0800
Subject: [PATCH 12/13] checkpoint

---
 torchtext/data/field.py | 4 ++--
 torchtext/data/utils.py | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/torchtext/data/field.py b/torchtext/data/field.py
index f1a897d047..95be1b85f2 100644
--- a/torchtext/data/field.py
+++ b/torchtext/data/field.py
@@ -143,7 +143,7 @@ class Field(RawField):
     def __init__(self, sequential=True, use_vocab=True, init_token=None,
                  eos_token=None, fix_length=None, dtype=torch.long,
                  preprocessing=None, postprocessing=None, lower=False,
-                 tokenize=None, tokenizer_language='en_core_web_sm', include_lengths=False,
+                 tokenize=None, tokenizer_language='en', include_lengths=False,
                  batch_first=False, pad_token="<pad>", unk_token="<unk>",
                  pad_first=False, truncate_first=False, stop_words=None,
                  is_target=False):
@@ -492,7 +492,7 @@ class NestedField(Field):
 
     def __init__(self, nesting_field, use_vocab=True, init_token=None, eos_token=None,
                  fix_length=None, dtype=torch.long, preprocessing=None,
-                 postprocessing=None, tokenize=None, tokenizer_language='en_core_web_sm',
+                 postprocessing=None, tokenize=None, tokenizer_language='en',
                  include_lengths=False, pad_token='<pad>',
                  pad_first=False, truncate_first=False):
         warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)
diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
index c290ad2e23..86ab8b4cdf 100644
--- a/torchtext/data/utils.py
+++ b/torchtext/data/utils.py
@@ -2,7 +2,7 @@
 from contextlib import contextmanager
 from copy import deepcopy
 import re
-
+import warnings
 from functools import partial
 
 
@@ -111,7 +111,11 @@ def get_tokenizer(tokenizer, language='en'):
 
     if tokenizer == "spacy":
         if language == 'en':
-            raise RuntimeError("The en package has been deprecated in Spacy 3.0 release. Please switch to en_core_web_sm.")
+            warnings.warn("The en package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to en_core_web_sm.", UserWarning)
+            language = 'en_core_web_sm'
+        elif language == 'de':
+            warnings.warn("The de package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to de_core_news_sm.", UserWarning)
+            language = 'de_core_news_sm'
         try:
             import spacy
             spacy = spacy.load(language)

From 5528794d6ff60e7a0e7ae169b1c9bcfe8eae9a75 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang <zhangguanheng@devfair0197.h2.fair>
Date: Thu, 4 Feb 2021 13:31:23 -0800
Subject: [PATCH 13/13] checkpoint

---
 torchtext/data/utils.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
index 28cda46d2c..240a3799ff 100644
--- a/torchtext/data/utils.py
+++ b/torchtext/data/utils.py
@@ -2,7 +2,6 @@
 from contextlib import contextmanager
 from copy import deepcopy
 import re
-import warnings
 from functools import partial
 
 
@@ -110,12 +109,6 @@ def get_tokenizer(tokenizer, language='en'):
         return tokenizer
 
     if tokenizer == "spacy":
-        if language == 'en':
-            warnings.warn("The en package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to en_core_web_sm.", UserWarning)
-            language = 'en_core_web_sm'
-        elif language == 'de':
-            warnings.warn("The de package has been deprecated in Spacy 3.0 release. Please switch the language argument in get_tokenizer to de_core_news_sm.", UserWarning)
-            language = 'de_core_news_sm'
         try:
             import spacy
             try: