Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 8168aba

Browse files
datumbox authored and facebook-github-bot committed
Fix broken CI tests due to spacy 3.0 release (#1138)
Reviewed By: zhangguanheng66
Differential Revision: D26368998
fbshipit-source-id: 84e883562a9a3d0fe47b54823b22f7b2cd82fca4
1 parent ab53f2f commit 8168aba

File tree

8 files changed: +29 -14 lines changed

8 files changed: +29 -14 lines changed

.circleci/unittest/linux/scripts/environment.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,5 +17,5 @@ dependencies:
1717
- sphinx
1818
- sphinx-rtd-theme
1919
- tqdm
20-
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
21-
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
20+
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
21+
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0

.circleci/unittest/linux/scripts/setup_env.sh

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,4 +45,6 @@ fi
4545

4646
# 4. Download
4747
printf "* Downloading SpaCy English models\n"
48-
python -m spacy download en
48+
python -m spacy download en_core_web_sm
49+
printf "* Downloading SpaCy German models\n"
50+
python -m spacy download de_core_news_sm

.circleci/unittest/windows/scripts/environment.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,5 +19,5 @@ dependencies:
1919
- tqdm
2020
- certifi
2121
- future
22-
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
23-
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
22+
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
23+
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0

.circleci/unittest/windows/scripts/setup_env.sh

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
3939

4040
# 4. Download
4141
printf "* Downloading SpaCy English models\n"
42-
python -m spacy download en
42+
python -m spacy download en_core_web_sm
43+
printf "* Downloading SpaCy German models\n"
44+
python -m spacy download de_core_news_sm

test/data/test_builtin_datasets.py

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -206,22 +206,31 @@ def test_multi30k(self):
206206
from torchtext.experimental.datasets import Multi30k
207207
# smoke test to ensure multi30k works properly
208208
train_dataset, valid_dataset, test_dataset = Multi30k()
209+
210+
# This change is due to the BC breaking in spacy 3.0
209211
self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
210-
([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
212+
# ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
213+
([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
211214
[5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
215+
212216
self._helper_test_func(len(valid_dataset), 1014, valid_dataset[30],
213217
([4, 179, 26, 85, 1005, 57, 19, 154, 3, 2],
214218
[5, 24, 32, 81, 47, 1348, 6, 2, 119, 4, 3]))
219+
220+
# This change is due to the BC breaking in spacy 3.0
215221
self._helper_test_func(len(test_dataset), 1000, test_dataset[40],
216-
([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
222+
# ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2],
223+
([4, 26, 6, 12, 3913, 1537, 21, 64, 3, 2],
217224
[5, 32, 20, 2, 747, 345, 1915, 6, 46, 4, 3]))
218225

219226
de_vocab, en_vocab = train_dataset.get_vocab()
220227
de_tokens_ids = [
221228
de_vocab[token] for token in
222229
'Zwei Männer verpacken Donuts in Kunststofffolie'.split()
223230
]
224-
self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
231+
# This change is due to the BC breaking in spacy 3.0
232+
# self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241])
233+
self.assertEqual(de_tokens_ids, [20, 30, 18714, 4447, 6, 6239])
225234

226235
en_tokens_ids = [
227236
en_vocab[token] for token in
@@ -240,8 +249,11 @@ def test_multi30k(self):
240249
'A group of men are loading cotton onto a truck\n']))
241250
del train_iter, valid_iter
242251
train_dataset, = Multi30k(data_select=('train'))
252+
253+
# This change is due to the BC breaking in spacy 3.0
243254
self._helper_test_func(len(train_dataset), 29000, train_dataset[20],
244-
([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
255+
# ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2],
256+
([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2],
245257
[5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3]))
246258

247259
datafile = os.path.join(self.project_root, ".data", "train*")

test/test_build.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ class TestDataUtils(TorchtextTestCase):
107107

108108
def test_get_tokenizer_spacy(self):
109109
# Test SpaCy option, and verify it properly handles punctuation.
110-
assert torchtext.data.get_tokenizer("spacy")(str(self.TEST_STR)) == [
110+
assert torchtext.data.get_tokenizer("spacy", language='en_core_web_sm')(str(self.TEST_STR)) == [
111111
"A", "string", ",", "particularly", "one", "with", "slightly",
112112
"complex", "punctuation", "."]
113113

test/translation.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@
44
import re
55
import spacy
66

7-
spacy_de = spacy.load('de')
8-
spacy_en = spacy.load('en')
7+
spacy_de = spacy.load('de_core_news_sm')
8+
spacy_en = spacy.load('en_core_web_sm')
99

1010
url = re.compile('(<url>.*</url>)')
1111

torchtext/data/utils.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
from contextlib import contextmanager
33
from copy import deepcopy
44
import re
5-
65
from functools import partial
76

87

0 commit comments

Comments
 (0)