Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 52436c8

Browse files
authored
Resolve inconsistency in IMDB label output (#1914)
1 parent 5c48f4a commit 52436c8

File tree

2 files changed

+5
-3
lines changed

2 files changed

+5
-3
lines changed

test/torchtext_unittest/datasets/test_imdb.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,8 @@ def _get_mock_dataset(root_dir):
2929
for i in range(5):
3030
# all negative labels are read first before positive labels in the
3131
# IMDB dataset implementation
32-
label = "neg" if i < 2 else "pos"
33-
cur_dir = pos_dir if label == "pos" else neg_dir
32+
label = 1 if i < 2 else 2
33+
cur_dir = pos_dir if label == 2 else neg_dir
3434
txt_file = os.path.join(cur_dir, f"{i}{i}_{i}.txt")
3535
with open(txt_file, "w", encoding="utf-8") as f:
3636
rand_string = get_random_unicode(seed)

torchtext/datasets/imdb.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
"test": 25000,
2121
}
2222

23+
MAP_LABELS = {"neg": 1, "pos": 2}
24+
2325
_PATH = "aclImdb_v1.tar.gz"
2426

2527
DATASET_NAME = "IMDB"
@@ -50,7 +52,7 @@ def _cache_filepath_fn(root, decompressed_folder, split, x):
5052

5153

5254
def _modify_res(t):
53-
return Path(t[0]).parts[-1], t[1]
55+
return MAP_LABELS[Path(t[0]).parts[-1]], t[1]
5456

5557

5658
def filter_imdb_data(key, fname):

0 commit comments

Comments
 (0)