pytorch · Nayef211 · Feb 8, 2022 · Feb 7, 2022 · Feb 8, 2022
diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py
@@ -45,20 +45,25 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]):
     url_dp = IterableWrapper([URL[split]])
 
     # Cache and check HTTP response
-    cache_dp = url_dp.on_disk_cache(
-        filepath_fn=lambda x: os.path.join(root, "conll2000chunking", os.path.basename(URL[split])),
-        hash_dict={os.path.join(root, "conll2000chunking", os.path.basename(URL[split])): MD5[split]},
-        hash_type="md5"
+    cache_compressed_dp = url_dp.on_disk_cache(
+        filepath_fn=lambda x: os.path.join(root, os.path.basename(URL[split])),
+        hash_dict={os.path.join(root, os.path.basename(URL[split])): MD5[split]},
+        hash_type="md5",
+    )
+    cache_compressed_dp = HttpReader(cache_compressed_dp).end_caching(
+        mode="wb", same_filepath_fn=True
     )
-    cache_dp = HttpReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True)
-    cache_dp = FileOpener(cache_dp, mode="b")
 
     # Cache and check the gzip extraction for relevant split
-    cache_dp = cache_dp.on_disk_cache(
-        filepath_fn=lambda x: os.path.join(root, "conll2000chunking", _EXTRACTED_FILES[split])
+    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(
+        filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split])
+    )
+    cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").extract(
+        file_type="gzip"
+    )
+    cache_decompressed_dp = cache_decompressed_dp.end_caching(
+        mode="wb", same_filepath_fn=True
     )
-    cache_dp = cache_dp.extract(file_type="gzip").filter(lambda x: _EXTRACTED_FILES[split] in x[0])
-    cache_dp = cache_dp.end_caching(mode="wb")
 
-    cache_dp = FileOpener(cache_dp, mode="b")
-    return cache_dp.readlines(decode=True).read_iob(sep=" ")
+    data_dp = FileOpener(cache_decompressed_dp, mode="b")
+    return data_dp.readlines(decode=True).read_iob(sep=" ")