diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py
index 513132233c..fd9062a2fb 100644
--- a/torchtext/datasets/conll2000chunking.py
+++ b/torchtext/datasets/conll2000chunking.py
@@ -45,20 +45,25 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]):
     url_dp = IterableWrapper([URL[split]])
 
     # Cache and check HTTP response
-    cache_dp = url_dp.on_disk_cache(
-        filepath_fn=lambda x: os.path.join(root, "conll2000chunking", os.path.basename(URL[split])),
-        hash_dict={os.path.join(root, "conll2000chunking", os.path.basename(URL[split])): MD5[split]},
-        hash_type="md5"
+    cache_compressed_dp = url_dp.on_disk_cache(
+        filepath_fn=lambda x: os.path.join(root, os.path.basename(URL[split])),
+        hash_dict={os.path.join(root, os.path.basename(URL[split])): MD5[split]},
+        hash_type="md5",
+    )
+    cache_compressed_dp = HttpReader(cache_compressed_dp).end_caching(
+        mode="wb", same_filepath_fn=True
     )
-    cache_dp = HttpReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True)
-    cache_dp = FileOpener(cache_dp, mode="b")
 
     # Cache and check the gzip extraction for relevant split
-    cache_dp = cache_dp.on_disk_cache(
-        filepath_fn=lambda x: os.path.join(root, "conll2000chunking", _EXTRACTED_FILES[split])
+    cache_decompressed_dp = cache_compressed_dp.on_disk_cache(
+        filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split])
+    )
+    cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").extract(
+        file_type="gzip"
+    )
+    cache_decompressed_dp = cache_decompressed_dp.end_caching(
+        mode="wb", same_filepath_fn=True
     )
-    cache_dp = cache_dp.extract(file_type="gzip").filter(lambda x: _EXTRACTED_FILES[split] in x[0])
-    cache_dp = cache_dp.end_caching(mode="wb")
-    cache_dp = FileOpener(cache_dp, mode="b")
-    return cache_dp.readlines(decode=True).read_iob(sep=" ")
+    data_dp = FileOpener(cache_decompressed_dp, mode="b")
+    return data_dp.readlines(decode=True).read_iob(sep=" ")
 
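
A minimal smoke test for the refactored datapipe chain (my own sketch, not part of the patch; it assumes torchdata is installed and that read_iob yields one sample per sentence as the three IOB columns: tokens, POS tags, chunk tags):

# Hypothetical usage example; not part of this diff.
from torchtext.datasets import CoNLL2000Chunking

# First pass downloads the .gz into root, verifies its MD5 via on_disk_cache,
# then caches the extracted file; subsequent passes read from the caches.
train_dp = CoNLL2000Chunking(root=".data", split="train")

for tokens, pos_tags, chunk_tags in train_dp:
    print(tokens[:3], pos_tags[:3], chunk_tags[:3])
    break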