From 8947465671162cf7ee5d4325ac7a843c2ae42f81 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Mon, 7 Feb 2022 13:37:09 -0800 Subject: [PATCH 1/2] Updating dataset to be consistent with other datasets --- torchtext/datasets/conll2000chunking.py | 31 +++++++++++++++---------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py index 513132233c..deff390b09 100644 --- a/torchtext/datasets/conll2000chunking.py +++ b/torchtext/datasets/conll2000chunking.py @@ -45,20 +45,27 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]): url_dp = IterableWrapper([URL[split]]) # Cache and check HTTP response - cache_dp = url_dp.on_disk_cache( - filepath_fn=lambda x: os.path.join(root, "conll2000chunking", os.path.basename(URL[split])), - hash_dict={os.path.join(root, "conll2000chunking", os.path.basename(URL[split])): MD5[split]}, - hash_type="md5" + cache_compressed_dp = url_dp.on_disk_cache( + filepath_fn=lambda x: os.path.join(root, os.path.basename(URL[split])), + hash_dict={os.path.join(root, os.path.basename(URL[split])): MD5[split]}, + hash_type="md5", + ) + cache_compressed_dp = HttpReader(cache_compressed_dp).end_caching( + mode="wb", same_filepath_fn=True ) - cache_dp = HttpReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True) - cache_dp = FileOpener(cache_dp, mode="b") # Cache and check the gzip extraction for relevant split - cache_dp = cache_dp.on_disk_cache( - filepath_fn=lambda x: os.path.join(root, "conll2000chunking", _EXTRACTED_FILES[split]) + cache_decompressed_dp = cache_compressed_dp.on_disk_cache( + filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split]) + ) + cache_decompressed_dp = ( + FileOpener(cache_decompressed_dp, mode="b") + .extract(file_type="gzip") + .filter(lambda x: _EXTRACTED_FILES[split] in x[0]) + ) + cache_decompressed_dp = cache_decompressed_dp.end_caching( + mode="wb", same_filepath_fn=True ) - cache_dp = cache_dp.extract(file_type="gzip").filter(lambda x: _EXTRACTED_FILES[split] in x[0]) - cache_dp = cache_dp.end_caching(mode="wb") - cache_dp = FileOpener(cache_dp, mode="b") - return cache_dp.readlines(decode=True).read_iob(sep=" ") + data_dp = FileOpener(cache_decompressed_dp, mode="b") + return data_dp.readlines(decode=True).read_iob(sep=" ") From 06d6ce9c0336ef51e74872eee2bdebc17a22bd58 Mon Sep 17 00:00:00 2001 From: nayef211 Date: Mon, 7 Feb 2022 22:12:16 -0800 Subject: [PATCH 2/2] Resolving PR comments --- torchtext/datasets/conll2000chunking.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py index deff390b09..fd9062a2fb 100644 --- a/torchtext/datasets/conll2000chunking.py +++ b/torchtext/datasets/conll2000chunking.py @@ -58,10 +58,8 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]): cache_decompressed_dp = cache_compressed_dp.on_disk_cache( filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split]) ) - cache_decompressed_dp = ( - FileOpener(cache_decompressed_dp, mode="b") - .extract(file_type="gzip") - .filter(lambda x: _EXTRACTED_FILES[split] in x[0]) + cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b").extract( + file_type="gzip" ) cache_decompressed_dp = cache_decompressed_dp.end_caching( mode="wb", same_filepath_fn=True