From a7043ced3ba841a043b7df48ffd2ef30294a75bc Mon Sep 17 00:00:00 2001 From: Elijah Rippeth Date: Wed, 19 Jan 2022 09:59:07 -0500 Subject: [PATCH 1/3] add double caching for yelp polarity to speed up extracted reading. --- torchtext/datasets/yelpreviewpolarity.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/torchtext/datasets/yelpreviewpolarity.py b/torchtext/datasets/yelpreviewpolarity.py index a536d6dd0f..3488b2686d 100644 --- a/torchtext/datasets/yelpreviewpolarity.py +++ b/torchtext/datasets/yelpreviewpolarity.py @@ -47,8 +47,20 @@ def YelpReviewPolarity(root: str, split: Union[Tuple[str], str]): cache_dp = GDriveReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True) cache_dp = FileOpener(cache_dp, mode="b") - extracted_files = cache_dp.read_from_tar() + def extracted_filepath_fn(_): + file_path = os.path.join(root, _EXTRACTED_FILES[split]) + dir_path = os.path.dirname(file_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + return file_path + + cache_dp = cache_dp.on_disk_cache( + filepath_fn=extracted_filepath_fn + ) + cache_dp = cache_dp.read_from_tar() - filter_extracted_files = extracted_files.filter(lambda x: _EXTRACTED_FILES[split] in x[0]) + cache_dp = cache_dp.filter(lambda x: _EXTRACTED_FILES[split] in x[0]) + cache_dp = cache_dp.end_caching(mode="wb", same_filepath_fn=True) + cache_dp = FileOpener(cache_dp, mode="b") - return filter_extracted_files.parse_csv().map(fn=lambda t: (int(t[0]), " ".join(t[1:]))) + return cache_dp.parse_csv().map(fn=lambda t: (int(t[0]), " ".join(t[1:]))) From 42fb20e442ef3536cd2ed0b9d054786cc136d6a9 Mon Sep 17 00:00:00 2001 From: Elijah Rippeth Date: Wed, 19 Jan 2022 14:40:49 -0500 Subject: [PATCH 2/3] rename dps for consistency and simplify filepath_fn --- torchtext/datasets/yelpreviewpolarity.py | 34 ++++++++++-------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/torchtext/datasets/yelpreviewpolarity.py b/torchtext/datasets/yelpreviewpolarity.py index 3488b2686d..c570622864 100644 --- a/torchtext/datasets/yelpreviewpolarity.py +++ b/torchtext/datasets/yelpreviewpolarity.py @@ -33,34 +33,28 @@ @_add_docstring_header(num_lines=NUM_LINES, num_classes=2) @_create_dataset_directory(dataset_name=DATASET_NAME) -@_wrap_split_argument(('train', 'test')) +@_wrap_split_argument(("train", "test")) def YelpReviewPolarity(root: str, split: Union[Tuple[str], str]): if not is_module_available("torchdata"): raise ModuleNotFoundError("Package `torchdata` not found. Please install following instructions at `https://github.com/pytorch/data`") url_dp = IterableWrapper([URL]) - cache_dp = url_dp.on_disk_cache( + cache_compressed_dp = url_dp.on_disk_cache( filepath_fn=lambda x: os.path.join(root, _PATH), - hash_dict={os.path.join(root, _PATH): MD5}, hash_type="md5" + hash_dict={os.path.join(root, _PATH): MD5}, + hash_type="md5" ) - cache_dp = GDriveReader(cache_dp).end_caching(mode="wb", same_filepath_fn=True) - cache_dp = FileOpener(cache_dp, mode="b") - - def extracted_filepath_fn(_): - file_path = os.path.join(root, _EXTRACTED_FILES[split]) - dir_path = os.path.dirname(file_path) - if not os.path.exists(dir_path): - os.makedirs(dir_path) - return file_path - - cache_dp = cache_dp.on_disk_cache( - filepath_fn=extracted_filepath_fn + cache_compressed_dp = GDriveReader(cache_compressed_dp).end_caching(mode="wb", same_filepath_fn=True) + cache_compressed_dp = FileOpener(cache_compressed_dp, mode="b") + + cache_decompressed_dp = cache_compressed_dp.on_disk_cache( + filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split]) ) - cache_dp = cache_dp.read_from_tar() + cache_decompressed_dp = cache_decompressed_dp.read_from_tar() - cache_dp = cache_dp.filter(lambda x: _EXTRACTED_FILES[split] in x[0]) - cache_dp = cache_dp.end_caching(mode="wb", same_filepath_fn=True) - cache_dp = FileOpener(cache_dp, mode="b") + cache_decompressed_dp = cache_decompressed_dp.filter(lambda x: _EXTRACTED_FILES[split] in x[0]) + cache_decompressed_dp = cache_decompressed_dp.end_caching(mode="wb", same_filepath_fn=True) + data_dp = FileOpener(cache_decompressed_dp, mode="b") - return cache_dp.parse_csv().map(fn=lambda t: (int(t[0]), " ".join(t[1:]))) + return data_dp.parse_csv().map(fn=lambda t: (int(t[0]), " ".join(t[1:]))) From 68feea622c7bd1009069b5160c00a6460450c412 Mon Sep 17 00:00:00 2001 From: Elijah Rippeth Date: Thu, 20 Jan 2022 12:08:09 -0500 Subject: [PATCH 3/3] add FileOpener within caching block for more consistency. --- torchtext/datasets/yelpreviewpolarity.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtext/datasets/yelpreviewpolarity.py b/torchtext/datasets/yelpreviewpolarity.py index c570622864..627aab1d2d 100644 --- a/torchtext/datasets/yelpreviewpolarity.py +++ b/torchtext/datasets/yelpreviewpolarity.py @@ -51,6 +51,8 @@ def YelpReviewPolarity(root: str, split: Union[Tuple[str], str]): cache_decompressed_dp = cache_compressed_dp.on_disk_cache( filepath_fn=lambda x: os.path.join(root, _EXTRACTED_FILES[split]) ) + cache_decompressed_dp = FileOpener(cache_decompressed_dp, mode="b") + cache_decompressed_dp = cache_decompressed_dp.read_from_tar() cache_decompressed_dp = cache_decompressed_dp.filter(lambda x: _EXTRACTED_FILES[split] in x[0])