From bcea78ee41a4bbe4e4e39abc517a203c43813f4a Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Mon, 23 Aug 2021 22:19:42 -0400 Subject: [PATCH 1/2] removing dependence on iopath --- .../unittest/linux/scripts/environment.yml | 1 - .../unittest/windows/scripts/environment.yml | 2 - docs/requirements.txt | 1 - packaging/pkg_helpers.bash | 1 - packaging/torchtext/meta.yaml | 1 - requirements.txt | 1 - torchtext/_download_hooks.py | 129 ++---------------- 7 files changed, 10 insertions(+), 126 deletions(-) diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index e616d8f107..363e3085e1 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -8,7 +8,6 @@ dependencies: - dataclasses - nltk - requests - - iopath - revtok - pytest - pytest-cov diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml index 75e6d25c13..dc9889b588 100644 --- a/.circleci/unittest/windows/scripts/environment.yml +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -3,13 +3,11 @@ channels: dependencies: - flake8>=3.7.9 - codecov - - pywin32 - pip - pip: - dataclasses - nltk - requests - - iopath - revtok - pytest - pytest-cov diff --git a/docs/requirements.txt b/docs/requirements.txt index 65538b7b8a..560a2b3600 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,2 @@ sphinx==2.4.4 -iopath -e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash index e0c81205bc..8f1e24f1da 100644 --- a/packaging/pkg_helpers.bash +++ b/packaging/pkg_helpers.bash @@ -180,7 +180,6 @@ setup_pip_pytorch_version() { # You MUST have populated PYTORCH_VERSION_SUFFIX before hand. setup_conda_pytorch_constraint() { CONDA_CHANNEL_FLAGS=${CONDA_CHANNEL_FLAGS:-} - CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c iopath" if [[ -z "$PYTORCH_VERSION" ]]; then export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-nightly" export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | python -c "import sys, json, re; print(re.sub(r'\\+.*$', '', json.load(sys.stdin)['pytorch'][-1]['version']))")" diff --git a/packaging/torchtext/meta.yaml b/packaging/torchtext/meta.yaml index 61b8f50adb..36008e5cf5 100644 --- a/packaging/torchtext/meta.yaml +++ b/packaging/torchtext/meta.yaml @@ -20,7 +20,6 @@ requirements: run: - python - requests - - iopath - tqdm {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} diff --git a/requirements.txt b/requirements.txt index ceb21c99cf..fd100b8eb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ tqdm # Downloading data and other files requests -iopath # Optional NLP tools nltk diff --git a/torchtext/_download_hooks.py b/torchtext/_download_hooks.py index 269d251ec3..c00a94e74e 100644 --- a/torchtext/_download_hooks.py +++ b/torchtext/_download_hooks.py @@ -6,13 +6,6 @@ import re import shutil from tqdm import tqdm -from iopath.common.file_io import ( - PathHandler, - PathManager, - get_cache_dir, - file_lock, - HTTPURLHandler, -) def _stream_response(r, chunk_size=16 * 1024): @@ -54,118 +47,16 @@ def _get_response_from_google_drive(url): return response, filename -class GoogleDrivePathHandler(PathHandler): - """ - Download URLs and cache them to disk. - """ - - MAX_FILENAME_LEN = 250 - - def __init__(self) -> None: - self.cache_map: Dict[str, str] = {} - - def _get_supported_prefixes(self) -> List[str]: - return ["https://drive.google.com"] - - def _get_local_path( - self, - path: str, - force: bool = False, - cache_dir: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - This implementation downloads the remote resource from google drive and caches it locally. - The resource will only be downloaded if not previously requested. - """ - self._check_kwargs(kwargs) - if ( - force - or path not in self.cache_map - or not os.path.exists(self.cache_map[path]) - ): - logger = logging.getLogger(__name__) - dirname = get_cache_dir(cache_dir) - - response, filename = _get_response_from_google_drive(path) - if len(filename) > self.MAX_FILENAME_LEN: - filename = filename[:100] + "_" + uuid.uuid4().hex - - cached = os.path.join(dirname, filename) - with file_lock(cached): - if not os.path.isfile(cached): - logger.info("Downloading {} ...".format(path)) - with open(cached, 'wb') as f: - for data in _stream_response(response): - f.write(data) - logger.info("URL {} cached in {}".format(path, cached)) - self.cache_map[path] = cached - return self.cache_map[path] - - def _open( - self, path: str, mode: str = "r", buffering: int = -1, **kwargs: Any - ) -> Union[IO[str], IO[bytes]]: - """ - Open a google drive path. The resource is first downloaded and cached - locally. - Args: - path (str): A URI supported by this PathHandler - mode (str): Specifies the mode in which the file is opened. It defaults - to 'r'. - buffering (int): Not used for this PathHandler. - Returns: - file: a file-like object. - """ - self._check_kwargs(kwargs) - assert mode in ("r", "rb"), "{} does not support open with {} mode".format( - self.__class__.__name__, mode - ) - assert ( - buffering == -1 - ), f"{self.__class__.__name__} does not support the `buffering` argument" - local_path = self._get_local_path(path, force=False) - return open(local_path, mode) - - -class CombinedInternalPathhandler(PathHandler): - def __init__(self): - path_manager = PathManager() - path_manager.register_handler(HTTPURLHandler()) - path_manager.register_handler(GoogleDrivePathHandler()) - self.path_manager = path_manager - - def _get_supported_prefixes(self) -> List[str]: - return ["https://", "http://"] - - def _get_local_path( - self, - path: str, - force: bool = False, - cache_dir: Optional[str] = None, - **kwargs: Any, - ) -> str: - - destination = kwargs["destination"] - - local_path = self.path_manager.get_local_path(path, force) - - shutil.move(local_path, destination) - - return destination +class DownloadManager: + def get_local_path(self, url, destination): + if 'drive.google.com' not in url: + response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) + else: + response, filename = _get_response_from_google_drive(url) - def _open( - self, path: str, mode: str = "r", buffering: int = -1, **kwargs: Any - ) -> Union[IO[str], IO[bytes]]: - self._check_kwargs(kwargs) - assert mode in ("r", "rb"), "{} does not support open with {} mode".format( - self.__class__.__name__, mode - ) - assert ( - buffering == -1 - ), f"{self.__class__.__name__} does not support the `buffering` argument" - local_path = self._get_local_path(path, force=False) - return open(local_path, mode) + with open(destination, 'wb') as f: + for chunk in _stream_response(response): + f.write(chunk) -_DATASET_DOWNLOAD_MANAGER = PathManager() -_DATASET_DOWNLOAD_MANAGER.register_handler(CombinedInternalPathhandler()) +_DATASET_DOWNLOAD_MANAGER = DownloadManager() From a6efcc798249f514b2627e65a33b8d824de0aded Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Mon, 23 Aug 2021 22:23:56 -0400 Subject: [PATCH 2/2] fix flake issues --- torchtext/_download_hooks.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/torchtext/_download_hooks.py b/torchtext/_download_hooks.py index c00a94e74e..9a666d5e20 100644 --- a/torchtext/_download_hooks.py +++ b/torchtext/_download_hooks.py @@ -1,10 +1,5 @@ -from typing import List, Optional, Union, IO, Dict, Any import requests -import os -import logging -import uuid import re -import shutil from tqdm import tqdm