diff --git a/.drone.yml b/.drone.yml index bb4d8a74b28f5..9774ffaaaecc7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,7 +20,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 environment: CODECOV_TOKEN: @@ -32,6 +32,8 @@ steps: - pip --version - nvidia-smi - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir + # when the image defines a CUDA version, we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" + - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8 - python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 @@ -46,6 +48,7 @@ steps: trigger: branch: - master + - release/* event: include: - push diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index 1c1a4136ea557..3fa357ef062ca 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -53,7 +53,10 @@ different fields contributing! The first 5 core contributors will fit this profile. Thus if you overlap strongly with experiences and expertise as someone else on the team, you might have to wait until the next set of contributors are added. #### Summary: Requirements to apply -- Solve 10 Github issues. The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -- Do 10 PR reviews. The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. 
+The goal is to be in line with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -If you want to be considered, ping me on gitter and start [tracking your progress here](https://docs.google.com/spreadsheets/d/15D58gp8DvI0Z6qbbYVRuaWioiwzafcP58-UlbuO_CMU/edit?usp=sharing). +- Solve 10+ Github issues. +- Create 5+ meaningful PRs which solve a reported issue, e.g. a bug. +- Perform 10+ PR reviews from other contributors. + +If you want to be considered, ping me on [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A). diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b048baf1302f6..0316d766752ca 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,4 +5,11 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. -* @williamfalcon @borda @teddykoker @awaelchli @nateraw @justusschock @tchaton @SeanNaren @ananyahjha93 +* @williamfalcon @borda @teddykoker @awaelchli @nateraw @justusschock @tchaton @SeanNaren @ananyahjha93 + +# Metrics +/pytorch_lightning/metrics/* @teddykoker @ananyahjha93 @justusschock +/tests/metrics/* @teddykoker @ananyahjha93 @justusschock +/docs/source/metrics.rst @teddykoker @ananyahjha93 @justusschock + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6e6a6af863d27..78c89cdae7e05 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,6 +4,8 @@ Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. + +If we didn't discuss your PR in Github issues there's a high chance it will not be merged. --> Fixes # (issue) @@ -20,10 +22,14 @@ Fixes # (issue) ## PR review - - [ ] Is this pull request ready for review? 
(if not, please submit in draft mode) +Anyone in the community is free to review the PR once the tests have passed. +Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet list: -Anyone in the community is free to review the PR once the tests have passed. -If we didn't discuss your PR in Github issues there's a high chance it will not be merged. + - [ ] Is this pull request ready for review? (if not, please submit in draft mode) + - [ ] Check that all items from **Before submitting** are resolved + - [ ] Make sure the title is self explanatory and the description concisely explains the PR + - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified; _Bugfixes should be included in bug-fix release milestones (m.f.X) and features should be included in (m.X.b) releases._ + ## Did you have fun? Make sure you had fun coding 🙃 diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index c8816486f2688..ef410a9afd9a8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -4,9 +4,9 @@ name: CI build Docker # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: build-PL: @@ -24,7 +24,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build PL Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: python_version: [3.7] - xla_version: [1.6] # todo: , "nightly" + xla_version: [1.6, 
"nightly"] steps: - name: Checkout uses: actions/checkout@v2 @@ -49,13 +49,13 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} file: dockers/base-xla/Dockerfile push: false timeout-minutes: 50 @@ -66,8 +66,8 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.8 - # pytorch_version: 1.7 # todo + - python_version: 3.8 + pytorch_version: 1.7 - python_version: 3.7 pytorch_version: 1.6 - python_version: 3.6 @@ -86,14 +86,14 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 @@ -108,8 +108,11 @@ jobs: pytorch_version: 1.6 - python_version: 3.6 pytorch_version: 1.4 - #- python_version: 3.7 - # pytorch_version: 1.8 # todo + - python_version: 3.7 + pytorch_version: 1.7 + # TODO + # - python_version: 3.7 + # pytorch_version: 1.8 steps: - name: Checkout uses: actions/checkout@v2 @@ 
-126,7 +129,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -134,7 +137,7 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-conda/Dockerfile push: false timeout-minutes: 50 diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index aea8d6509db95..4d70beddf3f1b 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -3,9 +3,9 @@ name: Install pkg # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: @@ -20,7 +20,7 @@ jobs: python-version: [3.6, 3.8] steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 9b490dec76ee6..de88e94914292 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -3,9 +3,9 @@ name: CI base testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: 
[master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: doctest: @@ -72,7 +72,7 @@ jobs: coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --color=yes --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index f652cbb1a4b58..c86806785880b 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -3,9 +3,9 @@ name: PyTorch & Conda # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: conda: @@ -16,7 +16,7 @@ jobs: matrix: # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo + pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 @@ -46,7 +46,7 @@ jobs: shell: bash -l {0} - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 
d74a923693e0b..cd07c655094f3 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -3,9 +3,9 @@ name: CI complete testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: pytest: @@ -89,7 +89,7 @@ jobs: run: | # python -m pip install --upgrade --user pip pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade - pip install --requirement ./requirements/devel.txt --quiet --upgrade + pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade python --version pip --version pip list @@ -122,7 +122,7 @@ jobs: coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --color=yes --durations=0 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml index 3d58542c490f3..ffedd9728bb06 100644 --- a/.github/workflows/ci_test-tpu.yml +++ b/.github/workflows/ci_test-tpu.yml @@ -2,7 +2,7 @@ name: TPU tests on: push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x # TODO: temporal disable TPU testing until we find way how to pass credentials to forked PRs # pull_request: # branches: @@ -119,7 
+119,7 @@ jobs: # pycobertura show coverage.xml - name: Upload coverage results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: coverage-TPU path: coverage diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 598b7a5df9aa8..e549f5b8f1cfb 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -2,9 +2,9 @@ name: "Check Code Format" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: code-black: diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 0ba6f701f65d6..b8ca5d8723b39 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -3,7 +3,7 @@ name: Publish Docker Releases # https://github.com/docker/build-push-action on: push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x release: types: [created] @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index fa7580e0f9726..2f91a4f5d43c8 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -3,12 +3,12 @@ name: "Docs check" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: - check-docs: + sphinx-check: runs-on: ubuntu-20.04 steps: - uses: 
actions/checkout@v2 @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: 3.7 @@ -48,6 +48,7 @@ jobs: # python -m pip install --upgrade --user pip pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet pip install --requirement requirements/extra.txt + pip install --requirement requirements/loggers.txt pip install --requirement requirements/docs.txt python --version pip --version @@ -68,7 +69,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: 3.7 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index eb10c43936044..1395b7ede4b1d 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -29,7 +29,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -57,14 +57,14 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} - name: Publish XLA to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} - cache-to: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-to: type=inline file: dockers/base-xla/Dockerfile push: true tags: 
pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} @@ -76,7 +76,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 @@ -104,22 +104,22 @@ jobs: id: extend - name: Publish CUDA to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: type=inline file: dockers/base-cuda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 55 - name: Publish Conda to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -127,8 +127,8 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: type=inline file: 
dockers/base-conda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 7579fc0b199a4..354f799df20b1 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -3,9 +3,9 @@ name: PyPI Release # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x release: - types: [created] + types: [created, "release/*"] jobs: @@ -30,7 +30,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -39,7 +39,7 @@ jobs: - name: Publish distribution 📦 to PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.pypi_password }} diff --git a/.gitignore b/.gitignore index fff549a718794..946d5f0f4c2ca 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ timit_data/ .Python ide_layouts/ build/ +_build/ develop-eggs/ dist/ downloads/ diff --git a/.pyrightconfig.json b/.pyrightconfig.json index 3f00d9a3e4454..416b33f6ad6d2 100644 --- a/.pyrightconfig.json +++ b/.pyrightconfig.json @@ -30,6 +30,7 @@ "pytorch_lightning/trainer/training_tricks.py", "pytorch_lightning/trainer/batch_size_scaling.py", "pytorch_lightning/trainer/distrib_data_parallel.py", + "pytorch_lightning/trainer/properties.py", "pytorch_lightning/trainer/lr_scheduler_connector.py", 
"pytorch_lightning/trainer/training_loop_temp.py", "pytorch_lightning/trainer/connectors/checkpoint_connector.py", diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a534c6bfaf40..16daa24aa2ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,57 +9,123 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) -- Added plugins docs and DDPPlugin to customize ddp across all accelerators([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) -- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) +- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) -- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) + + +- Added optimizer hooks in callbacks ([#4379](https://github.com/PyTorchLightning/pytorch-lightning/pull/4379)) + + +- Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384)) + + +- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) + + +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) + + +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` 
([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) + + +- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) ### Changed +- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903)) -- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) +### Deprecated -- Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) +### Removed -- Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) -- Fixed santized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) +### Fixed + +- Fixed feature-lack in hpc load ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526)) + + +- Fixed metrics states being overridden in ddp mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) + + +- Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/PyTorchLightning/pytorch-lightning/pull/4347)) + +## [1.0.5] - 2020-11-03 + +### Added + +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) +- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) + +### Changed + +- W&B log in sync with `Trainer` step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) +- Hook `on_after_backward` is called only when `optimizer_step` is being called 
([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +- Moved `track_and_norm_grad` into `training loop` and called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +- Changed type checker with explicit cast of `ref_model` object ([#4457](https://github.com/PyTorchLightning/pytorch-lightning/pull/4457)) ### Deprecated +- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) -- Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +### Fixed +- Disable saving checkpoints if not trained ([#4372](https://github.com/PyTorchLightning/pytorch-lightning/pull/4372)) +- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) +- Disabled training when `limit_train_batches=0` ([#4371](https://github.com/PyTorchLightning/pytorch-lightning/pull/4371)) +- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) +- Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +- Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/PyTorchLightning/pytorch-lightning/pull/4428)) +- Fixed TorchScript trace method's data to device and docstring ([#4360](https://github.com/PyTorchLightning/pytorch-lightning/pull/4360)) +- Fixed CSV logger warning ([#4419](https://github.com/PyTorchLightning/pytorch-lightning/pull/4419)) +- Fixed skip DDP parameter sync ([#4301](https://github.com/PyTorchLightning/pytorch-lightning/pull/4301)) -- Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) +## [1.0.4] - 2020-10-27 +### 
Added -### Removed +- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) +- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) +- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +### Changed +- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) +- Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) +- Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) +- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) + +### Deprecated + +- Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +- Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) +- Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) ### Fixed - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - - Fixed synchronization of best model 
path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) - -- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) +- Fixed `WandbLogger` not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) ## [1.0.3] - 2020-10-20 ### Added + - Added persistent flag to `Metric.add_state` ([#4195](https://github.com/PyTorchLightning/pytorch-lightning/pull/4195)) ### Changed diff --git a/README.md b/README.md index 21f4aaab19ad1..30079df931759 100644 --- a/README.md +++ b/README.md @@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | +| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | -| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & 
Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | +| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | -| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| Windows py3.6 / py3.7 / py3.8 | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ @@ -183,7 +183,7 @@ trainer = pl.Trainer() trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) ``` -#### And without changing a single line of code, you could run on GPU/TPUss +#### And without changing a single line of code, you could run on GPUs/TPUs ```python # 8 GPUs trainer = Trainer(max_epochs=1, gpus=8) diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py index d2b30afb23946..d2bc97deff598 100644 --- a/benchmarks/test_parity.py +++ b/benchmarks/test_parity.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize('cls_model,max_diff', [ (ParityModuleRNN, 0.05), - (ParityModuleMNIST, 0.70) + (ParityModuleMNIST, 0.8) ]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_pytorch_parity(tmpdir, cls_model, max_diff): diff --git a/dockers/README.md b/dockers/README.md index 73c40635eb0a5..aab82a171641a 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -14,10 +14,10 @@ or with specific arguments ```bash git clone docker image build \ - -t pytorch-lightning:py3.8 \ - -f dockers/conda/Dockerfile \ + -t pytorch-lightning:py3.8-pt1.6 \ + -f dockers/base-cuda/Dockerfile \ --build-arg PYTHON_VERSION=3.8 \ - --build-arg PYTORCH_VERSION=1.4 \ + --build-arg PYTORCH_VERSION=1.6 \ . 
``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 6a7f03970cf75..ea8c6bc5d001d 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -35,7 +35,8 @@ SHELL ["/bin/bash", "-c"] ENV PATH="$PATH:/root/.local/bin" -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update -qq && \ + apt-get install -y --no-install-recommends \ build-essential \ cmake \ git \ @@ -74,7 +75,7 @@ ENV CONDA_ENV=lightning COPY environment.yml environment.yml # conda init -RUN conda create -y --name $CONDA_ENV && \ +RUN conda create -y --name $CONDA_ENV cudatoolkit=${CUDA_VERSION} && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later @@ -104,6 +105,7 @@ RUN \ # Install remaining requirements pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm requirements* RUN \ @@ -118,4 +120,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index e22b5a862a7d7..f886ccc30be7a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -28,6 +28,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 +ARG CMAKE_VERSION=3.18.4 SHELL ["/bin/bash", "-c"] # 
https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ @@ -37,7 +38,7 @@ ENV TZ=Europe/Prague ENV PATH="$PATH:/root/.local/bin" ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" -RUN apt-get update && \ +RUN apt-get update -qq && \ apt-get install -y --no-install-recommends \ build-essential \ pkg-config \ @@ -93,6 +94,7 @@ RUN \ # Install all requirements pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm -rf requirements* RUN \ @@ -105,5 +107,6 @@ RUN \ # Show what we have pip --version && \ pip list && \ + python -c 'from nvidia.dali.pipeline import Pipeline' && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index f44465383a0e0..8eb093295c37b 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -31,7 +31,7 @@ ENV CONDA_ENV=lightning # show system inforation RUN lsb_release -a && cat /etc/*-release -RUN apt-get update && \ +RUN apt-get update -qq && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ @@ -110,4 +110,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.7').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" + python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.8').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" diff --git 
a/docs/source/accelerators.rst b/docs/source/accelerators.rst new file mode 100644 index 0000000000000..ee801f2dee28b --- /dev/null +++ b/docs/source/accelerators.rst @@ -0,0 +1,182 @@ +############ +Accelerators +############ +Accelerators connect a Lightning Trainer to arbitrary accelerators (CPUs, GPUs, TPUs, etc). Accelerators +also manage distributed accelerators (like DP, DDP, HPC cluster). + +Accelerators can also be configured to run on arbitrary clusters using Plugins or to link up to arbitrary +computational strategies like 16-bit precision via AMP and Apex. + +---------- + +****************************** +Implement a custom accelerator +****************************** +To link up arbitrary hardware, implement your own Accelerator subclass + +.. code-block:: python + + from pytorch_lightning.accelerators.accelerator import Accelerator + + class MyAccelerator(Accelerator): + def __init__(self, trainer, cluster_environment=None): + super().__init__(trainer, cluster_environment) + self.nickname = 'my_accelator' + + def setup(self): + # find local rank, etc, custom things to implement + + def train(self): + # implement what happens during training + + def training_step(self): + # implement how to do a training_step on this accelerator + + def validation_step(self): + # implement how to do a validation_step on this accelerator + + def test_step(self): + # implement how to do a test_step on this accelerator + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + # implement how to do a backward pass with this accelerator + + def barrier(self, name: Optional[str] = None): + # implement this accelerator's barrier + + def broadcast(self, obj, src=0): + # implement this accelerator's broadcast function + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + # implement how to sync tensors when reducing metrics across accelerators + +******** 
+Examples +******** +The following examples illustrate customizing accelerators. + +Example 1: Arbitrary HPC cluster +================================ +To link any accelerator with an arbitrary cluster (SLURM, Condor, etc), pass in a Cluster Plugin which will be passed +into any accelerator. + +First, implement your own ClusterEnvironment. Here is the torch elastic implementation. + +.. code-block:: python + + import os + from pytorch_lightning import _logger as log + from pytorch_lightning.utilities import rank_zero_warn + from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment + + class TorchElasticEnvironment(ClusterEnvironment): + + def __init__(self): + super().__init__() + + def master_address(self): + if "MASTER_ADDR" not in os.environ: + rank_zero_warn( + "MASTER_ADDR environment variable is not defined. Set as localhost" + ) + os.environ["MASTER_ADDR"] = "127.0.0.1" + log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") + master_address = os.environ.get('MASTER_ADDR') + return master_address + + def master_port(self): + if "MASTER_PORT" not in os.environ: + rank_zero_warn( + "MASTER_PORT environment variable is not defined. Set as 12910" + ) + os.environ["MASTER_PORT"] = "12910" + log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") + + port = os.environ.get('MASTER_PORT') + return port + + def world_size(self): + return os.environ.get('WORLD_SIZE') + + def local_rank(self): + return int(os.environ['LOCAL_RANK']) + +Now, pass it into the trainer which will use Torch Elastic across your accelerator of choice. + +.. code-block:: python + + cluster = TorchElasticEnvironment() + accelerator = MyAccelerator() + trainer = Trainer(plugins=[cluster], accelerator=MyAccelerator()) + +In this example, MyAccelerator can define arbitrary hardware (like IPUs or TPUs) and links it to an arbitrary +compute cluster. 
+ +------------ + +********************** +Available Accelerators +********************** + +CPU Accelerator +=============== + +.. autoclass:: pytorch_lightning.accelerators.cpu_accelerator.CPUAccelerator + :noindex: + +DDP Accelerator +=============== + +.. autoclass:: pytorch_lightning.accelerators.ddp_accelerator.DDPAccelerator + :noindex: + +DDP2 Accelerator +================ + +.. autoclass:: pytorch_lightning.accelerators.ddp2_accelerator.DDP2Accelerator + :noindex: + +DDP CPU HPC Accelerator +======================= + +.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator.DDPCPUHPCAccelerator + :noindex: + +DDP CPU Spawn Accelerator +========================= + +.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_spawn_accelerator.DDPCPUSpawnAccelerator + :noindex: + +DDP HPC Accelerator +=================== + +.. autoclass:: pytorch_lightning.accelerators.ddp_hpc_accelerator.DDPHPCAccelerator + :noindex: + +DDP Spawn Accelerator +===================== + +.. autoclass:: pytorch_lightning.accelerators.ddp_spawn_accelerator.DDPSpawnAccelerator + :noindex: + +GPU Accelerator +=============== + +.. autoclass:: pytorch_lightning.accelerators.gpu_accelerator.GPUAccelerator + :noindex: + +Horovod Accelerator +=================== + +.. autoclass:: pytorch_lightning.accelerators.horovod_accelerator.HorovodAccelerator + :noindex: + +TPU Accelerator +=============== + +.. autoclass:: pytorch_lightning.accelerators.tpu_accelerator.TPUAccelerator + :noindex: diff --git a/docs/source/community_examples.rst b/docs/source/community_examples.rst index 470c8e2dd8c9f..a89b2599b1a99 100644 --- a/docs/source/community_examples.rst +++ b/docs/source/community_examples.rst @@ -16,4 +16,5 @@ Community Examples - `VAE Library of over 18+ VAE flavors `_. - `Transformers Question Answering (SQuAD) `_. - `Atlas: End-to-End 3D Scene Reconstruction from Posed Images `_. -- `Self-Supervised Representation Learning (MoCo and BYOL) `_. 
\ No newline at end of file +- `Self-Supervised Representation Learning (MoCo and BYOL) `_. +- `pytorch-forecasting: Time series forecasting package `_. diff --git a/docs/source/conf.py b/docs/source/conf.py index c662e1e9c912e..38431c2264636 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -304,22 +304,23 @@ def package_list_from_file(file): return mocked_packages +# define mapping from PyPI names to python imports +PACKAGE_MAPPING = { + 'Pillow': 'PIL', + 'opencv-python': 'cv2', + 'PyYAML': 'yaml', + 'comet-ml': 'comet_ml', + 'neptune-client': 'neptune', +} MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: # mock also base packages when we are on RTD since we don't install them there MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements/extra.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements/loggers.txt')) +MOCK_PACKAGES = [PACKAGE_MAPPING.get(pkg, pkg) for pkg in MOCK_PACKAGES] -MOCK_MANUAL_PACKAGES = [ - 'torchvision', - 'PIL', - # packages with different package name compare to import name - 'yaml', - 'comet_ml', - 'neptune', -] -autodoc_mock_imports = MOCK_PACKAGES + MOCK_MANUAL_PACKAGES +autodoc_mock_imports = MOCK_PACKAGES autosummary_generate = True diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index 4d1d7ee45583d..7e5f349ba0ca8 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -112,87 +112,87 @@ Often times we train many versions of a model. You might share that model or com at which point it is very useful to know how that model was trained (i.e.: what learning rate, neural network, etc...). Lightning has a few ways of saving that information for you in checkpoints and yaml files. The goal here is to -improve readability and reproducibility +improve readability and reproducibility. -1. 
The first way is to ask lightning to save the values of anything in the __init__ for you to the checkpoint. This also -makes those values available via `self.hparams`. +1. The first way is to ask lightning to save the values of anything in the __init__ for you to the checkpoint. This also + makes those values available via `self.hparams`. -.. code-block:: python + .. code-block:: python - class LitMNIST(LightningModule): + class LitMNIST(LightningModule): - def __init__(self, layer_1_dim=128, learning_rate=1e-2, **kwargs): - super().__init__() - # call this to save (layer_1_dim=128, learning_rate=1e-4) to the checkpoint - self.save_hyperparameters() + def __init__(self, layer_1_dim=128, learning_rate=1e-2, **kwargs): + super().__init__() + # call this to save (layer_1_dim=128, learning_rate=1e-2) to the checkpoint + self.save_hyperparameters() - # equivalent - self.save_hyperparameters('layer_1_dim', 'learning_rate') + # equivalent + self.save_hyperparameters('layer_1_dim', 'learning_rate') - # this now works - self.hparams.layer_1_dim + # Now possible to access layer_1_dim from hparams + self.hparams.layer_1_dim -2. Sometimes your init might have objects or other parameters you might not want to save. -In that case, choose only a few +2. Sometimes your init might have objects or other parameters you might not want to save. + In that case, choose only a few -.. code-block:: python + .. 
code-block:: python - class LitMNIST(LightningModule): + class LitMNIST(LightningModule): - def __init__(self, loss_fx, generator_network, layer_1_dim=128 **kwargs): - super().__init__() - self.layer_1_dim = layer_1_dim - self.loss_fx = loss_fx + def __init__(self, loss_fx, generator_network, layer_1_dim=128, **kwargs): + super().__init__() + self.layer_1_dim = layer_1_dim + self.loss_fx = loss_fx - # call this to save (layer_1_dim=128) to the checkpoint - self.save_hyperparameters('layer_1_dim') + # call this to save (layer_1_dim=128) to the checkpoint + self.save_hyperparameters('layer_1_dim') - # to load specify the other args - model = LitMNIST.load_from_checkpoint(PATH, loss_fx=torch.nn.SomeOtherLoss, generator_network=MyGenerator()) + # to load specify the other args + model = LitMNIST.load_from_checkpoint(PATH, loss_fx=torch.nn.SomeOtherLoss, generator_network=MyGenerator()) -3. Assign to `self.hparams`. Anything assigned to `self.hparams` will also be saved automatically +3. Assign to `self.hparams`. Anything assigned to `self.hparams` will also be saved automatically. -.. code-block:: python + .. code-block:: python - # using a argparse.Namespace - class LitMNIST(LightningModule): + # using an argparse.Namespace + class LitMNIST(LightningModule): + def __init__(self, hparams, *args, **kwargs): + super().__init__() + self.hparams = hparams + self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) + def train_dataloader(self): + return DataLoader(mnist_train, batch_size=self.hparams.batch_size) - def __init__(self, hparams, *args, **kwargs): - super().__init__() - self.hparams = hparams + .. warning:: Deprecated. This method of assigning hyperparameters to the LightningModule is no longer + recommended and will not be supported in future versions of Lightning. 
- self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) - def train_dataloader(self): - return DataLoader(mnist_train, batch_size=self.hparams.batch_size) +4. You can also save full objects such as `dict` or `Namespace` to the checkpoint. -4. You can also save full objects such as `dict` or `Namespace` to the checkpoint. + .. code-block:: python -.. code-block:: python + # using a argparse.Namespace + class LitMNIST(LightningModule): - # using a argparse.Namespace - class LitMNIST(LightningModule): + def __init__(self, conf, *args, **kwargs): + super().__init__() + self.save_hyperparameters(conf) - def __init__(self, conf, *args, **kwargs): - super().__init__() - self.hparams = conf + self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) - # equivalent - self.save_hyperparameters(conf) + conf = OmegaConf.create(...) + model = LitMNIST(conf) - self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) + # Now possible to access any stored variables from hparams + model.hparams.anything - conf = OmegaConf.create(...) 
- model = LitMNIST(conf) - # this works - model.hparams.anything ---------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 6ea7709a8e72f..8f0642c6ad771 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -39,6 +39,7 @@ PyTorch Lightning Documentation :name: docs :caption: Optional extensions + accelerators callbacks datamodules logging diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index e331368c94884..d6d082e2ed779 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -543,7 +543,7 @@ Or multiple nodes # (32 GPUs) model = LitMNIST() - trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') + trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp') trainer.fit(model, train_loader) Refer to the :ref:`distributed computing guide for more details `. diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst index 4c297a09a55d8..c26e0fc0351d1 100644 --- a/docs/source/lightning_module.rst +++ b/docs/source/lightning_module.rst @@ -172,10 +172,11 @@ Under the hood, Lightning does the following (pseudocode): model.train() torch.set_grad_enabled(True) - outs = [] + losses = [] for batch in train_dataloader: # forward - out = training_step(val_batch) + loss = training_step(batch) + losses.append(loss.detach()) # backward loss.backward() @@ -184,6 +185,7 @@ Under the hood, Lightning does the following (pseudocode): optimizer.step() optimizer.zero_grad() + Training epoch-level metrics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you want to calculate epoch-level metrics and log them, use the `.log` method @@ -256,7 +258,7 @@ The matching pseudocode is: Training with DataParallel ~~~~~~~~~~~~~~~~~~~~~~~~~~ -When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might +When training using a `accelerator` that splits data from each batch across GPUs, sometimes you might need to aggregate them on the 
master GPU for processing (dp, or ddp2). In this case, implement the `training_step_end` method @@ -360,7 +362,7 @@ If you need to do something with all the outputs of each `validation_step`, over Validating with DataParallel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might +When training using a `accelerator` that splits data from each batch across GPUs, sometimes you might need to aggregate them on the master GPU for processing (dp, or ddp2). In this case, implement the `validation_step_end` method @@ -1007,6 +1009,12 @@ manual_backward .. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward :noindex: +manual_optimizer_step +~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_optimizer_step + :noindex: + on_after_backward ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index 1c97fa8e2cc3c..a3b85450e6233 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -19,6 +19,15 @@ but you can pass to the :class:`~pytorch_lightning.trainer.trainer.Trainer` any Read more about :ref:`logging` options. +To log arbitrary artifacts like images or audio samples use the `trainer.log_dir` property to resolve +the path. + +.. code-block:: python + + def training_step(self, batch, batch_idx): + img = ... + log_image(img, self.trainer.log_dir) + Comet.ml ======== diff --git a/docs/source/logging.rst b/docs/source/logging.rst index a1d16a4ddc771..ae1a0487a468b 100644 --- a/docs/source/logging.rst +++ b/docs/source/logging.rst @@ -260,12 +260,12 @@ Logging hyperparameters *********************** When training a model, it's useful to know what hyperparams went into that model. -When Lightning creates a checkpoint, it stores a key "hparams" with the hyperparams. +When Lightning creates a checkpoint, it stores a key "hyper_parameters" with the hyperparams. .. 
code-block:: python lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) - hyperparams = lightning_checkpoint['hparams'] + hyperparams = lightning_checkpoint['hyper_parameters'] Some loggers also allow logging the hyperparams used in the experiment. For instance, when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index dadf654b1ef67..d80b35f91abd1 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -78,6 +78,25 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v self.valid_acc(logits, y) self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True) +.. note:: + If using metrics in data parallel mode (dp), the metric update/logging should be done + in the ``<mode>_step_end`` method (where ``<mode>`` is either ``training``, ``validation`` + or ``test``). This is because metric states are otherwise destroyed after each forward pass, + leading to wrong accumulation. In practice do the following: + + .. code-block:: python + + def training_step(self, batch, batch_idx): + data, target = batch + preds = self(data) + ... + return {'loss' : loss, 'preds' : preds, 'target' : target} + + def training_step_end(self, outputs): + #update and log + self.metric(outputs['preds'], outputs['target']) + self.log('metric', self.metric) + This metrics API is independent of PyTorch Lightning. Metrics can directly be used in PyTorch as shown in the example: @@ -105,6 +124,19 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us # total accuracy over all validation batches total_valid_accuracy = valid_accuracy.compute() +.. note:: + + Metrics contain internal states that keep track of the data seen so far. + Do not mix metric states across training, validation and testing. + It is highly recommended to re-initialize the metric per mode as + shown in the examples above. + +.. 
note:: + + Metric states will by default add their internal state to the model's ``state_dict``. + To change this after initializing the metric the method ``.persistent(mode)`` can + be used to enable (``mode=True``) or disable (``mode=False``) this behaviour. + ********************* Implementing a Metric ********************* @@ -144,6 +176,19 @@ Example implementation: def compute(self): return self.correct.float() / self.total +Metrics support backpropagation, if all computations involved in the metric calculation +are differentiable. However, note that the cached state is detached from the computational +graph and cannot be backpropagated. Not doing this would mean storing the computational +graph for each update call, which can lead to out-of-memory errors. +In practice this means that: + +.. code-block:: python + + metric = MyMetric() + val = metric(pred, target) # this value can be backpropagated + val = metric.compute() # this value cannot be backpropagated + + ********** Metric API ********** @@ -182,6 +227,12 @@ Fbeta .. autoclass:: pytorch_lightning.metrics.classification.Fbeta :noindex: +ConfusionMatrix +~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_lightning.metrics.classification.ConfusionMatrix + :noindex: + Regression Metrics ------------------ @@ -259,6 +310,13 @@ auroc [func] :noindex: +multiclass_auroc [func] +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.metrics.functional.classification.multiclass_auroc + :noindex: + + average_precision [func] ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -269,7 +327,7 @@ average_precision [func] confusion_matrix [func] ~~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: pytorch_lightning.metrics.functional.classification.confusion_matrix +.. autofunction:: pytorch_lightning.metrics.functional.confusion_matrix :noindex: @@ -434,4 +492,3 @@ embedding_similarity [func] .. 
autofunction:: pytorch_lightning.metrics.functional.self_supervised.embedding_similarity :noindex: - diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index ea49601a397cd..1d2600df02180 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -206,6 +206,8 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`. `auto_select_gpus=True` will automatically help you find `k` gpus that are not occupied by other processes. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. + For more details see the :ref:`Trainer guide `. + Remove CUDA flags ^^^^^^^^^^^^^^^^^ @@ -229,11 +231,11 @@ Distributed modes ----------------- Lightning allows multiple ways of training -- Data Parallel (`distributed_backend='dp'`) (multiple-gpus, 1 machine) -- DistributedDataParallel (`distributed_backend='ddp'`) (multiple-gpus across many machines (python script based)). -- DistributedDataParallel (`distributed_backend='ddp_spawn'`) (multiple-gpus across many machines (spawn based)). -- DistributedDataParallel 2 (`distributed_backend='ddp2'`) (DP in a machine, DDP across machines). -- Horovod (`distributed_backend='horovod'`) (multi-machine, multi-gpu, configured at runtime) +- Data Parallel (`accelerator='dp'`) (multiple-gpus, 1 machine) +- DistributedDataParallel (`accelerator='ddp'`) (multiple-gpus across many machines (python script based)). +- DistributedDataParallel (`accelerator='ddp_spawn'`) (multiple-gpus across many machines (spawn based)). +- DistributedDataParallel 2 (`accelerator='ddp2'`) (DP in a machine, DDP across machines). +- Horovod (`accelerator='horovod'`) (multi-machine, multi-gpu, configured at runtime) - TPUs (`tpu_cores=8|x`) (tpu or TPU pod) .. note:: @@ -256,7 +258,7 @@ after which the root node will aggregate the results. 
:skipif: torch.cuda.device_count() < 2 # train on 2 GPUs (using DP mode) - trainer = Trainer(gpus=2, distributed_backend='dp') + trainer = Trainer(gpus=2, accelerator='dp') Distributed Data Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -279,10 +281,10 @@ Distributed Data Parallel .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = Trainer(gpus=8, distributed_backend='ddp') + trainer = Trainer(gpus=8, accelerator='ddp') # train on 32 GPUs (4 nodes) - trainer = Trainer(gpus=8, distributed_backend='ddp', num_nodes=4) + trainer = Trainer(gpus=8, accelerator='ddp', num_nodes=4) This Lightning implementation of DDP calls your script under the hood multiple times with the correct environment variables: @@ -328,7 +330,7 @@ In this case, we can use DDP2 which behaves like DP in a machine and DDP across .. code-block:: python # train on 32 GPUs (4 nodes) - trainer = Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) + trainer = Trainer(gpus=8, accelerator='ddp2', num_nodes=4) Distributed Data Parallel Spawn ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -346,7 +348,7 @@ project module) you can use the following method: .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = Trainer(gpus=8, distributed_backend='ddp') + trainer = Trainer(gpus=8, accelerator='ddp') We STRONGLY discourage this use because it has limitations (due to Python and PyTorch): @@ -398,7 +400,7 @@ You can then call your scripts anywhere .. code-block:: bash cd /project/src - python some_file.py --distributed_backend 'ddp' --gpus 8 + python some_file.py --accelerator 'ddp' --gpus 8 Horovod @@ -419,10 +421,10 @@ Horovod can be configured in the training script to run with any number of GPUs .. 
code-block:: python # train Horovod on GPU (number of GPUs / machines provided on command-line) - trainer = Trainer(distributed_backend='horovod', gpus=1) + trainer = Trainer(accelerator='horovod', gpus=1) # train Horovod on CPU (number of processes / machines provided on command-line) - trainer = Trainer(distributed_backend='horovod') + trainer = Trainer(accelerator='horovod') When starting the training job, the driver application will then be used to specify the total number of worker processes: @@ -552,13 +554,13 @@ Below are the possible configurations we support. +=======+=========+====+=====+=========+============================================================+ | Y | | | | | `Trainer(gpus=1)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| Y | | | | Y | `Trainer(gpus=1, use_amp=True)` | +| Y | | | | Y | `Trainer(gpus=1, precision=16)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | Y | | | `Trainer(gpus=k, distributed_backend='dp')` | +| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | | `Trainer(gpus=k, distributed_backend='ddp')` | +| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | Y | `Trainer(gpus=k, distributed_backend='ddp', use_amp=True)` | +| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ @@ -588,10 +590,10 @@ In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. .. 
code-block:: python # effective batch size = 7 * 8 - Trainer(gpus=8, distributed_backend='ddp|horovod') + Trainer(gpus=8, accelerator='ddp|horovod') # effective batch size = 7 * 8 * 10 - Trainer(gpus=8, num_nodes=10, distributed_backend='ddp|horovod') + Trainer(gpus=8, num_nodes=10, accelerator='ddp|horovod') In DDP2, your effective batch size will be 7 * num_nodes. @@ -600,10 +602,10 @@ The reason is that the full batch is visible to all GPUs on the node when using .. code-block:: python # effective batch size = 7 - Trainer(gpus=8, distributed_backend='ddp2') + Trainer(gpus=8, accelerator='ddp2') # effective batch size = 7 * 10 - Trainer(gpus=8, num_nodes=10, distributed_backend='ddp2') + Trainer(gpus=8, num_nodes=10, accelerator='ddp2') .. note:: Huge batch sizes are actually really bad for convergence. Check out: @@ -617,7 +619,7 @@ Lightning supports the use of PytorchElastic to enable fault-tolerent and elasti .. code-block:: python - Trainer(gpus=8, distributed_backend='ddp') + Trainer(gpus=8, accelerator='ddp') Following the `PytorchElastic Quickstart documentation `_, you then need to start a single-node etcd server on one of the hosts: diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index fabc5cf6e5766..e5ba47351a98b 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -608,7 +608,13 @@ Here's an example adding a not-so-fancy learning rate decay rule: new_lr_group.append(new_lr) param_group['lr'] = new_lr self.old_lrs[opt_idx] = new_lr_group - + +And pass the callback to the Trainer + +.. code-block:: python + + decay_callback = DecayLearningRate() + trainer = Trainer(callbacks=[decay_callback]) Things you can do with a callback: diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 2b8025959c9ca..7f1bcc97662b4 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -36,8 +36,8 @@ to manually manage the optimization process. 
To do so, do the following: # use self.backward which will also handle scaling the loss when using amp self.manual_backward(loss_a, opt_g) - opt_g.step() - opt_g.zero_grad() + self.manual_optimizer_step(opt_g) + # do anything you want loss_b = ... @@ -45,8 +45,11 @@ to manually manage the optimization process. To do so, do the following: # pass in any args that loss.backward() normally takes self.manual_backward(loss_b, opt_d, retain_graph=True) self.manual_backward(loss_b, opt_d) - opt_d.step() - opt_d.zero_grad() + self.manual_optimizer_step(opt_d) + + # log losses + self.log('loss_a', loss_a) + self.log('loss_b', loss_b) .. note:: This is only recommended for experts who need ultimate flexibility @@ -108,7 +111,7 @@ Every optimizer you use can be paired with any `LearningRateScheduler 0`` when using ``.spawn()``. For this reason we recommend you -use ``distributed_backend=ddp`` so you can increase the ``num_workers``, however your script has to be callable like so: +use ``accelerator=ddp`` so you can increase the ``num_workers``, however your script has to be callable like so: .. code-block:: bash diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index 287fccf71fe81..be40810c3f944 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -24,7 +24,7 @@ To train a model using multiple nodes, do the following: .. code-block:: python # train on 32 GPUs across 4 nodes - trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') + trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp') 3. It's a good idea to structure your training script like this: @@ -37,7 +37,7 @@ To train a model using multiple nodes, do the following: trainer = pl.Trainer( gpus=8, num_nodes=4, - distributed_backend='ddp' + accelerator='ddp' ) trainer.fit(model) diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst index f6189244fa020..5f4c48076d813 100644 --- a/docs/source/tpu.rst +++ b/docs/source/tpu.rst @@ -128,19 +128,33 @@ That's it! 
Your model will train on all 8 TPU cores. ---------------- -Single TPU core training +TPU core training + ------------------------ -Lightning supports training on a single TPU core. Just pass the TPU core ID [1-8] in a list. + +Lightning supports training on a single TPU core or 8 TPU cores. + +The Trainer parameter ``tpu_cores`` defines how many TPU cores to train on (1 or 8) or which single TPU core to train on (e.g. [1]). + +For single TPU core training, just pass the TPU core ID [1-8] in a list. + +Single TPU core training. Model will train on TPU core ID 5. .. code-block:: python - trainer = pl.Trainer(tpu_cores=[1]) + trainer = pl.Trainer(tpu_cores=[5]) + +8 TPU cores training. Model will train on 8 TPU cores. + +.. code-block:: python + + trainer = pl.Trainer(tpu_cores=8) ---------------- Distributed Backend with TPU ---------------------------- -The ```distributed_backend``` option used for GPUs does not apply to TPUs. +The ``accelerator`` option used for GPUs does not apply to TPUs. TPUs work in DDP mode by default (distributing over each core) ---------------- diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 792d50f15d856..e82b0871ef85b 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -1,26 +1,1705 @@ .. role:: hidden :class: hidden-section +.. testsetup:: * + + import os + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.utilities.seed import seed_everything + .. _trainer: Trainer ======= -.. 
automodule:: pytorch_lightning.trainer - :members: fit, test - :noindex: - :exclude-members: - setup_training, - _abc_impl, - set_random_port, - _Trainer__set_root_gpu, - _Trainer__init_optimizers, - _Trainer__parse_gpu_ids, - _Trainer__configure_schedulers, - data_parallel, - num_gpus, - slurm_job_id, - tng_tqdm_dic, - training_tqdm_dict, - progress_bar_dict, - init_optimizers, - configure_schedulers + +Once you've organized your PyTorch code into a LightningModule, +the Trainer automates everything else. + +.. raw:: html + + + +| + +This abstraction achieves the following: + +1. You maintain control over all aspects via PyTorch code without an added abstraction. + +2. The trainer uses best practices embedded by contributors and users + from top AI labs such as Facebook AI Research, NYU, MIT, Stanford, etc... + +3. The trainer allows overriding any key part that you don't want automated. + +| + +----------- + +Basic use +--------- + +This is the basic use of the trainer: + +.. code-block:: python + + model = MyLightningModule() + + trainer = Trainer() + trainer.fit(model, train_dataloader, val_dataloader) + +-------- + +Under the hood +-------------- +Under the hood, the Lightning Trainer handles the training loop details for you. Some examples include: + +- Automatically enabling/disabling grads +- Running the training, validation and test dataloaders +- Calling the Callbacks at the appropriate times +- Putting batches and computations on the correct devices + +Here's the pseudocode for what the trainer does under the hood (showing the train loop only) + +.. 
code-block:: python + + # put model in train mode + model.train() + torch.set_grad_enabled(True) + + losses = [] + for batch in train_dataloader: + # calls hooks like this one + on_train_batch_start() + + # train step + loss = training_step(batch) + + # backward + loss.backward() + + # apply and clear grads + optimizer.step() + optimizer.zero_grad() + + losses.append(loss) + + +-------- + +Trainer in Python scripts +------------------------- +In Python scripts, it's recommended you use a main function to call the Trainer. + +.. code-block:: python + + from argparse import ArgumentParser + + def main(hparams): + model = LightningModule() + trainer = Trainer(gpus=hparams.gpus) + trainer.fit(model) + + if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--gpus', default=None) + args = parser.parse_args() + + main(args) + +So you can run it like so: + +.. code-block:: bash + + python main.py --gpus 2 + +.. note:: + + Pro-tip: You don't need to define all flags manually. Lightning can add them automatically + +.. code-block:: python + + from argparse import ArgumentParser + + def main(args): + model = LightningModule() + trainer = Trainer.from_argparse_args(args) + trainer.fit(model) + + if __name__ == '__main__': + parser = ArgumentParser() + parser = Trainer.add_argparse_args(parser) + args = parser.parse_args() + + main(args) + +So you can run it like so: + +.. code-block:: bash + + python main.py --gpus 2 --max_steps 10 --limit_train_batches 10 --any_trainer_arg x + +.. note:: + If you want to stop a training run early, you can press "Ctrl + C" on your keyboard. + The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including + running callbacks such as `on_train_end`. The trainer object will also set an attribute + `interrupted` to `True` in such cases. If you have a callback which shuts down compute + resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. 
+ +------------ + +Testing +------- +Once you're done training, feel free to run the test set! +(Only right before publishing your paper or pushing to production) + +.. code-block:: python + + trainer.test(test_dataloader=test_dataloader) + +------------ + +Deployment / prediction +----------------------- +You just trained a LightningModule which is also just a torch.nn.Module. +Use it to do whatever! + +.. code-block:: python + + # load model + pretrained_model = LightningModule.load_from_checkpoint(PATH) + pretrained_model.freeze() + + # use it for finetuning + def forward(self, x): + features = pretrained_model(x) + classes = classifier(features) + + # or for prediction + out = pretrained_model(x) + api_write({'response': out}) + + +You may wish to run the model on a variety of devices. Instead of moving the data +manually to the correct device, decorate the forward method (or any other method you use for inference) +with :func:`~pytorch_lightning.core.decorators.auto_move_data` and Lightning will take care of the rest. + +------------ + +Reproducibility +--------------- + +To ensure full reproducibility from run to run you need to set seeds for pseudo-random generators, +and set the ``deterministic`` flag in ``Trainer``. + +Example:: + + from pytorch_lightning import Trainer, seed_everything + + seed_everything(42) + # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. + model = Model() + trainer = Trainer(deterministic=True) + + +------- + +Trainer flags +------------- + +accelerator +^^^^^^^^^^^ + +.. raw:: html + + + +| + +The accelerator backend to use (previously known as distributed_backend). + +- (```dp```) is DataParallel (split batch among GPUs of same machine) +- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) +- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. + Useful for multi-node CPU training or single-node debugging. 
Note that this will **not** give + a speedup on a single node, since Torch already makes efficient use of multiple CPUs on a single + machine.) +- (```ddp2```) dp on node, ddp across nodes. Useful for things like increasing + the number of negative samples + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(accelerator=None) + +Example:: + + # dp = DataParallel + trainer = Trainer(gpus=2, accelerator='dp') + + # ddp = DistributedDataParallel + trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp') + + # ddp2 = DistributedDataParallel + dp + trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') + +.. note:: this option does not apply to TPU. TPUs use ```ddp``` by default (over each core) + +You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. + +Example:: + + class MyOwnDDP(DDPAccelerator): + ... + + Trainer(accelerator=MyOwnDDP()) + +.. warning:: Passing in custom accelerators is experimental but work is in progress to enable full compatibility. + +accumulate_grad_batches +^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Accumulates grads every k batches or as set up in the dict. +Trainer also calls ``optimizer.step()`` for the last indivisible step number. + +.. testcode:: + + # default used by the Trainer (no accumulation) + trainer = Trainer(accumulate_grad_batches=1) + +Example:: + + # accumulate every 4 batches (effective batch size is batch*4) + trainer = Trainer(accumulate_grad_batches=4) + + # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that + trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20}) + +amp_backend +^^^^^^^^^^^ + +.. raw:: html + + + +| + +Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex'). + +.. 
testcode:: + + # using PyTorch built-in AMP, default used by the Trainer + trainer = Trainer(amp_backend='native') + + # using NVIDIA Apex + trainer = Trainer(amp_backend='apex') + +amp_level +^^^^^^^^^ + +.. raw:: html + + + +| + +The optimization level to use (O1, O2, etc...) +for 16-bit GPU precision (using NVIDIA apex under the hood). + +Check `NVIDIA apex docs `_ for level + +Example:: + + # default used by the Trainer + trainer = Trainer(amp_level='O2') + +automatic_optimization +^^^^^^^^^^^^^^^^^^^^^^ +When set to False, Lightning does not automate the optimization process. This means you are responsible for your own +optimizer behavior + +Example:: + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + loss = ... + self.manual_backward(loss, opt) + opt.step() + opt.zero_grad() + +This is not recommended when using a single optimizer, instead it's recommended when using 2+ optimizers +AND you are an expert user. Most useful for research like RL, sparse coding and GAN research. + +In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizers directly + +Example:: + + def training_step(self, batch, batch_idx, optimizer_idx): + (opt_a, opt_b) = self.optimizers() + + gen_loss = ... + self.manual_backward(gen_loss, opt_a) + opt_a.step() + opt_a.zero_grad() + + disc_loss = ... + self.manual_backward(disc_loss, opt_b) + opt_b.step() + opt_b.zero_grad() + +auto_scale_batch_size +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Automatically tries to find the largest batch size that fits into memory, +before any training. + +.. code-block:: + + # default used by the Trainer (no scaling of batch size) + trainer = Trainer(auto_scale_batch_size=None) + + # run batch size scaling, result overrides hparams.batch_size + trainer = Trainer(auto_scale_batch_size='binsearch') + + # call tune to find the batch size + trainer.tune(model) + +auto_select_gpus +^^^^^^^^^^^^^^^^ + +.. 
raw:: html + + + +| + +If enabled and `gpus` is an integer, pick available gpus automatically. +This is especially useful when GPUs are configured to be in "exclusive mode", +such that only one process at a time can access them. + +Example:: + + # no auto selection (picks first 2 gpus on system, may fail if other process is occupying) + trainer = Trainer(gpus=2, auto_select_gpus=False) + + # enable auto selection (will find two available gpus on system) + trainer = Trainer(gpus=2, auto_select_gpus=True) + + # specifies all GPUs regardless of its availability + Trainer(gpus=-1, auto_select_gpus=False) + + # specifies all available GPUs (if only one GPU is not occupied, uses one gpu) + Trainer(gpus=-1, auto_select_gpus=True) + +auto_lr_find +^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Runs a learning rate finder algorithm (see this `paper `_) +when calling trainer.tune(), to find optimal initial learning rate. + +.. code-block:: python + + # default used by the Trainer (no learning rate finder) + trainer = Trainer(auto_lr_find=False) + +Example:: + + # run learning rate finder, results override hparams.learning_rate + trainer = Trainer(auto_lr_find=True) + + # call tune to find the lr + trainer.tune(model) + +Example:: + + # run learning rate finder, results override hparams.my_lr_arg + trainer = Trainer(auto_lr_find='my_lr_arg') + + # call tune to find the lr + trainer.tune(model) + +.. note:: + See the :ref:`learning rate finder guide `. + +benchmark +^^^^^^^^^ + +.. raw:: html + + + +| + +If true enables cudnn.benchmark. +This flag is likely to increase the speed of your system if your +input sizes don't change. However, if it does, then it will likely +make your system slower. + +The speedup comes from allowing the cudnn auto-tuner to find the best +algorithm for the hardware `[see discussion here] +`_. + +Example:: + + # default used by the Trainer + trainer = Trainer(benchmark=False) + +deterministic +^^^^^^^^^^^^^ + +.. 
raw:: html + + + +| + +If true enables cudnn.deterministic. +Might make your system slower, but ensures reproducibility. +Also sets ``$HOROVOD_FUSION_THRESHOLD=0``. + +For more info check `[pytorch docs] +`_. + +Example:: + + # default used by the Trainer + trainer = Trainer(deterministic=False) + +callbacks +^^^^^^^^^ + +.. raw:: html + + + +| + +Add a list of :class:`~pytorch_lightning.callbacks.Callback`. + +.. code-block:: python + + # a list of callbacks + callbacks = [PrintCallback()] + trainer = Trainer(callbacks=callbacks) + +Example:: + + from pytorch_lightning.callbacks import Callback + + class PrintCallback(Callback): + def on_train_start(self, trainer, pl_module): + print("Training is started!") + def on_train_end(self, trainer, pl_module): + print("Training is done.") + +check_val_every_n_epoch +^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Check val every n train epochs. + +Example:: + + # default used by the Trainer + trainer = Trainer(check_val_every_n_epoch=1) + + # run val loop every 10 training epochs + trainer = Trainer(check_val_every_n_epoch=10) + +checkpoint_callback +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +By default Lightning saves a checkpoint for you in your current working directory, with the state of your last training epoch, +Checkpoints capture the exact value of all parameters used by a model. +To disable automatic checkpointing, set this to `False`. + +.. code-block:: python + + # default used by Trainer + trainer = Trainer(checkpoint_callback=True) + + # turn off automatic checkpointing + trainer = Trainer(checkpoint_callback=False) + + +You can override the default behavior by initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` +callback, and adding it to the :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks` list. +See :ref:`Saving and Loading Weights ` for how to customize checkpointing. + + +.. 
warning:: Passing a ModelCheckpoint instance to this argument is deprecated since + v1.1.0 and will be unsupported from v1.3.0. + + +default_root_dir +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Default path for logs and weights when no logger or +:class:`pytorch_lightning.callbacks.ModelCheckpoint` callback is passed. On +certain clusters you might want to separate where logs and checkpoints are +stored. If you don't then use this argument for convenience. Paths can be local +paths or remote paths such as `s3://bucket/path` or 'hdfs://path/'. Credentials +will need to be set up to use remote filepaths. + +Example:: + + # default used by the Trainer + trainer = Trainer(default_root_dir=os.getcwd()) + +distributed_backend +^^^^^^^^^^^^^^^^^^^ +This has been renamed "accelerator". + +fast_dev_run +^^^^^^^^^^^^ + +.. raw:: html + + + +| + +.. raw:: html + + + +| + +Runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test). + +Under the hood the pseudocode looks like this: + +.. code-block:: python + + # loading + __init__() + prepare_data + + # test training step + training_batch = next(train_dataloader) + training_step(training_batch) + + # test val step + val_batch = next(val_dataloader) + out = validation_step(val_batch) + validation_epoch_end([out]) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(fast_dev_run=False) + + # runs 1 train, val, test batch and program ends + trainer = Trainer(fast_dev_run=True) + +gpus +^^^^ + +.. raw:: html + + + +| + +- Number of GPUs to train on (int) +- or which GPUs to train on (list) +- can handle strings + +.. 
testcode:: + + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(gpus=None) + + # equivalent + trainer = Trainer(gpus=0) + +Example:: + + # int: train on 2 gpus + trainer = Trainer(gpus=2) + + # list: train on GPUs 1, 4 (by bus ordering) + trainer = Trainer(gpus=[1, 4]) + trainer = Trainer(gpus='1, 4') # equivalent + + # -1: train on all gpus + trainer = Trainer(gpus=-1) + trainer = Trainer(gpus='-1') # equivalent + + # combine with num_nodes to train on multiple GPUs across nodes + # uses 8 gpus in total + trainer = Trainer(gpus=2, num_nodes=4) + + # train only on GPUs 1 and 4 across nodes + trainer = Trainer(gpus=[1, 4], num_nodes=4) + +See Also: + - :ref:`Multi-GPU training guide `. + +gradient_clip_val +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Gradient clipping value + +- 0 means don't clip. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(gradient_clip_val=0.0) + + +limit_test_batches +^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of test dataset to check. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_test_batches=1.0) + + # run through only 25% of the test set each epoch + trainer = Trainer(limit_test_batches=0.25) + + # run for only 10 batches + trainer = Trainer(limit_test_batches=10) + +In the case of multiple test dataloaders, the limit applies to each dataloader individually. + +limit_val_batches +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of validation dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_val_batches=1.0) + + # run through only 25% of the validation set each epoch + trainer = Trainer(limit_val_batches=0.25) + + # run for only 10 batches + trainer = Trainer(limit_val_batches=10) + +In the case of multiple validation dataloaders, the limit applies to each dataloader individually. + +log_gpu_memory +^^^^^^^^^^^^^^ + +.. 
raw:: html + + + +| + +Options: + +- None +- 'min_max' +- 'all' + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(log_gpu_memory=None) + + # log all the GPUs (on master node only) + trainer = Trainer(log_gpu_memory='all') + + # log only the min and max memory on the master node + trainer = Trainer(log_gpu_memory='min_max') + +.. note:: Might slow performance because it uses the output of nvidia-smi. + +flush_logs_every_n_steps +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Writes logs to disk this often. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(flush_logs_every_n_steps=100) + +See Also: + - :ref:`logging` + +logger +^^^^^^ + +.. raw:: html + + + +| + +:ref:`Logger ` (or iterable collection of loggers) for experiment tracking. + +.. testcode:: + + from pytorch_lightning.loggers import TensorBoardLogger + + # default logger used by trainer + logger = TensorBoardLogger( + save_dir=os.getcwd(), + version=1, + name='lightning_logs' + ) + Trainer(logger=logger) + +max_epochs +^^^^^^^^^^ + +.. raw:: html + + + +| + +Stop training once this number of epochs is reached + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(max_epochs=1000) + +min_epochs +^^^^^^^^^^ + +.. raw:: html + + + +| + +Force training for at least these many epochs + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(min_epochs=1) + +max_steps +^^^^^^^^^ + +.. raw:: html + + + +| + +Stop training after this number of steps +Training will stop if max_steps or max_epochs have reached (earliest). + +.. testcode:: + + # Default (disabled) + trainer = Trainer(max_steps=None) + + # Stop after 100 steps + trainer = Trainer(max_steps=100) + +min_steps +^^^^^^^^^ + +.. raw:: html + + + +| + +Force training for at least these number of steps. +Trainer will train model for at least min_steps or min_epochs (latest). + +.. 
testcode:: + + # Default (disabled) + trainer = Trainer(min_steps=None) + + # Run at least for 100 steps (disable min_epochs) + trainer = Trainer(min_steps=100, min_epochs=0) + +num_nodes +^^^^^^^^^ + +.. raw:: html + + + +| + +Number of GPU nodes for distributed training. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(num_nodes=1) + + # to train on 8 nodes + trainer = Trainer(num_nodes=8) + +num_processes +^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Number of processes to train with. Automatically set to the number of GPUs +when using ``accelerator="ddp"``. Set to a number greater than 1 when +using ``accelerator="ddp_cpu"`` to mimic distributed training on a +machine without GPUs. This is useful for debugging, but **will not** provide +any speedup, since single-process Torch already makes efficient use of multiple +CPUs. + +.. testcode:: + + # Simulate DDP for debugging on your GPU-less laptop + trainer = Trainer(accelerator="ddp_cpu", num_processes=2) + +num_sanity_val_steps +^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Sanity check runs n batches of val before starting the training routine. +This catches any bugs in your validation without having to wait for the first validation check. +The Trainer uses 2 steps by default. Turn it off or modify it here. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(num_sanity_val_steps=2) + + # turn it off + trainer = Trainer(num_sanity_val_steps=0) + + # check all validation data + trainer = Trainer(num_sanity_val_steps=-1) + + +This option will reset the validation dataloader unless ``num_sanity_val_steps=0``. + + +plugins +^^^^^^^ + +.. raw:: html + + + +| + +Plugins allow you to connect arbitrary backends, precision libraries, SLURM, etc... For example: + +- DDP +- SLURM +- TorchElastic +- Apex + +To define your own behavior, subclass the relevant class and pass it in. Here's an example linking up your own cluster. + +.. 
code-block:: python + + from pytorch_lightning.cluster_environments import ClusterEnvironment + + class MyCluster(ClusterEnvironment): + + def master_address(self): + return your_master_address + + def master_port(self): + return your_master_port + + def world_size(self): + return the_world_size + + trainer = Trainer(cluster_environment=MyCluster()) + +prepare_data_per_node +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +If True will call `prepare_data()` on LOCAL_RANK=0 for every node. +If False will only call from NODE_RANK=0, LOCAL_RANK=0 + +.. testcode:: + + # default + Trainer(prepare_data_per_node=True) + + # use only NODE_RANK=0, LOCAL_RANK=0 + Trainer(prepare_data_per_node=False) + +tpu_cores +^^^^^^^^^ + +.. raw:: html + + + +| + +- How many TPU cores to train on (1 or 8). +- Which TPU core to train on [1-8] + +A single TPU v2 or v3 has 8 cores. A TPU pod has +up to 2048 cores. A slice of a POD means you get as many cores +as you request. + +Your effective batch size is batch_size * total tpu cores. + +.. note:: No need to add a DistributedSampler, Lightning automatically does it for you. + +This parameter can be either 1 or 8. + +.. testcode:: + + # your_trainer_file.py + + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(tpu_cores=None) + + # int: train on a single core + trainer = Trainer(tpu_cores=1) + + # list: train on a single selected core + trainer = Trainer(tpu_cores=[2]) + + # int: train on all 8 cores + trainer = Trainer(tpu_cores=8) + + # for 8+ cores must submit via xla script with + # a max of 8 cores specified. The XLA script + # will duplicate script onto each TPU in the POD + trainer = Trainer(tpu_cores=8) + +To train on more than 8 cores (ie: a POD), +submit this script using the xla_dist script. 
+ +Example:: + + python -m torch_xla.distributed.xla_dist + --tpu=$TPU_POD_NAME + --conda-env=torch-xla-nightly + --env=XLA_USE_BF16=1 + -- python your_trainer_file.py + +overfit_batches +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. +If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. + +Useful for quickly debugging or trying to overfit on purpose. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(overfit_batches=0.0) + + # use only 1% of the train set (and use the train set for val and test) + trainer = Trainer(overfit_batches=0.01) + + # overfit on 10 of the same batches + trainer = Trainer(overfit_batches=10) + +precision +^^^^^^^^^ + +.. raw:: html + + + +| + +Full precision (32), half precision (16). +Can be used on CPU, GPU or TPUs. + +If used on TPU will use torch.bfloat16 but tensor printing +will still show torch.float32. + +.. testcode:: + :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE + + # default used by the Trainer + trainer = Trainer(precision=32) + + # 16-bit precision + trainer = Trainer(precision=16) + +Example:: + + # one day + trainer = Trainer(precision=8|4|2) + +process_position +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Orders the progress bar. Useful when running multiple trainers on the same node. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(process_position=0) + +Note: + This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. + +profiler +^^^^^^^^ + +.. raw:: html + + + +| + +To profile individual steps during training and assist in identifying bottlenecks. + +See the :ref:`profiler documentation `. for more details. + +.. 
testcode:: + + from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler + + # default used by the Trainer + trainer = Trainer(profiler=None) + + # to profile standard training events, equivalent to `profiler=SimpleProfiler()` + trainer = Trainer(profiler="simple") + + # advanced profiler for function-level stats, equivalent to `profiler=AdvancedProfiler()` + trainer = Trainer(profiler="advanced") + +progress_bar_refresh_rate +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How often to refresh progress bar (in steps). +In notebooks, faster refresh rates (lower number) is known to crash them +because of their screen refresh rates, so raise it to 50 or more. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(progress_bar_refresh_rate=1) + + # disable progress bar + trainer = Trainer(progress_bar_refresh_rate=0) + +Note: + This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. + +reload_dataloaders_every_epoch +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Set to True to reload dataloaders every epoch. + +.. code-block:: python + + # if False (default) + train_loader = model.train_dataloader() + for epoch in epochs: + for batch in train_loader: + ... + + # if True + for epoch in epochs: + train_loader = model.train_dataloader() + for batch in train_loader: + +replace_sampler_ddp +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Enables auto adding of distributed sampler. By default it will add ``shuffle=True`` +for train sampler and ``shuffle=False`` for val/test sampler. If you want to customize +it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. +If ``replace_sampler_ddp=True`` and a distributed sampler was already added, +Lightning will not replace the existing one. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(replace_sampler_ddp=True) + +By setting to False, you have to add your own distributed sampler: + +.. 
code-block:: python + + # default used by the Trainer + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) + dataloader = DataLoader(dataset, batch_size=32, sampler=sampler) + +resume_from_checkpoint +^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +To resume training from a specific checkpoint pass in the path here. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(resume_from_checkpoint=None) + + # resume from a specific checkpoint + trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt') + +log_every_n_steps +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + + +How often to add logging rows (does not write to disk) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(log_every_n_steps=50) + +See Also: + - :ref:`logging` + + +sync_batchnorm +^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Enable synchronization between batchnorm layers across all GPUs. + +.. testcode:: + + trainer = Trainer(sync_batchnorm=True) + +track_grad_norm +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +- no tracking (-1) +- Otherwise tracks that norm (2 for 2-norm) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(track_grad_norm=-1) + + # track the 2-norm + trainer = Trainer(track_grad_norm=2) + +limit_train_batches +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of training dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + +Example:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + + # run through only 25% of the training set each epoch + trainer = Trainer(limit_train_batches=0.25) + + # run through only 10 batches of the training set each epoch + trainer = Trainer(limit_train_batches=10) + +truncated_bptt_steps +^^^^^^^^^^^^^^^^^^^^ + +.. 
raw:: html + + + +| + +Truncated backpropagation performs backprop every k steps of +a much longer sequence. + +If this is enabled, your batches will automatically get truncated +and the trainer will apply Truncated Backprop to it. + +(`Williams et al. "An efficient gradient-based algorithm for on-line training of +recurrent network trajectories." +`_) + +.. testcode:: + + # default used by the Trainer (ie: disabled) + trainer = Trainer(truncated_bptt_steps=None) + + # backprop every 5 steps in a batch + trainer = Trainer(truncated_bptt_steps=5) + +.. note:: Make sure your batches have a sequence dimension. + +Lightning takes care to split your batch along the time-dimension. + +.. code-block:: python + + # we use the second as the time dimension + # (batch, time, ...) + sub_batch = batch[0, 0:t, ...] + +Using this feature requires updating your LightningModule's +:meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg +with the hidden state from the previous truncated backprop step. + +.. code-block:: python + + # Truncated back-propagation through time + def training_step(self, batch, batch_idx, hiddens): + # hiddens are the hiddens from the previous truncated backprop step + out, hiddens = self.lstm(data, hiddens) + + return { + "loss": ..., + "hiddens": hiddens # remember to detach() this + } + +To modify how the batch is split, +override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`: + +.. testcode:: + + class LitMNIST(LightningModule): + def tbptt_split_batch(self, batch, split_size): + # do your own splitting on the batch + return splits + +val_check_interval +^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How often within one training epoch to check the validation set. +Can specify as float or int. + +- use (float) to check within a training epoch +- use (int) to check every n steps (batches) + +.. 
testcode:: + + # default used by the Trainer + trainer = Trainer(val_check_interval=1.0) + + # check validation set 4 times during a training epoch + trainer = Trainer(val_check_interval=0.25) + + # check validation set every 1000 training batches + # use this when using iterableDataset and your dataset has no length + # (ie: production cases with streaming data) + trainer = Trainer(val_check_interval=1000) + + +weights_save_path +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Directory of where to save weights if specified. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(weights_save_path=os.getcwd()) + + # save to your custom path + trainer = Trainer(weights_save_path='my/path') + +Example:: + + # if checkpoint callback used, then overrides the weights path + # **NOTE: this saves weights to some/path NOT my/path + checkpoint = ModelCheckpoint(dirpath='some/path') + trainer = Trainer( + callbacks=[checkpoint], + weights_save_path='my/path' + ) + +weights_summary +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Prints a summary of the weights when training begins. +Options: 'full', 'top', None. + +.. testcode:: + + # default used by the Trainer (ie: print summary of top level modules) + trainer = Trainer(weights_summary='top') + + # print full summary of all modules and submodules + trainer = Trainer(weights_summary='full') + + # don't print a summary + trainer = Trainer(weights_summary=None) + +----- + +Trainer class API +----------------- + +Methods +^^^^^^^ + +init +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.__init__ + :noindex: + +fit +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.fit + :noindex: + +test +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.test + :noindex: + +tune +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.tune + :noindex: + +Properties +^^^^^^^^^^ + +callback_metrics +**************** + +The metrics available to callbacks. 
These are automatically set when you log via `self.log` + +.. code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2) + + + callback_metrics = trainer.callback_metrics + assert callback_metrics['a_val'] == 2 + +current_epoch +************* + +The current epoch + +.. code-block:: python + + def training_step(self, batch, batch_idx): + current_epoch = self.trainer.current_epoch + if current_epoch > 100: + # do something + pass + + +logger (p) +********** + +The current logger being used. Here's an example using tensorboard + +.. code-block:: python + + def training_step(self, batch, batch_idx): + logger = self.trainer.logger + tensorboard = logger.experiment + + +logged_metrics +************** + +The metrics sent to the logger (visualizer). + +.. code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2, logger=True) + + + logged_metrics = trainer.logged_metrics + assert logged_metrics['a_val'] == 2 + +log_dir +******* +The directory for the current experiment. Use this to save images, etc. + +.. code-block:: python + + def training_step(self, batch, batch_idx): + img = ... + save_img(img, self.trainer.log_dir) + + + +is_global_zero +************** + +Whether this process is the global zero in multi-node training + +.. code-block:: python + + def training_step(self, batch, batch_idx): + if self.trainer.is_global_zero: + print('in node 0, accelerator 0') + +progress_bar_metrics +******************** + +The metrics sent to the progress bar. + +.. 
code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2, prog_bar=True) + + + progress_bar_metrics = trainer.progress_bar_metrics + assert progress_bar_metrics['a_val'] == 2 + diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index bd6900769f30f..cddee7e1feb3f 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -65,8 +65,8 @@ You can customize the checkpointing behavior to monitor any quantity of your tra # 3. Init ModelCheckpoint callback, monitoring 'val_loss' checkpoint_callback = ModelCheckpoint(monitor='val_loss') - # 4. Pass your callback to checkpoint_callback trainer flag - trainer = Trainer(checkpoint_callback=checkpoint_callback) + # 4. Add your callback to the callbacks list + trainer = Trainer(callbacks=[checkpoint_callback]) You can also control more advanced options, like `save_top_k`, to save the best k models and the mode of the monitored quantity (min/max/auto, where the mode is automatically inferred from the name of the monitored quantity), `save_weights_only` or `period` to set the interval of epochs between checkpoints, to avoid slowdowns. @@ -89,14 +89,14 @@ You can also control more advanced options, like `save_top_k`, to save the best save_top_k=3, mode='min') - trainer = Trainer(checkpoint_callback=checkpoint_callback) + trainer = Trainer(callbacks=[checkpoint_callback]) You can retrieve the checkpoint after training by calling .. code-block:: python checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - trainer = Trainer(checkpoint_callback=checkpoint_callback) + trainer = Trainer(callbacks=[checkpoint_callback]) trainer.fit(model) checkpoint_callback.best_model_path @@ -111,7 +111,7 @@ You can disable checkpointing by passing The Lightning checkpoint also saves the arguments passed into the LightningModule init -under the `module_arguments` key in the checkpoint. +under the `hyper_parameters` key in the checkpoint. .. 
code-block:: python @@ -119,10 +119,11 @@ under the `module_arguments` key in the checkpoint. def __init__(self, learning_rate, *args, **kwargs): super().__init__() + self.save_hyperparameters() # all init args were saved to the checkpoint checkpoint = torch.load(CKPT_PATH) - print(checkpoint['module_arguments']) + print(checkpoint['hyper_parameters']) # {'learning_rate': the_value} Manual saving @@ -140,7 +141,7 @@ You can manually save checkpoints and restore your model from the checkpointed s Checkpoint loading ****************** -To load a model along with its weights, biases and `module_arguments` use the following method: +To load a model along with its weights, biases and hyperparameters use the following method: .. code-block:: python diff --git a/environment.yml b/environment.yml index 09eb997b78ee6..3d59c1eeed0dd 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.16.4 - - pytorch>=1.3 + - pytorch>=1.3,<1.8 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,7 +41,7 @@ dependencies: - torchtext>=0.3.1 # Examples - - torchvision>=0.4.1 + - torchvision>=0.4.1,<0.9.0 - pip: - test-tube>=0.7.5 diff --git a/notebooks/01-mnist-hello-world.ipynb b/notebooks/01-mnist-hello-world.ipynb index 79bc9ebec9632..b0323458c228b 100644 --- a/notebooks/01-mnist-hello-world.ipynb +++ b/notebooks/01-mnist-hello-world.ipynb @@ -1,400 +1,448 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "01-mnist-hello-world.ipynb", - "provenance": [], - "collapsed_sections": [], - "authorship_tag": "ABX9TyOtAKVa5POQ6Xg3UcTQqXDJ", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - 
"colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7XbLCXGkll9", - "colab_type": "text" - }, - "source": [ - "# Introduction to Pytorch Lightning ⚡\n", - "\n", - "In this notebook, we'll go over the basics of lightning by preparing models to train on the [MNIST Handwritten Digits dataset](https://en.wikipedia.org/wiki/MNIST_database).\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2LODD6w9ixlT", - "colab_type": "text" - }, - "source": [ - "### Setup \n", - "Lightning is easy to install. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zK7-Gg69kMnG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! 
pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "w4_TYnt_keJi", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EHpyMPKFkVbZ", - "colab_type": "text" - }, - "source": [ - "## Simplest example\n", - "\n", - "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n", - "\n", - "**Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V7ELesz1kVQo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTModel(pl.LightningModule):\n", - "\n", - " def __init__(self):\n", - " super(MNISTModel, self).__init__()\n", - " self.l1 = torch.nn.Linear(28 * 28, 10)\n", - "\n", - " def forward(self, x):\n", - " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", - "\n", - " def training_step(self, batch, batch_nb):\n", - " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " return torch.optim.Adam(self.parameters(), lr=0.02)" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hIrtHg-Dv8TJ", - "colab_type": "text" - }, - "source": [ - "By using the `Trainer` you automatically get:\n", - "1. Tensorboard logging\n", - "2. Model checkpointing\n", - "3. Training and validation loop\n", - "4. 
early-stopping" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4Dk6Ykv8lI7X", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Init our model\n", - "mnist_model = MNISTModel()\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "train_loader = DataLoader(train_ds, batch_size=32)\n", - "\n", - "# Initialize a trainer\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "\n", - "# Train the model ⚡\n", - "trainer.fit(mnist_model, train_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KNpOoBeIjscS", - "colab_type": "text" - }, - "source": [ - "## A more complete MNIST Lightning Module Example\n", - "\n", - "That wasn't so hard was it?\n", - "\n", - "Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST...\n", - "\n", - "This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. This way, we can avoid writing extra code at the beginning of our script every time we want to run it.\n", - "\n", - "---\n", - "\n", - "### Note what the following built-in functions are doing:\n", - "\n", - "1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#setup) ⚙️\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). 
\n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#data-hooks) ♻️\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4DNItffri95Q", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitMNIST(pl.LightningModule):\n", - " \n", - " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # Set our init args as class attributes\n", - " self.data_dir = data_dir\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " # Hardcode some dataset specific attributes\n", - " self.num_classes = 10\n", - " self.dims = (1, 28, 28)\n", - " channels, width, height = self.dims\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # Define PyTorch model\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, self.num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", 
- " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - "\n", - " # Calling self.log will surface up scalars for you in TensorBoard\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def test_step(self, batch, batch_idx):\n", - " # Here we just reuse the validation_step for testing\n", - " return self.validation_step(batch, batch_idx)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer\n", - "\n", - " ####################\n", - " # DATA RELATED HOOKS\n", - " ####################\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - 
"execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Mb0U5Rk2kLBy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = LitMNIST()\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nht8AvMptY6I", - "colab_type": "text" - }, - "source": [ - "### Testing\n", - "\n", - "To test a model, call `trainer.test(model)`.\n", - "\n", - "Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically test using the best saved checkpoint (conditioned on val_loss)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PA151FkLtprO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.test()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T3-3lbbNtr5T", - "colab_type": "text" - }, - "source": [ - "### Bonus Tip\n", - "\n", - "You can keep calling `trainer.fit(model)` as many times as you'd like to continue training" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IFBwCbLet2r6", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8TRyS5CCt3n9", - "colab_type": "text" - }, - "source": [ - "In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you!" 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wizS-QiLuAYo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "i7XbLCXGkll9" + }, + "source": [ + "# Introduction to Pytorch Lightning ⚡\n", + "\n", + "In this notebook, we'll go over the basics of lightning by preparing models to train on the [MNIST Handwritten Digits dataset](https://en.wikipedia.org/wiki/MNIST_database).\n", + "\n", + "---\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2LODD6w9ixlT" + }, + "source": [ + "### Setup \n", + "Lightning is easy to install. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zK7-Gg69kMnG" + }, + "outputs": [], + "source": [ + "! 
pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "w4_TYnt_keJi" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from torch.utils.data import DataLoader, random_split\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EHpyMPKFkVbZ" + }, + "source": [ + "## Simplest example\n", + "\n", + "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n", + "\n", + "**Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V7ELesz1kVQo" + }, + "outputs": [], + "source": [ + "class MNISTModel(pl.LightningModule):\n", + "\n", + " def __init__(self):\n", + " super(MNISTModel, self).__init__()\n", + " self.l1 = torch.nn.Linear(28 * 28, 10)\n", + "\n", + " def forward(self, x):\n", + " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", + "\n", + " def training_step(self, batch, batch_nb):\n", + " x, y = batch\n", + " loss = F.cross_entropy(self(x), y)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.Adam(self.parameters(), lr=0.02)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hIrtHg-Dv8TJ" + }, + "source": [ + "By using the `Trainer` you automatically get:\n", + "1. Tensorboard logging\n", + "2. Model checkpointing\n", + "3. Training and validation loop\n", + "4. 
early-stopping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4Dk6Ykv8lI7X" + }, + "outputs": [], + "source": [ + "# Init our model\n", + "mnist_model = MNISTModel()\n", + "\n", + "# Init DataLoader from MNIST Dataset\n", + "train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", + "train_loader = DataLoader(train_ds, batch_size=32)\n", + "\n", + "# Initialize a trainer\n", + "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", + "\n", + "# Train the model ⚡\n", + "trainer.fit(mnist_model, train_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KNpOoBeIjscS" + }, + "source": [ + "## A more complete MNIST Lightning Module Example\n", + "\n", + "That wasn't so hard was it?\n", + "\n", + "Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST...\n", + "\n", + "This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. This way, we can avoid writing extra code at the beginning of our script every time we want to run it.\n", + "\n", + "---\n", + "\n", + "### Note what the following built-in functions are doing:\n", + "\n", + "1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾\n", + " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", + " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", + "\n", + "2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#setup) ⚙️\n", + " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). 
\n", + " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", + " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).\n", + " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", + "\n", + "3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#data-hooks) ♻️\n", + " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4DNItffri95Q" + }, + "outputs": [], + "source": [ + "class LitMNIST(pl.LightningModule):\n", + " \n", + " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " # Set our init args as class attributes\n", + " self.data_dir = data_dir\n", + " self.hidden_size = hidden_size\n", + " self.learning_rate = learning_rate\n", + "\n", + " # Hardcode some dataset specific attributes\n", + " self.num_classes = 10\n", + " self.dims = (1, 28, 28)\n", + " channels, width, height = self.dims\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # Define PyTorch model\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, self.num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " 
return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + "\n", + " # Calling self.log will surface up scalars for you in TensorBoard\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def test_step(self, batch, batch_idx):\n", + " # Here we just reuse the validation_step for testing\n", + " return self.validation_step(batch, batch_idx)\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + " return optimizer\n", + "\n", + " ####################\n", + " # DATA RELATED HOOKS\n", + " ####################\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return 
DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Mb0U5Rk2kLBy" + }, + "outputs": [], + "source": [ + "model = LitMNIST()\n", + "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nht8AvMptY6I" + }, + "source": [ + "### Testing\n", + "\n", + "To test a model, call `trainer.test(model)`.\n", + "\n", + "Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically test using the best saved checkpoint (conditioned on val_loss)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PA151FkLtprO" + }, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "T3-3lbbNtr5T" + }, + "source": [ + "### Bonus Tip\n", + "\n", + "You can keep calling `trainer.fit(model)` as many times as you'd like to continue training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IFBwCbLet2r6" + }, + "outputs": [], + "source": [ + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8TRyS5CCt3n9" + }, + "source": [ + "In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wizS-QiLuAYo" + }, + "outputs": [], + "source": [ + "# Start tensorboard.\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir lightning_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n", + "\n", + "### Interested in SOTA AI models? Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning), which can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions!\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues pages and filter for \"good first issue\". 
\n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples!\n", + "\n", + "### Many thanks from the entire PyTorch Lightning team for your interest!\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyOtAKVa5POQ6Xg3UcTQqXDJ", + "collapsed_sections": [], + "include_colab_link": true, + "name": "01-mnist-hello-world.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/02-datamodules.ipynb b/notebooks/02-datamodules.ipynb index 3e027cd304c77..599cb1d6bd289 100644 --- a/notebooks/02-datamodules.ipynb +++ b/notebooks/02-datamodules.ipynb @@ -1,540 +1,588 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "02-datamodules.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true, - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { 
- 
"id": "2O5r7QvP8-rt", - "colab_type": "text" - }, - "source": [ - "# PyTorch Lightning DataModules ⚡\n", - "\n", - "With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.\n", - "\n", - "This notebook will walk you through how to start using Datamodules.\n", - "\n", - "The most up to date documentation on datamodules can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html).\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6RYMhmfA9ATN", - "colab_type": "text" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lj2zD-wsbvGr", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! 
pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8g2mbvy-9xDI", - "colab_type": "text" - }, - "source": [ - "# Introduction\n", - "\n", - "First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eg-xDlmDdAwy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import random_split, DataLoader\n", - "\n", - "# Note - you must have torchvision installed for this example\n", - "from torchvision.datasets import MNIST, CIFAR10\n", - "from torchvision import transforms" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DzgY7wi88UuG", - "colab_type": "text" - }, - "source": [ - "## Defining the LitMNISTModel\n", - "\n", - "Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST Handwritten Digits.\n", - "\n", - "Unfortunately, we have hardcoded dataset-specific items within the model, forever limiting it to working with MNIST Data. 😢\n", - "\n", - "This is fine if you don't plan on training/evaluating your model on different datasets. However, in many cases, this can become bothersome when you want to try out your architecture with different datasets." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IQkW8_FF5nU2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitMNIST(pl.LightningModule):\n", - " \n", - " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # We hardcode dataset specific stuff here.\n", - " self.data_dir = data_dir\n", - " self.num_classes = 10\n", - " self.dims = (1, 28, 28)\n", - " channels, width, height = self.dims\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " # Build model\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, self.num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer\n", - "\n", - " ####################\n", - " # DATA RELATED HOOKS\n", - " ####################\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, 
train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K7sg9KQd-QIO", - "colab_type": "text" - }, - "source": [ - "## Training the ListMNIST Model" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QxDNDaus6byD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = LitMNIST()\n", - "trainer = pl.Trainer(max_epochs=2, gpus=1, progress_bar_refresh_rate=20)\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dY8d6GxmB0YU", - "colab_type": "text" - }, - "source": [ - "# Using DataModules\n", - "\n", - "DataModules are a way of decoupling data-related hooks from the `LightningModule` so you can develop dataset agnostic models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eJeT5bW081wn", - "colab_type": "text" - }, - "source": [ - "## Defining The MNISTDataModule\n", - "\n", - "Let's go over each function in the class below and talk about what they're doing:\n", - "\n", - "1. 
```__init__```\n", - " - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset.\n", - " - Defines a transform that will be applied across train, val, and test dataset splits.\n", - " - Defines default `self.dims`, which is a tuple returned from `datamodule.size()` that can help you initialize models.\n", - "\n", - "\n", - "2. ```prepare_data```\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "3. ```setup```\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`.\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "\n", - "4. 
```x_dataloader```\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DfGKyGwG_X9v", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": 
{ - "id": "H2Yoj-9M9dS7", - "colab_type": "text" - }, - "source": [ - "## Defining the dataset agnostic `LitModel`\n", - "\n", - "Below, we define the same model as the `LitMNIST` model we made earlier. \n", - "\n", - "However, this time our model has the freedom to use any input data that we'd like 🔥." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PM2IISuOBDIu", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitModel(pl.LightningModule):\n", - " \n", - " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # We take in input dimensions as parameters and use those to dynamically build model.\n", - " self.channels = channels\n", - " self.width = width\n", - " self.height = height\n", - " self.num_classes = num_classes\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - "\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " 
return optimizer" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G4Z5olPe-xEo", - "colab_type": "text" - }, - "source": [ - "## Training the `LitModel` using the `MNISTDataModule`\n", - "\n", - "Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kV48vP_9mEli", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, gpus=1)\n", - "# Pass the datamodule as arg to trainer.fit to override model hooks :)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WNxrugIGRRv5", - "colab_type": "text" - }, - "source": [ - "## Defining the CIFAR10 DataModule\n", - "\n", - "Lets prove the `LitModel` we made earlier is dataset agnostic by defining a new datamodule for the CIFAR10 dataset." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tkaYLU7RT5P", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class CIFAR10DataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", - " ])\n", - "\n", - " self.dims = (3, 32, 32)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " CIFAR10(self.data_dir, train=True, download=True)\n", - " CIFAR10(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)\n", - " self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.cifar_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.cifar_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.cifar_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BrXxf3oX_gsZ", - "colab_type": "text" - }, - "source": [ - "## Training the `LitModel` using the `CIFAR10DataModule`\n", - "\n", - "Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset.\n", - "\n", - "The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "sd-SbWi_krdj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = CIFAR10DataModule()\n", - "model = LitModel(*dm.size(), dm.num_classes, hidden_size=256)\n", - "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20, gpus=1)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2O5r7QvP8-rt" + }, + "source": [ + "# PyTorch Lightning DataModules ⚡\n", + "\n", + "With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.\n", + "\n", + "This notebook will walk you through how to start using Datamodules.\n", + "\n", + "The most up to date documentation on datamodules can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html).\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6RYMhmfA9ATN" + }, + "source": [ + "### Setup\n", + "Lightning is easy to install. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lj2zD-wsbvGr" + }, + "outputs": [], + "source": [ + "! 
pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8g2mbvy-9xDI" + }, + "source": [ + "# Introduction\n", + "\n", + "First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "eg-xDlmDdAwy" + }, + "outputs": [], + "source": [ + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy\n", + "import torch\n", + "from torch import nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import random_split, DataLoader\n", + "\n", + "# Note - you must have torchvision installed for this example\n", + "from torchvision.datasets import MNIST, CIFAR10\n", + "from torchvision import transforms" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DzgY7wi88UuG" + }, + "source": [ + "## Defining the LitMNISTModel\n", + "\n", + "Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST Handwritten Digits.\n", + "\n", + "Unfortunately, we have hardcoded dataset-specific items within the model, forever limiting it to working with MNIST Data. 😢\n", + "\n", + "This is fine if you don't plan on training/evaluating your model on different datasets. However, in many cases, this can become bothersome when you want to try out your architecture with different datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IQkW8_FF5nU2" + }, + "outputs": [], + "source": [ + "class LitMNIST(pl.LightningModule):\n", + " \n", + " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " # We hardcode dataset specific stuff here.\n", + " self.data_dir = data_dir\n", + " self.num_classes = 10\n", + " self.dims = (1, 28, 28)\n", + " channels, width, height = self.dims\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " self.hidden_size = hidden_size\n", + " self.learning_rate = learning_rate\n", + "\n", + " # Build model\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, self.num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + " return optimizer\n", + "\n", + " ####################\n", + " # DATA RELATED HOOKS\n", + " ####################\n", + "\n", + " def prepare_data(self):\n", 
+ " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "K7sg9KQd-QIO" + }, + "source": [ + "## Training the LitMNIST Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "QxDNDaus6byD" + }, + "outputs": [], + "source": [ + "model = LitMNIST()\n", + "trainer = pl.Trainer(max_epochs=2, gpus=1, progress_bar_refresh_rate=20)\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dY8d6GxmB0YU" + }, + "source": [ + "# Using DataModules\n", + "\n", + "DataModules are a way of decoupling data-related hooks from the `LightningModule` so you can develop dataset-agnostic models." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eJeT5bW081wn" + }, + "source": [ + "## Defining The MNISTDataModule\n", + "\n", + "Let's go over each function in the class below and talk about what they're doing:\n", + "\n", + "1. 
```__init__```\n", + " - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset.\n", + " - Defines a transform that will be applied across train, val, and test dataset splits.\n", + " - Defines default `self.dims`, which is a tuple returned from `datamodule.size()` that can help you initialize models.\n", + "\n", + "\n", + "2. ```prepare_data```\n", + " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", + " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", + "\n", + "3. ```setup```\n", + " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", + " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", + " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`.\n", + " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", + "\n", + "\n", + "4. 
```x_dataloader```\n", + " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DfGKyGwG_X9v" + }, + "outputs": [], + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{ + "colab_type": "text", + "id": "H2Yoj-9M9dS7" + }, + "source": [ + "## Defining the dataset agnostic `LitModel`\n", + "\n", + "Below, we define the same model as the `LitMNIST` model we made earlier. \n", + "\n", + "However, this time our model has the freedom to use any input data that we'd like 🔥." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PM2IISuOBDIu" + }, + "outputs": [], + "source": [ + "class LitModel(pl.LightningModule):\n", + " \n", + " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " # We take in input dimensions as parameters and use those to dynamically build model.\n", + " self.channels = channels\n", + " self.width = width\n", + " self.height = height\n", + " self.num_classes = num_classes\n", + " self.hidden_size = hidden_size\n", + " self.learning_rate = learning_rate\n", + "\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + "\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = 
torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + " return optimizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "G4Z5olPe-xEo" + }, + "source": [ + "## Training the `LitModel` using the `MNISTDataModule`\n", + "\n", + "Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kV48vP_9mEli" + }, + "outputs": [], + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, gpus=1)\n", + "# Pass the datamodule as arg to trainer.fit to override model hooks :)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WNxrugIGRRv5" + }, + "source": [ + "## Defining the CIFAR10 DataModule\n", + "\n", + "Let's prove the `LitModel` we made earlier is dataset-agnostic by defining a new datamodule for the CIFAR10 dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1tkaYLU7RT5P" + }, + "outputs": [], + "source": [ + "class CIFAR10DataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", + " ])\n", + "\n", + " self.dims = (3, 32, 32)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " CIFAR10(self.data_dir, train=True, download=True)\n", + " CIFAR10(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)\n", + " self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.cifar_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.cifar_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.cifar_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BrXxf3oX_gsZ" + }, + "source": [ + "## Training the `LitModel` using the `CIFAR10DataModule`\n", + "\n", + "Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset.\n", + "\n", + "The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sd-SbWi_krdj" + }, + "outputs": [], + "source": [ + "dm = CIFAR10DataModule()\n", + "model = LitModel(*dm.size(), dm.num_classes, hidden_size=256)\n", + "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20, gpus=1)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n", + "\n", + "### Interested in SOTA AI models? Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning), which can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions!\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". 
\n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "02-datamodules.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/03-basic-gan.ipynb b/notebooks/03-basic-gan.ipynb index a19153e133a5f..31555265938d8 100644 --- a/notebooks/03-basic-gan.ipynb +++ b/notebooks/03-basic-gan.ipynb @@ -1,424 +1,472 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "03-basic-gan.ipynb", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J37PBnE_x7IW", - "colab_type": "text" - }, - "source": [ - 
"# PyTorch Lightning Basic GAN Tutorial ⚡\n", - "\n", - "How to train a GAN!\n", - "\n", - "Main takeaways:\n", - "1. Generator and discriminator are arbitrary PyTorch modules.\n", - "2. training_step does both the generator and discriminator training.\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kg2MKpRmybht", - "colab_type": "text" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply `pip install pytorch-lightning`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LfrJLKPFyhsK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BjEPuiVLyanw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import os\n", - "from argparse import ArgumentParser\n", - "from collections import OrderedDict\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "\n", - "import pytorch_lightning as pl" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OuXJzr4G2uHV", - "colab_type": "text" - }, - "source": [ - "### MNIST DataModule\n", - "\n", - "Below, we define a DataModule for the MNIST Dataset. 
To learn more about DataModules, check out our tutorial on them or see the [latest docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DOY_nHu328g7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './', batch_size: int = 64, num_workers: int = 8):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.batch_size = batch_size\n", - " self.num_workers = num_workers\n", - "\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def test_dataloader(self):\n", - 
" return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tW3c0QrQyF9P", - "colab_type": "text" - }, - "source": [ - "### A. Generator" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "0E2QDjl5yWtz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class Generator(nn.Module):\n", - " def __init__(self, latent_dim, img_shape):\n", - " super().__init__()\n", - " self.img_shape = img_shape\n", - "\n", - " def block(in_feat, out_feat, normalize=True):\n", - " layers = [nn.Linear(in_feat, out_feat)]\n", - " if normalize:\n", - " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", - " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", - " return layers\n", - "\n", - " self.model = nn.Sequential(\n", - " *block(latent_dim, 128, normalize=False),\n", - " *block(128, 256),\n", - " *block(256, 512),\n", - " *block(512, 1024),\n", - " nn.Linear(1024, int(np.prod(img_shape))),\n", - " nn.Tanh()\n", - " )\n", - "\n", - " def forward(self, z):\n", - " img = self.model(z)\n", - " img = img.view(img.size(0), *self.img_shape)\n", - " return img" - ], - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uyrltsGvyaI3", - "colab_type": "text" - }, - "source": [ - "### B. 
Discriminator" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ed3MR3vnyxyW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class Discriminator(nn.Module):\n", - " def __init__(self, img_shape):\n", - " super().__init__()\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Linear(int(np.prod(img_shape)), 512),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(512, 256),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(256, 1),\n", - " nn.Sigmoid(),\n", - " )\n", - "\n", - " def forward(self, img):\n", - " img_flat = img.view(img.size(0), -1)\n", - " validity = self.model(img_flat)\n", - "\n", - " return validity" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BwUMom3ryySK", - "colab_type": "text" - }, - "source": [ - "### C. GAN\n", - "\n", - "#### A couple of cool features to check out in this example...\n", - "\n", - " - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU).\n", - " - Lightning will put your dataloader data on the right device automatically\n", - " - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device.\n", - " - `type_as` is the way we recommend to do this.\n", - " - This example shows how to use multiple dataloaders in your `LightningModule`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3vKszYf6y1Vv", - "colab_type": "code", - "colab": {} - }, - "source": [ - " class GAN(pl.LightningModule):\n", - "\n", - " def __init__(\n", - " self,\n", - " channels,\n", - " width,\n", - " height,\n", - " latent_dim: int = 100,\n", - " lr: float = 0.0002,\n", - " b1: float = 0.5,\n", - " b2: float = 0.999,\n", - " batch_size: int = 64,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.save_hyperparameters()\n", - "\n", - " # networks\n", - " data_shape = (channels, width, height)\n", - " self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape)\n", - " self.discriminator = Discriminator(img_shape=data_shape)\n", - "\n", - " self.validation_z = torch.randn(8, self.hparams.latent_dim)\n", - "\n", - " self.example_input_array = torch.zeros(2, self.hparams.latent_dim)\n", - "\n", - " def forward(self, z):\n", - " return self.generator(z)\n", - "\n", - " def adversarial_loss(self, y_hat, y):\n", - " return F.binary_cross_entropy(y_hat, y)\n", - "\n", - " def training_step(self, batch, batch_idx, optimizer_idx):\n", - " imgs, _ = batch\n", - "\n", - " # sample noise\n", - " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", - " z = z.type_as(imgs)\n", - "\n", - " # train generator\n", - " if optimizer_idx == 0:\n", - "\n", - " # generate images\n", - " self.generated_imgs = self(z)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self.generated_imgs[:6]\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, 0)\n", - "\n", - " # ground truth result (ie: all fake)\n", - " # put on GPU because we created this tensor inside training_loop\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " # adversarial loss is binary cross-entropy\n", - " g_loss = self.adversarial_loss(self.discriminator(self(z)), valid)\n", - " tqdm_dict = {'g_loss': 
g_loss}\n", - " output = OrderedDict({\n", - " 'loss': g_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " # train discriminator\n", - " if optimizer_idx == 1:\n", - " # Measure discriminator's ability to classify real from generated samples\n", - "\n", - " # how well can it label as real?\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", - "\n", - " # how well can it label as fake?\n", - " fake = torch.zeros(imgs.size(0), 1)\n", - " fake = fake.type_as(imgs)\n", - "\n", - " fake_loss = self.adversarial_loss(\n", - " self.discriminator(self(z).detach()), fake)\n", - "\n", - " # discriminator loss is the average of these\n", - " d_loss = (real_loss + fake_loss) / 2\n", - " tqdm_dict = {'d_loss': d_loss}\n", - " output = OrderedDict({\n", - " 'loss': d_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " def configure_optimizers(self):\n", - " lr = self.hparams.lr\n", - " b1 = self.hparams.b1\n", - " b2 = self.hparams.b2\n", - "\n", - " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", - " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", - " return [opt_g, opt_d], []\n", - "\n", - " def on_epoch_end(self):\n", - " z = self.validation_z.type_as(self.generator.model[0].weight)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self(z)\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, self.current_epoch)" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ey5FmJPnzm_E", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = MNISTDataModule()\n", - "model = GAN(*dm.size())\n", - "trainer = pl.Trainer(gpus=1, max_epochs=5, 
progress_bar_refresh_rate=20)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "MlECc7cHzolp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "J37PBnE_x7IW" + }, + "source": [ + "# PyTorch Lightning Basic GAN Tutorial ⚡\n", + "\n", + "How to train a GAN!\n", + "\n", + "Main takeaways:\n", + "1. Generator and discriminator are arbitrary PyTorch modules.\n", + "2. training_step does both the generator and discriminator training.\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kg2MKpRmybht" + }, + "source": [ + "### Setup\n", + "Lightning is easy to install. Simply `pip install pytorch-lightning`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LfrJLKPFyhsK" + }, + "outputs": [], + "source": [ + "! 
pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "BjEPuiVLyanw" + }, + "outputs": [], + "source": [ + "import os\n", + "from argparse import ArgumentParser\n", + "from collections import OrderedDict\n", + "\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import torchvision\n", + "import torchvision.transforms as transforms\n", + "from torch.utils.data import DataLoader, random_split\n", + "from torchvision.datasets import MNIST\n", + "\n", + "import pytorch_lightning as pl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "OuXJzr4G2uHV" + }, + "source": [ + "### MNIST DataModule\n", + "\n", + "Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial on them or see the [latest docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DOY_nHu328g7" + }, + "outputs": [], + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './', batch_size: int = 64, num_workers: int = 8):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.batch_size = batch_size\n", + " self.num_workers = num_workers\n", + "\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tW3c0QrQyF9P" + }, + "source": [ + "### A. Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0E2QDjl5yWtz" + }, + "outputs": [], + "source": [ + "class Generator(nn.Module):\n", + " def __init__(self, latent_dim, img_shape):\n", + " super().__init__()\n", + " self.img_shape = img_shape\n", + "\n", + " def block(in_feat, out_feat, normalize=True):\n", + " layers = [nn.Linear(in_feat, out_feat)]\n", + " if normalize:\n", + " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", + " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", + " return layers\n", + "\n", + " self.model = nn.Sequential(\n", + " *block(latent_dim, 128, normalize=False),\n", + " *block(128, 256),\n", + " *block(256, 512),\n", + " *block(512, 1024),\n", + " nn.Linear(1024, int(np.prod(img_shape))),\n", + " nn.Tanh()\n", + " )\n", + "\n", + " def forward(self, z):\n", + " img = self.model(z)\n", + " img = img.view(img.size(0), *self.img_shape)\n", + " return img" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uyrltsGvyaI3" + }, + "source": [ + "### B. 
Discriminator" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ed3MR3vnyxyW" + }, + "outputs": [], + "source": [ + "class Discriminator(nn.Module):\n", + " def __init__(self, img_shape):\n", + " super().__init__()\n", + "\n", + " self.model = nn.Sequential(\n", + " nn.Linear(int(np.prod(img_shape)), 512),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " nn.Linear(512, 256),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " nn.Linear(256, 1),\n", + " nn.Sigmoid(),\n", + " )\n", + "\n", + " def forward(self, img):\n", + " img_flat = img.view(img.size(0), -1)\n", + " validity = self.model(img_flat)\n", + "\n", + " return validity" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BwUMom3ryySK" + }, + "source": [ + "### C. GAN\n", + "\n", + "#### A couple of cool features to check out in this example...\n", + "\n", + " - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU).\n", + " - Lightning will put your dataloader data on the right device automatically\n", + " - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device.\n", + " - `type_as` is the way we recommend to do this.\n", + " - This example shows how to use multiple optimizers in your `LightningModule`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3vKszYf6y1Vv" + }, + "outputs": [], + "source": [ + " class GAN(pl.LightningModule):\n", + "\n", + " def __init__(\n", + " self,\n", + " channels,\n", + " width,\n", + " height,\n", + " latent_dim: int = 100,\n", + " lr: float = 0.0002,\n", + " b1: float = 0.5,\n", + " b2: float = 0.999,\n", + " batch_size: int = 64,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", + " self.save_hyperparameters()\n", + "\n", + " # networks\n", + " data_shape = (channels, width, height)\n", + " self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape)\n", + " self.discriminator = Discriminator(img_shape=data_shape)\n", + "\n", + " self.validation_z = torch.randn(8, self.hparams.latent_dim)\n", + "\n", + " self.example_input_array = torch.zeros(2, self.hparams.latent_dim)\n", + "\n", + " def forward(self, z):\n", + " return self.generator(z)\n", + "\n", + " def adversarial_loss(self, y_hat, y):\n", + " return F.binary_cross_entropy(y_hat, y)\n", + "\n", + " def training_step(self, batch, batch_idx, optimizer_idx):\n", + " imgs, _ = batch\n", + "\n", + " # sample noise\n", + " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", + " z = z.type_as(imgs)\n", + "\n", + " # train generator\n", + " if optimizer_idx == 0:\n", + "\n", + " # generate images\n", + " self.generated_imgs = self(z)\n", + "\n", + " # log sampled images\n", + " sample_imgs = self.generated_imgs[:6]\n", + " grid = torchvision.utils.make_grid(sample_imgs)\n", + " self.logger.experiment.add_image('generated_images', grid, 0)\n", + "\n", + " # ground truth result (ie: all fake)\n", + " # put on GPU because we created this tensor inside training_loop\n", + " valid = torch.ones(imgs.size(0), 1)\n", + " valid = valid.type_as(imgs)\n", + "\n", + " # adversarial loss is binary cross-entropy\n", + " g_loss = self.adversarial_loss(self.discriminator(self(z)), 
valid)\n", + " tqdm_dict = {'g_loss': g_loss}\n", + " output = OrderedDict({\n", + " 'loss': g_loss,\n", + " 'progress_bar': tqdm_dict,\n", + " 'log': tqdm_dict\n", + " })\n", + " return output\n", + "\n", + " # train discriminator\n", + " if optimizer_idx == 1:\n", + " # Measure discriminator's ability to classify real from generated samples\n", + "\n", + " # how well can it label as real?\n", + " valid = torch.ones(imgs.size(0), 1)\n", + " valid = valid.type_as(imgs)\n", + "\n", + " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", + "\n", + " # how well can it label as fake?\n", + " fake = torch.zeros(imgs.size(0), 1)\n", + " fake = fake.type_as(imgs)\n", + "\n", + " fake_loss = self.adversarial_loss(\n", + " self.discriminator(self(z).detach()), fake)\n", + "\n", + " # discriminator loss is the average of these\n", + " d_loss = (real_loss + fake_loss) / 2\n", + " tqdm_dict = {'d_loss': d_loss}\n", + " output = OrderedDict({\n", + " 'loss': d_loss,\n", + " 'progress_bar': tqdm_dict,\n", + " 'log': tqdm_dict\n", + " })\n", + " return output\n", + "\n", + " def configure_optimizers(self):\n", + " lr = self.hparams.lr\n", + " b1 = self.hparams.b1\n", + " b2 = self.hparams.b2\n", + "\n", + " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", + " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", + " return [opt_g, opt_d], []\n", + "\n", + " def on_epoch_end(self):\n", + " z = self.validation_z.type_as(self.generator.model[0].weight)\n", + "\n", + " # log sampled images\n", + " sample_imgs = self(z)\n", + " grid = torchvision.utils.make_grid(sample_imgs)\n", + " self.logger.experiment.add_image('generated_images', grid, self.current_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ey5FmJPnzm_E" + }, + "outputs": [], + "source": [ + "dm = MNISTDataModule()\n", + "model = GAN(*dm.size())\n", + 
"trainer = pl.Trainer(gpus=1, max_epochs=5, progress_bar_refresh_rate=20)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "MlECc7cHzolp" + }, + "outputs": [], + "source": [ + "# Start tensorboard.\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir lightning_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n", + "\n", + "### Interested in SOTA AI models? Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning), which can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions!\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". 
\n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "03-basic-gan.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb index ae7424c7d4864..037b24e4ddd9d 100644 --- a/notebooks/04-transformers-text-classification.ipynb +++ b/notebooks/04-transformers-text-classification.ipynb @@ -1,543 +1,591 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "04-transformers-text-classification.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8ag5ANQPJ_j9" + }, + "source": [ + "# Finetune 🤗 Transformers Models with PyTorch Lightning ⚡\n", + "\n", + "This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a `LightningDataModule`. 
Then, we write a class to perform text classification on any dataset from the[ GLUE Benchmark](https://gluebenchmark.com/). (We just show CoLA and MRPC due to constraint on compute/disk)\n", + "\n", + "[HuggingFace's NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) can help you get a feel for the two datasets we will use and what tasks they are solving for.\n", + "\n", + "---\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Ask a question on [the forum](https://forums.pytorchlightning.ai/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", + "\n", + " - [HuggingFace datasets](https://github.com/huggingface/datasets)\n", + " - [HuggingFace transformers](https://github.com/huggingface/transformers)" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "8ag5ANQPJ_j9", - "colab_type": "text" - }, - "source": [ - "# Finetune 🤗 Transformers Models with PyTorch Lightning ⚡\n", - "\n", - "This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a `LightningDataModule`. Then, we write a class to perform text classification on any dataset from the[ GLUE Benchmark](https://gluebenchmark.com/). 
(We just show CoLA and MRPC due to constraint on compute/disk)\n", - "\n", - "[HuggingFace's NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) can help you get a feel for the two datasets we will use and what tasks they are solving for.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Ask a question on [the forum](https://forums.pytorchlightning.ai/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", - "\n", - " - [HuggingFace datasets](https://github.com/huggingface/datasets)\n", - " - [HuggingFace transformers](https://github.com/huggingface/transformers)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fqlsVTj7McZ3", - "colab_type": "text" - }, - "source": [ - "### Setup" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OIhHrRL-MnKK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!pip install pytorch-lightning datasets transformers" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6yuQT_ZQMpCg", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from argparse import ArgumentParser\n", - "from datetime import datetime\n", - "from typing import Optional\n", - "\n", - "import datasets\n", - "import numpy as np\n", - "import pytorch_lightning as pl\n", - "import torch\n", - "from torch.utils.data import DataLoader\n", - "from transformers import (\n", - " AdamW,\n", - " AutoModelForSequenceClassification,\n", - " AutoConfig,\n", - " AutoTokenizer,\n", - " get_linear_schedule_with_warmup,\n", - " glue_compute_metrics\n", - ")" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ORJfiuiNZ_N", - "colab_type": "text" - }, - "source": [ - "## GLUE 
DataModule" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jW9xQhZxMz1G", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class GLUEDataModule(pl.LightningDataModule):\n", - "\n", - " task_text_field_map = {\n", - " 'cola': ['sentence'],\n", - " 'sst2': ['sentence'],\n", - " 'mrpc': ['sentence1', 'sentence2'],\n", - " 'qqp': ['question1', 'question2'],\n", - " 'stsb': ['sentence1', 'sentence2'],\n", - " 'mnli': ['premise', 'hypothesis'],\n", - " 'qnli': ['question', 'sentence'],\n", - " 'rte': ['sentence1', 'sentence2'],\n", - " 'wnli': ['sentence1', 'sentence2'],\n", - " 'ax': ['premise', 'hypothesis']\n", - " }\n", - "\n", - " glue_task_num_labels = {\n", - " 'cola': 2,\n", - " 'sst2': 2,\n", - " 'mrpc': 2,\n", - " 'qqp': 2,\n", - " 'stsb': 1,\n", - " 'mnli': 3,\n", - " 'qnli': 2,\n", - " 'rte': 2,\n", - " 'wnli': 2,\n", - " 'ax': 3\n", - " }\n", - "\n", - " loader_columns = [\n", - " 'datasets_idx',\n", - " 'input_ids',\n", - " 'token_type_ids',\n", - " 'attention_mask',\n", - " 'start_positions',\n", - " 'end_positions',\n", - " 'labels'\n", - " ]\n", - "\n", - " def __init__(\n", - " self,\n", - " model_name_or_path: str,\n", - " task_name: str ='mrpc',\n", - " max_seq_length: int = 128,\n", - " train_batch_size: int = 32,\n", - " eval_batch_size: int = 32,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.model_name_or_path = model_name_or_path\n", - " self.task_name = task_name\n", - " self.max_seq_length = max_seq_length\n", - " self.train_batch_size = train_batch_size\n", - " self.eval_batch_size = eval_batch_size\n", - "\n", - " self.text_fields = self.task_text_field_map[task_name]\n", - " self.num_labels = self.glue_task_num_labels[task_name]\n", - " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - "\n", - " def setup(self, stage):\n", - " self.dataset = datasets.load_dataset('glue', self.task_name)\n", - "\n", - " for split in self.dataset.keys():\n", - " 
self.dataset[split] = self.dataset[split].map(\n", - " self.convert_to_features,\n", - " batched=True,\n", - " remove_columns=['label'],\n", - " )\n", - " self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]\n", - " self.dataset[split].set_format(type=\"torch\", columns=self.columns)\n", - "\n", - " self.eval_splits = [x for x in self.dataset.keys() if 'validation' in x]\n", - "\n", - " def prepare_data(self):\n", - " datasets.load_dataset('glue', self.task_name)\n", - " AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - " \n", - " def train_dataloader(self):\n", - " return DataLoader(self.dataset['train'], batch_size=self.train_batch_size)\n", - " \n", - " def val_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['validation'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def test_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['test'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def convert_to_features(self, example_batch, indices=None):\n", - "\n", - " # Either encode single sentence or sentence pairs\n", - " if len(self.text_fields) > 1:\n", - " texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))\n", - " else:\n", - " texts_or_text_pairs = example_batch[self.text_fields[0]]\n", - "\n", - " # Tokenize the text/text pairs\n", - " features = self.tokenizer.batch_encode_plus(\n", - " texts_or_text_pairs,\n", - " max_length=self.max_seq_length,\n", - " pad_to_max_length=True,\n", - " truncation=True\n", - " )\n", - "\n", - " # Rename label to labels to make it easier to 
pass to model forward\n", - " features['labels'] = example_batch['label']\n", - "\n", - " return features" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jQC3a6KuOpX3", - "colab_type": "text" - }, - "source": [ - "#### You could use this datamodule with standalone PyTorch if you wanted..." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "JCMH3IAsNffF", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = GLUEDataModule('distilbert-base-uncased')\n", - "dm.prepare_data()\n", - "dm.setup('fit')\n", - "next(iter(dm.train_dataloader()))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l9fQ_67BO2Lj", - "colab_type": "text" - }, - "source": [ - "## GLUE Model" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gtn5YGKYO65B", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class GLUETransformer(pl.LightningModule):\n", - " def __init__(\n", - " self,\n", - " model_name_or_path: str,\n", - " num_labels: int,\n", - " learning_rate: float = 2e-5,\n", - " adam_epsilon: float = 1e-8,\n", - " warmup_steps: int = 0,\n", - " weight_decay: float = 0.0,\n", - " train_batch_size: int = 32,\n", - " eval_batch_size: int = 32,\n", - " eval_splits: Optional[list] = None,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - "\n", - " self.save_hyperparameters()\n", - "\n", - " self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)\n", - " self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)\n", - " self.metric = datasets.load_metric(\n", - " 'glue',\n", - " self.hparams.task_name,\n", - " experiment_id=datetime.now().strftime(\"%d-%m-%Y_%H-%M-%S\")\n", - " )\n", - "\n", - " def forward(self, **inputs):\n", - " return self.model(**inputs)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " outputs = self(**batch)\n", - " 
loss = outputs[0]\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx, dataloader_idx=0):\n", - " outputs = self(**batch)\n", - " val_loss, logits = outputs[:2]\n", - "\n", - " if self.hparams.num_labels >= 1:\n", - " preds = torch.argmax(logits, axis=1)\n", - " elif self.hparams.num_labels == 1:\n", - " preds = logits.squeeze()\n", - "\n", - " labels = batch[\"labels\"]\n", - "\n", - " return {'loss': val_loss, \"preds\": preds, \"labels\": labels}\n", - "\n", - " def validation_epoch_end(self, outputs):\n", - " if self.hparams.task_name == 'mnli':\n", - " for i, output in enumerate(outputs):\n", - " # matched or mismatched\n", - " split = self.hparams.eval_splits[i].split('_')[-1]\n", - " preds = torch.cat([x['preds'] for x in output]).detach().cpu().numpy()\n", - " labels = torch.cat([x['labels'] for x in output]).detach().cpu().numpy()\n", - " loss = torch.stack([x['loss'] for x in output]).mean()\n", - " self.log(f'val_loss_{split}', loss, prog_bar=True)\n", - " split_metrics = {f\"{k}_{split}\": v for k, v in self.metric.compute(predictions=preds, references=labels).items()}\n", - " self.log_dict(split_metrics, prog_bar=True)\n", - " return loss\n", - "\n", - " preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()\n", - " labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()\n", - " loss = torch.stack([x['loss'] for x in outputs]).mean()\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)\n", - " return loss\n", - "\n", - " def setup(self, stage):\n", - " if stage == 'fit':\n", - " # Get dataloader by calling it - train_dataloader() is called after setup() by default\n", - " train_loader = self.train_dataloader()\n", - "\n", - " # Calculate total steps\n", - " self.total_steps = (\n", - " (len(train_loader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))\n", - " // 
self.hparams.accumulate_grad_batches\n", - " * float(self.hparams.max_epochs)\n", - " )\n", - "\n", - " def configure_optimizers(self):\n", - " \"Prepare optimizer and schedule (linear warmup and decay)\"\n", - " model = self.model\n", - " no_decay = [\"bias\", \"LayerNorm.weight\"]\n", - " optimizer_grouped_parameters = [\n", - " {\n", - " \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", - " \"weight_decay\": self.hparams.weight_decay,\n", - " },\n", - " {\n", - " \"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n", - " \"weight_decay\": 0.0,\n", - " },\n", - " ]\n", - " optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)\n", - "\n", - " scheduler = get_linear_schedule_with_warmup(\n", - " optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps\n", - " )\n", - " scheduler = {\n", - " 'scheduler': scheduler,\n", - " 'interval': 'step',\n", - " 'frequency': 1\n", - " }\n", - " return [optimizer], [scheduler]\n", - "\n", - " @staticmethod\n", - " def add_model_specific_args(parent_parser):\n", - " parser = ArgumentParser(parents=[parent_parser], add_help=False)\n", - " parser.add_argument(\"--learning_rate\", default=2e-5, type=float)\n", - " parser.add_argument(\"--adam_epsilon\", default=1e-8, type=float)\n", - " parser.add_argument(\"--warmup_steps\", default=0, type=int)\n", - " parser.add_argument(\"--weight_decay\", default=0.0, type=float)\n", - " return parser" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ha-NdIP_xbd3", - "colab_type": "text" - }, - "source": [ - "### ⚡ Quick Tip \n", - " - Combine arguments from your DataModule, Model, and Trainer into one for easy and robust configuration" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3dEHnl3RPlAR", - "colab_type": "code", - "colab": {} - }, - "source": [ 
- "def parse_args(args=None):\n", - " parser = ArgumentParser()\n", - " parser = pl.Trainer.add_argparse_args(parser)\n", - " parser = GLUEDataModule.add_argparse_args(parser)\n", - " parser = GLUETransformer.add_model_specific_args(parser)\n", - " parser.add_argument('--seed', type=int, default=42)\n", - " return parser.parse_args(args)\n", - "\n", - "\n", - "def main(args):\n", - " pl.seed_everything(args.seed)\n", - " dm = GLUEDataModule.from_argparse_args(args)\n", - " dm.prepare_data()\n", - " dm.setup('fit')\n", - " model = GLUETransformer(num_labels=dm.num_labels, eval_splits=dm.eval_splits, **vars(args))\n", - " trainer = pl.Trainer.from_argparse_args(args)\n", - " return dm, model, trainer" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PkuLaeec3sJ-", - "colab_type": "text" - }, - "source": [ - "# Training" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QSpueK5UPsN7", - "colab_type": "text" - }, - "source": [ - "## CoLA\n", - "\n", - "See an interactive view of the CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NJnFmtpnPu0Y", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path albert-base-v2\n", - " --task_name cola\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_MrNsTnqdz4z", - "colab_type": "text" - }, - "source": [ - "## MRPC\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LBwRxg9Cb3d-", - "colab_type": "code", - "colab": {} - }, - "source": [ 
- "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-cased\n", - " --task_name mrpc\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iZhbn0HzfdCu", - "colab_type": "text" - }, - "source": [ - "## MNLI\n", - "\n", - " - The MNLI dataset is huge, so we aren't going to bother trying to train it here.\n", - "\n", - " - Let's just make sure our multi-dataloader logic is right by skipping over training and going straight to validation.\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AvsZMOggfcWW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-uncased\n", - " --task_name mnli\n", - " --max_epochs 1\n", - " --gpus 1\n", - " --limit_train_batches 10\n", - " --progress_bar_refresh_rate 20\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fqlsVTj7McZ3" + }, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OIhHrRL-MnKK" + }, + "outputs": [], + "source": [ + "!pip install pytorch-lightning datasets transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "6yuQT_ZQMpCg" + }, + "outputs": [], + "source": [ + "from argparse import ArgumentParser\n", + "from datetime import datetime\n", + 
"from typing import Optional\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import pytorch_lightning as pl\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from transformers import (\n", + " AdamW,\n", + " AutoModelForSequenceClassification,\n", + " AutoConfig,\n", + " AutoTokenizer,\n", + " get_linear_schedule_with_warmup,\n", + " glue_compute_metrics\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9ORJfiuiNZ_N" + }, + "source": [ + "## GLUE DataModule" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jW9xQhZxMz1G" + }, + "outputs": [], + "source": [ + "class GLUEDataModule(pl.LightningDataModule):\n", + "\n", + " task_text_field_map = {\n", + " 'cola': ['sentence'],\n", + " 'sst2': ['sentence'],\n", + " 'mrpc': ['sentence1', 'sentence2'],\n", + " 'qqp': ['question1', 'question2'],\n", + " 'stsb': ['sentence1', 'sentence2'],\n", + " 'mnli': ['premise', 'hypothesis'],\n", + " 'qnli': ['question', 'sentence'],\n", + " 'rte': ['sentence1', 'sentence2'],\n", + " 'wnli': ['sentence1', 'sentence2'],\n", + " 'ax': ['premise', 'hypothesis']\n", + " }\n", + "\n", + " glue_task_num_labels = {\n", + " 'cola': 2,\n", + " 'sst2': 2,\n", + " 'mrpc': 2,\n", + " 'qqp': 2,\n", + " 'stsb': 1,\n", + " 'mnli': 3,\n", + " 'qnli': 2,\n", + " 'rte': 2,\n", + " 'wnli': 2,\n", + " 'ax': 3\n", + " }\n", + "\n", + " loader_columns = [\n", + " 'datasets_idx',\n", + " 'input_ids',\n", + " 'token_type_ids',\n", + " 'attention_mask',\n", + " 'start_positions',\n", + " 'end_positions',\n", + " 'labels'\n", + " ]\n", + "\n", + " def __init__(\n", + " self,\n", + " model_name_or_path: str,\n", + " task_name: str ='mrpc',\n", + " max_seq_length: int = 128,\n", + " train_batch_size: int = 32,\n", + " eval_batch_size: int = 32,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", + " self.model_name_or_path = 
model_name_or_path\n", + " self.task_name = task_name\n", + " self.max_seq_length = max_seq_length\n", + " self.train_batch_size = train_batch_size\n", + " self.eval_batch_size = eval_batch_size\n", + "\n", + " self.text_fields = self.task_text_field_map[task_name]\n", + " self.num_labels = self.glue_task_num_labels[task_name]\n", + " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", + "\n", + " def setup(self, stage):\n", + " self.dataset = datasets.load_dataset('glue', self.task_name)\n", + "\n", + " for split in self.dataset.keys():\n", + " self.dataset[split] = self.dataset[split].map(\n", + " self.convert_to_features,\n", + " batched=True,\n", + " remove_columns=['label'],\n", + " )\n", + " self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]\n", + " self.dataset[split].set_format(type=\"torch\", columns=self.columns)\n", + "\n", + " self.eval_splits = [x for x in self.dataset.keys() if 'validation' in x]\n", + "\n", + " def prepare_data(self):\n", + " datasets.load_dataset('glue', self.task_name)\n", + " AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", + " \n", + " def train_dataloader(self):\n", + " return DataLoader(self.dataset['train'], batch_size=self.train_batch_size)\n", + " \n", + " def val_dataloader(self):\n", + " if len(self.eval_splits) == 1:\n", + " return DataLoader(self.dataset['validation'], batch_size=self.eval_batch_size)\n", + " elif len(self.eval_splits) > 1:\n", + " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", + "\n", + " def test_dataloader(self):\n", + " if len(self.eval_splits) == 1:\n", + " return DataLoader(self.dataset['test'], batch_size=self.eval_batch_size)\n", + " elif len(self.eval_splits) > 1:\n", + " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", + "\n", + " def convert_to_features(self, example_batch, 
indices=None):\n", + "\n", + " # Either encode single sentence or sentence pairs\n", + " if len(self.text_fields) > 1:\n", + " texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))\n", + " else:\n", + " texts_or_text_pairs = example_batch[self.text_fields[0]]\n", + "\n", + " # Tokenize the text/text pairs\n", + " features = self.tokenizer.batch_encode_plus(\n", + " texts_or_text_pairs,\n", + " max_length=self.max_seq_length,\n", + " pad_to_max_length=True,\n", + " truncation=True\n", + " )\n", + "\n", + " # Rename label to labels to make it easier to pass to model forward\n", + " features['labels'] = example_batch['label']\n", + "\n", + " return features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jQC3a6KuOpX3" + }, + "source": [ + "#### You could use this datamodule with standalone PyTorch if you wanted..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JCMH3IAsNffF" + }, + "outputs": [], + "source": [ + "dm = GLUEDataModule('distilbert-base-uncased')\n", + "dm.prepare_data()\n", + "dm.setup('fit')\n", + "next(iter(dm.train_dataloader()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "l9fQ_67BO2Lj" + }, + "source": [ + "## GLUE Model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gtn5YGKYO65B" + }, + "outputs": [], + "source": [ + "class GLUETransformer(pl.LightningModule):\n", + " def __init__(\n", + " self,\n", + " model_name_or_path: str,\n", + " num_labels: int,\n", + " learning_rate: float = 2e-5,\n", + " adam_epsilon: float = 1e-8,\n", + " warmup_steps: int = 0,\n", + " weight_decay: float = 0.0,\n", + " train_batch_size: int = 32,\n", + " eval_batch_size: int = 32,\n", + " eval_splits: Optional[list] = None,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", 
+ "\n", + " self.save_hyperparameters()\n", + "\n", + " self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)\n", + " self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)\n", + " self.metric = datasets.load_metric(\n", + " 'glue',\n", + " self.hparams.task_name,\n", + " experiment_id=datetime.now().strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + " )\n", + "\n", + " def forward(self, **inputs):\n", + " return self.model(**inputs)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " outputs = self(**batch)\n", + " loss = outputs[0]\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx, dataloader_idx=0):\n", + " outputs = self(**batch)\n", + " val_loss, logits = outputs[:2]\n", + "\n", + " if self.hparams.num_labels >= 1:\n", + " preds = torch.argmax(logits, axis=1)\n", + " elif self.hparams.num_labels == 1:\n", + " preds = logits.squeeze()\n", + "\n", + " labels = batch[\"labels\"]\n", + "\n", + " return {'loss': val_loss, \"preds\": preds, \"labels\": labels}\n", + "\n", + " def validation_epoch_end(self, outputs):\n", + " if self.hparams.task_name == 'mnli':\n", + " for i, output in enumerate(outputs):\n", + " # matched or mismatched\n", + " split = self.hparams.eval_splits[i].split('_')[-1]\n", + " preds = torch.cat([x['preds'] for x in output]).detach().cpu().numpy()\n", + " labels = torch.cat([x['labels'] for x in output]).detach().cpu().numpy()\n", + " loss = torch.stack([x['loss'] for x in output]).mean()\n", + " self.log(f'val_loss_{split}', loss, prog_bar=True)\n", + " split_metrics = {f\"{k}_{split}\": v for k, v in self.metric.compute(predictions=preds, references=labels).items()}\n", + " self.log_dict(split_metrics, prog_bar=True)\n", + " return loss\n", + "\n", + " preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()\n", + " labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()\n", + " loss = 
torch.stack([x['loss'] for x in outputs]).mean()\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)\n", + " return loss\n", + "\n", + " def setup(self, stage):\n", + " if stage == 'fit':\n", + " # Get dataloader by calling it - train_dataloader() is called after setup() by default\n", + " train_loader = self.train_dataloader()\n", + "\n", + " # Calculate total steps\n", + " self.total_steps = (\n", + " (len(train_loader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))\n", + " // self.hparams.accumulate_grad_batches\n", + " * float(self.hparams.max_epochs)\n", + " )\n", + "\n", + " def configure_optimizers(self):\n", + " \"Prepare optimizer and schedule (linear warmup and decay)\"\n", + " model = self.model\n", + " no_decay = [\"bias\", \"LayerNorm.weight\"]\n", + " optimizer_grouped_parameters = [\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", + " \"weight_decay\": self.hparams.weight_decay,\n", + " },\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n", + " \"weight_decay\": 0.0,\n", + " },\n", + " ]\n", + " optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)\n", + "\n", + " scheduler = get_linear_schedule_with_warmup(\n", + " optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps\n", + " )\n", + " scheduler = {\n", + " 'scheduler': scheduler,\n", + " 'interval': 'step',\n", + " 'frequency': 1\n", + " }\n", + " return [optimizer], [scheduler]\n", + "\n", + " @staticmethod\n", + " def add_model_specific_args(parent_parser):\n", + " parser = ArgumentParser(parents=[parent_parser], add_help=False)\n", + " parser.add_argument(\"--learning_rate\", default=2e-5, type=float)\n", + " parser.add_argument(\"--adam_epsilon\", default=1e-8, 
type=float)\n", + " parser.add_argument(\"--warmup_steps\", default=0, type=int)\n", + " parser.add_argument(\"--weight_decay\", default=0.0, type=float)\n", + " return parser" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ha-NdIP_xbd3" + }, + "source": [ + "### ⚡ Quick Tip \n", + " - Combine arguments from your DataModule, Model, and Trainer into one for easy and robust configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3dEHnl3RPlAR" + }, + "outputs": [], + "source": [ + "def parse_args(args=None):\n", + " parser = ArgumentParser()\n", + " parser = pl.Trainer.add_argparse_args(parser)\n", + " parser = GLUEDataModule.add_argparse_args(parser)\n", + " parser = GLUETransformer.add_model_specific_args(parser)\n", + " parser.add_argument('--seed', type=int, default=42)\n", + " return parser.parse_args(args)\n", + "\n", + "\n", + "def main(args):\n", + " pl.seed_everything(args.seed)\n", + " dm = GLUEDataModule.from_argparse_args(args)\n", + " dm.prepare_data()\n", + " dm.setup('fit')\n", + " model = GLUETransformer(num_labels=dm.num_labels, eval_splits=dm.eval_splits, **vars(args))\n", + " trainer = pl.Trainer.from_argparse_args(args)\n", + " return dm, model, trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PkuLaeec3sJ-" + }, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QSpueK5UPsN7" + }, + "source": [ + "## CoLA\n", + "\n", + "See an interactive view of the CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NJnFmtpnPu0Y" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path albert-base-v2\n", + " --task_name cola\n", + " 
--max_epochs 3\n", + " --gpus 1\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_MrNsTnqdz4z" + }, + "source": [ + "## MRPC\n", + "\n", + "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LBwRxg9Cb3d-" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path distilbert-base-cased\n", + " --task_name mrpc\n", + " --max_epochs 3\n", + " --gpus 1\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iZhbn0HzfdCu" + }, + "source": [ + "## MNLI\n", + "\n", + " - The MNLI dataset is huge, so we aren't going to bother trying to train it here.\n", + "\n", + " - Let's just make sure our multi-dataloader logic is right by skipping over training and going straight to validation.\n", + "\n", + "See an interactive view of the MNLI dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "AvsZMOggfcWW" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path distilbert-base-uncased\n", + " --task_name mnli\n", + " --max_epochs 1\n", + " --gpus 1\n", + " --limit_train_batches 10\n", + " --progress_bar_refresh_rate 20\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n", + "\n", + "### Interested in SOTA AI models? Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions!\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". 
\n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "04-transformers-text-classification.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index a070ce03629ba..f1f93104f4552 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -1,2871 +1,2919 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "05-trainer-flags-overview.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "goRmGIRI5cfC" - }, - "source": [ - "# Introduction to Lightning Flags ⚡🚩\n", - "\n", - "In this notebook, we'll go over the flags available in the `Trainer` object. Note that not everything will work in the Colab environment (multi-gpu, etc). 
This notebook accompanies the Trainer videos we'll be putting out.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jKj5lgdr5j48" - }, - "source": [ - "--- \n", - "### Setup \n", - "First thing first, we need to install Lightning. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UGjilEHk4vb7" - }, - "source": [ - "! pip install pytorch-lightning" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "zaVUShmQ5n8Y" - }, - "source": [ - "import os\n", - "\n", - "from argparse import ArgumentParser\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torch.utils.data import random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "\n", - "from torchvision.datasets.mnist import MNIST\n", - "from torchvision import transforms" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6tgkS8IYZwY_" - }, - "source": [ - "# ------------\n", - "# data\n", - "# ------------\n", - "pl.seed_everything(1234)\n", - "batch_size = 32\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "\n", - "dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())\n", - "mnist_train, mnist_val = random_split(dataset, [55000, 
5000])\n", - "\n", - "train_loader = DataLoader(mnist_train, batch_size=batch_size)\n", - "val_loader = DataLoader(mnist_val, batch_size=batch_size)\n", - "test_loader = DataLoader(mnist_test, batch_size=batch_size)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEulmrbxwaYL" - }, - "source": [ - "### Simple AutoEncoder Model\n", - "\n", - "Were gonna define a simple Lightning model so we can play with all the settings of the Lightning Trainer.\n", - "\n", - "LightningModule is simply pure Pytorch reorganized into hooks, that represents all the steps in the training process.\n", - "\n", - "You can use LightningModule hooks to control every part of your model, but for the purpose of this video we will use a very simple MNIST classifier, a model that takes 28*28 grayscale images of hand written images, and can predict the digit between 0-9.\n", - "\n", - "The LightningModule can encompass a single model, like an image classifier, or a deep learning system composed of multiple models, like this auto encoder that contains an encoder and a decoder.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "x-34xKCI40yW" - }, - "source": [ - "class LitAutoEncoder(pl.LightningModule):\n", - "\n", - " def __init__(self, batch_size=32, lr=1e-3):\n", - " super().__init__()\n", - " self.encoder = nn.Sequential(\n", - " nn.Linear(28 * 28, 64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 3)\n", - " )\n", - " self.decoder = nn.Sequential(\n", - " nn.Linear(3, 64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 28 * 28)\n", - " )\n", - " self.batch_size=batch_size\n", - " self.learning_rate=lr\n", - "\n", - " def forward(self, x):\n", - " # in lightning, forward defines the prediction/inference actions\n", - " embedding = self.encoder(x)\n", - " return embedding\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = 
self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('train_loss', loss)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('val_loss', loss)\n", - " \n", - " def test_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('test_loss', loss)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)\n", - " return optimizer" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VbxcRCrxiYly" - }, - "source": [ - "You'll notice the LightningModule doesn't have epoch and batch loops, we're not calling model.train() and model.eval(), and no mentions of CUDA or hardware. That's because it is all automated by the Lightning Trainer. All the engineering boilerplate is automated by the trainer: \n", - "\n", - "* Training loops\n", - "* Evaluation and test loops\n", - "* Calling model.train(), model.eval(), no_grad at the right time\n", - "* CUDA or to_device calls\n", - "\n", - "It also allows you to train your models on different hardware like GPUs and TPUs without changing your code!\n", - "\n", - "\n", - "### To use the lightning trainer simply:\n", - "\n", - "1. init your LightningModule and datasets\n", - "\n", - "2. init lightning trainer\n", - "\n", - "3. call trainer.fit\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HOk9c4_35FKg" - }, - "source": [ - "#####################\n", - "# 1. Init Model\n", - "#####################\n", - "\n", - "model = LitAutoEncoder()\n", - "\n", - "#####################\n", - "# 2. 
Init Trainer\n", - "#####################\n", - "\n", - "# these 2 flags are explained in the later sections...but for short explanation:\n", - "# - progress_bar_refresh_rate: limits refresh rate of tqdm progress bar so Colab doesn't freak out\n", - "# - max_epochs: only run 2 epochs instead of default of 1000\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=2)\n", - "\n", - "#####################\n", - "# 3. Train\n", - "#####################\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3meDako-Qa_6" - }, - "source": [ - "Our model is training just like that, using the Lightning defaults. The beauty of Lightning is that everything is easily configurable.\n", - "In our next videos were going to show you all the ways you can control your Trainer to do things like controlling your training, validation and test loops, running on GPUs and TPUs, checkpointing, early stopping, and a lot more.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z_Wry2MckQkI" - }, - "source": [ - "# Training loop and eval loop Flags" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0MkI1xB2vsLj" - }, - "source": [ - "\n", - "To really scale up your networks, you can use accelerators like GPUs. GPUs or Graphical Processing Units, parallelize matrix multiplications which enable speed ups of at least 100x over training on CPUs.\n", - "\n", - "Let's say you have a machine with 8 GPUs on it. You can set this flag to 1, 4, or 8 GPUs and lightning will automatically distribute your training for you.\n", - "\n", - "```\n", - "trainer = pl.Trainer(gpus=1)\n", - "```\n", - "\n", - "---------\n", - "\n", - "Lightning makes your code hardware agnostic... This means, you can switch between CPUs, GPUs without code changes.\n", - "\n", - "However, it requires forming good PyTorch habits:\n", - "\n", - "1. 
First, remove the .cuda() or .to() calls in your code.\n", - "2. Second, when you initialize a new tensor, set the device=self.device in the call since every lightningModule knows what gpu index or TPU core it is on.\n", - "\n", - "You can also use type_as and or you can register the tensor as a buffer in your module’s __init__ method with register_buffer().\n", - "\n", - "```\n", - "# before lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.cuda(0)\n", - "\n", - "# with lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.type_as(x, device=self.device)\n", - "```\n", - "\n", - "\n", - "```\n", - "class LitModel(LightningModule):\n", - "\n", - " def __init__(self):\n", - " ...\n", - " self.register_buffer(\"sigma\", torch.eye(3))\n", - " # you can now access self.sigma anywhere in your module\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hw6jJhhjvlSL" - }, - "source": [ - "Lightning Trainer automates all the engineering boilerplate like iterating over epochs and batches, training eval and test loops, CUDA and to(device) calls, calling model.train and model.eval.\n", - "\n", - "You still have full control over the loops, by using the following trainer flags:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pT5-ETH9eUg6" - }, - "source": [ - "## Calling validation steps\n", - "Sometimes, training an epoch may be pretty fast, like minutes per epoch. In this case, you might not need to validate on every epoch. 
Instead, you can actually validate after a few epochs.\n", - "\n", - "Use `check_val_every_n_epoch` flag to control the frequency of validation step:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Z-EMVvKheu3D" - }, - "source": [ - "# run val loop every 10 training epochs\n", - "trainer = pl.Trainer(check_val_every_n_epoch=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UOzZr9S2UcSO" - }, - "source": [ - "## val_check_interval\n", - "\n", - "In some cases where your epoch is very long, you might want to check validation within an epoch.\n", - "\n", - "You can also run validation step within your training epochs, by setting `val_check_interval` flag.\n", - "\n", - "Set `val_check_interval` to a float between [0.0 to 1.0] to check your validation set within a training epoch. For example, setting it to 0.25 will check your validation set 4 times during a training epoch.\n", - "\n", - "Default is set to 1.0" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9kbUbvrUVLrT" - }, - "source": [ - "# check validation set 4 times during a training epoch\n", - "trainer = pl.Trainer(val_check_interval=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Onm1gBsKVaw4" - }, - "source": [ - "When you have iterable data sets, or when streaming data for production use cases, it is useful to check the validation set every number of steps. 
\n", - "Set val_check_interval to an int:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "psn6DVb5Vi85" - }, - "source": [ - "# check validation set every 1000 training batches\n", - "# use this when using iterableDataset and your dataset has no length\n", - "# (ie: production cases with streaming data)\n", - "trainer = pl.Trainer(val_check_interval=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QkoYonrWkb7-" - }, - "source": [ - "## num_sanity_val_steps \n", - "\n", - "You may have run into an issue, where you have a bug in your validation loop, but won't catch it until your training loop ends.\n", - "\n", - "and if your training loop takes hours or days, you will waste valuable compute.\n", - "\n", - "Instead, lightning automatically runs through 2 steps of validation in the beginning to catch these kinds of bugs up front.\n", - "\n", - "\n", - "The `num_sanity_val_steps` flag can help you run n batches of validation before starting the training routine.\n", - "\n", - "You can set it to 0 to turn it off" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zOcT-ugSkiKW" - }, - "source": [ - "# turn it off\n", - "trainer = pl.Trainer(num_sanity_val_steps=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zS0ob1ZmTw56" - }, - "source": [ - "Set it to -1 to check all validation data before training" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "rzqvjA4UT263" - }, - "source": [ - "# check all validation data\n", - "trainer = pl.Trainer(num_sanity_val_steps=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uMB41wq4T3Z2" - }, - "source": [ - "Or use any arbitrary 
number of validation steps" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lGP78aQzT7VS" - }, - "source": [ - "trainer = pl.Trainer(num_sanity_val_steps=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H-xaYRtd1rb-" - }, - "source": [ - "## Limit train, validation, and test batches\n", - "\n", - "You can set limits on how much of training, validation and test dataset you want your model to check. This is useful if you have really large validation or tests sets, for debugging or testing something that happens at the end of an epoch.\n", - "\n", - "Set the flag to int to specify the number of batches to run\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XiK5cFKL1rcA" - }, - "source": [ - "# run for only 10 batches\n", - "trainer = pl.Trainer(limit_test_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y4LK0g65RrBm" - }, - "source": [ - "For example, some metrics need to be computed on the entire validation results, such as AUC ROC. 
" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8MmeRs2DR3dD" - }, - "source": [ - "trainer = pl.Trainer(limit_val_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xmigcNa1A2Vy" - }, - "source": [ - "You can use a float to limit the batches be percentage of the set on every epoch" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "W7uGJt8nA4tv" - }, - "source": [ - "# run through only 25% of the test set each epoch\n", - "trainer = pl.Trainer(limit_test_batches=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YRI8THtUN7_e" - }, - "source": [ - "# Training on GPUs\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R8FFkX_FwlfE" - }, - "source": [ - "To run on 1 GPU set the flag to 1" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nnzkf3KaOE27" - }, - "source": [ - "trainer = pl.Trainer(gpus=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cxBg47s5PB1P" - }, - "source": [ - "to run on 2 or 4 GPUs, set the flag to 2 or 4." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "cSEM4ihLrohT" - }, - "source": [ - "trainer = pl.Trainer(gpus=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZE6ZgwtNudro" - }, - "source": [ - "You can also select which GPU devices to run on, using a list of indices like [1, 4] \n", - "\n", - "or a string containing a comma separated list of GPU ids like '1,2'\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gQkJtq0urrjq" - }, - "source": [ - "# list: train on GPUs 1, 4 (by bus ordering)\n", - "# trainer = Trainer(gpus='1, 4') # equivalent\n", - "trainer = pl.Trainer(gpus=[1, 4])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XghDPad4us74" - }, - "source": [ - "trainer = pl.Trainer(gpus=list(range(4)))\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6FVkKHpSPMTW" - }, - "source": [ - "You can use all the GPUs you have available by setting `gpus=-1`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "r6cKQijYrtPe" - }, - "source": [ - "# trainer = Trainer(gpus='-1') - equivalent\n", - "trainer = pl.Trainer(gpus=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2C-fNLm3UGCV" - }, - "source": [ - "Lightning uses the PCI bus_id as the index for ordering GPUs." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_V75s7EhOFhE" - }, - "source": [ - "### `auto_select_gpus`\n", - "\n", - "You can save on GPUs by running in “exclusive mode”, meaning only one process at a time can access them. 
If your not sure which GPUs you should use when running exclusive mode, Lightning can automatically find unoccupied GPUs for you. \n", - "\n", - "Simply specify the number of gpus as an integer `gpus=k`, and set the trainer flag `auto_select_gpus=True`. Lightning will automatically help you find k gpus that are not occupied by other processes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_Sd3XFsAOIwd" - }, - "source": [ - "# enable auto selection (will find two available gpus on system)\n", - "trainer = pl.Trainer(gpus=2, auto_select_gpus=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5JGSBMQhJNp" - }, - "source": [ - "## analyzing GPU usage\n", - "\n", - "### log_gpu_memory\n", - "\n", - "This is useful to analyze the memory usage of your GPUs.\n", - "\n", - "To get the GPU memory usage for every GPU on the master node, set the flag to log_gpu_memory=all.\n", - "\n", - "Under the hood, lightning uses the nvidia-smi command which may slow your training down.\n", - "\n", - "Your logs can become overwhelmed if you log the usage from many GPUs at once. In this case, you can also set the flag to min_max which will log only the min and max usage across all the GPUs of the master node.\n", - "\n", - "Note that lightning is not logging the usage across all nodes for performance reasons." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "idus3ZGahOki" - }, - "source": [ - "# log all the GPUs (on master node only)\n", - "trainer = Trainer(log_gpu_memory='all')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-mevgiy_hkip" - }, - "source": [ - "To avoid the performance decrease you can also set `log_gpu_memory=min_max` to only log the min and max memory on the master node.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "SlvLJnWyhs7J" - }, - "source": [ - "# log only the min and max memory on the master node\n", - "trainer = Trainer(log_gpu_memory='min_max')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K82FLLIJVQG3" - }, - "source": [ - "\n", - "But what if you want to train on multiple machines and not just one?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YViQ6PXesAue" - }, - "source": [ - "# Training on multiple GPUs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WacbBQUivxQq" - }, - "source": [ - "Lightning makes your models hardware agnostic, and you can run on GPUs with a flip of a flag. Lightning also supports training on multiple GPUs across many machines.\n", - "\n", - "You can do this by setting the num_nodes flag.\n", - "\n", - "The world size, or the total number of GPUs you are using, will be gpus*num_nodes.\n", - "\n", - "If i set gpus=8 and num_nodes=32 then I will be training on 256 GPUs." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "5iKckmDvr8zZ" - }, - "source": [ - "trainer = pl.Trainer(gpus=8, num_nodes=32)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgcSbDjjlSTh" - }, - "source": [ - "## distributed backends\n", - "\n", - "Under the hood, Lightning uses distributed data parallel (or DDP) by default to distribute training across GPUs.\n", - "\n", - "This Lightning implementation of DDP calls your script under the hood multiple times with the correct environment variables.\n", - "\n", - "Under the hood it's as if you had called your script like this:\n", - "\n", - "1. Each GPU across each node gets its own process.\n", - "2. Each GPU gets visibility into a subset of the overall dataset. It will only ever see that subset.\n", - "3. Each process inits the model. (Make sure to set the random seed so that each model initializes with the same weights.)\n", - "4. Each process performs a full forward and backward pass in parallel.\n", - "5. The gradients are synced and averaged across all processes.\n", - "6. 
Each process updates its optimizer.\n", - "If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "n_Brr7F5wdtj" - }, - "source": [ - "# ddp = DistributedDataParallel\n", - "# trainer = pl.Trainer(gpus=2, num_nodes=2) equivalent\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edxHyttC5J3e" - }, - "source": [ - "DDP is the fastest and recommended way to distribute your training, but you can pass in other backends to `distributed_backend` trainer flag, when DDP is not supported.\n", - "\n", - "DDP isn't available in\n", - "* Jupyter Notebook, Google COLAB, Kaggle, etc.\n", - "* If You have a nested script without a root package\n", - "* or if Your script needs to invoke .fit or .test multiple times" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZDh96mavxHxf" - }, - "source": [ - "### DDP_SPAWN\n", - "\n", - "In these cases, you can use `ddp_spawn` instead. `ddp_spawn` is exactly like DDP except that it uses `.spawn()` to start the training processes." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "JM5TKtgLxo37" - }, - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp_spawn')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sebhVE3qrhKK" - }, - "source": [ - "We STRONGLY discourage this use because it has limitations (due to Python and PyTorch):\n", - "\n", - "* Since .spawn() trains the model in subprocesses, the model on the main process does not get updated.\n", - "\n", - "* Dataloader(num_workers=N), where N is large, bottlenecks training with DDP… ie: it will be VERY slow or won’t work at all. This is a PyTorch limitation.\n", - "\n", - "* Forces everything to be picklable.\n", - "\n", - "DDP is MUCH faster than DDP_spawn. To be able to use DDP we recommend you: \n", - "\n", - "1. Install a top-level module for your project using setup.py\n", - "\n", - "```\n", - "# setup.py\n", - "#!/usr/bin/env python\n", - "\n", - "from setuptools import setup, find_packages\n", - "\n", - "setup(name='src',\n", - " version='0.0.1',\n", - " description='Describe Your Cool Project',\n", - " author='',\n", - " author_email='',\n", - " url='https://github.com/YourSeed', # REPLACE WITH YOUR OWN GITHUB PROJECT LINK\n", - " install_requires=[\n", - " 'pytorch-lightning'\n", - " ],\n", - " packages=find_packages()\n", - " )\n", - "\n", - "```\n", - "\n", - "2. Setup your project like so:\n", - "\n", - "```\n", - "/project\n", - " /src\n", - " some_file.py\n", - " /or_a_folder\n", - " setup.py\n", - "```\n", - "3. Install as a root-level package\n", - "```\n", - "cd /project\n", - "pip install -e .\n", - "```\n", - "4. 
You can then call your scripts anywhere\n", - "```\n", - "cd /project/src\n", - "\n", - "python some_file.py --distributed_backend 'ddp' --gpus 8\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cmB3I_oyw7a8" - }, - "source": [ - "### DP\n", - "\n", - "If you're using windows, DDP is not supported. You can use `dp` for DataParallel instead: DataParallel uses multithreading, instead of multiprocessing. It splits a batch across k GPUs. That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples, after which the root node will aggregate the results.\n", - "\n", - "DP use is discouraged by PyTorch and Lightning. Use DDP which is more stable and at least 3x faster.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OO-J0ISvlVCg" - }, - "source": [ - "# dp = DataParallel\n", - "trainer = pl.Trainer(gpus=2, distributed_backend='dp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y7E2eHZKwUn9" - }, - "source": [ - "### DDP2\n", - "\n", - "In certain cases, it’s advantageous to use ***all*** batches on the same machine, instead of a subset. For instance, in self-supervised learning, a common performance boost comes from increasing the number of negative samples.\n", - "\n", - "In this case, we can use DDP2 which behaves like DP in a machine and DDP across nodes. 
DDP2 does the following:\n", - "\n", - "* Copies a subset of the data to each node.\n", - "* Inits a model on each node.\n", - "* Runs a forward and backward pass using DP.\n", - "* Syncs gradients across nodes.\n", - "* Applies the optimizer updates.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Y4xweqL3xHER" - }, - "source": [ - "# ddp2 = DistributedDataParallel + dp\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lhKNCnveeeq5" - }, - "source": [ - "- The second mode is ddp_spawn. This works like ddp, but instead of calling your script multiple times, lightning will use multiprocessing spawn to start a subprocess per GPU. \n", - "\n", - "However, you should be careful of mixing this mode with num_workers > 0 in your dataloaders because it will bottleneck your training. This is a current known limitation of PyTorch which is why we recommend using our ddp implementation instead.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HUf9ANyQkFFO" - }, - "source": [ - "\n", - "### mocking ddp\n", - "\n", - "Testing or debugging DDP can be hard, so we have a distributed backend that simulates ddp on cpus to make it easier. Set `num_processes` to a number greater than 1 when using distributed_backend=\"ddp_cpu\" to mimic distributed training on a machine without GPUs. Note that while this is useful for debugging, it will not provide any speedup, since single-process Torch already makes efficient use of multiple CPUs." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZSal5Da9kHOf" - }, - "source": [ - "# Simulate DDP for debugging on your GPU-less laptop\n", - "trainer = Trainer(distributed_backend=\"ddp_cpu\", num_processes=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Br_btCy5lgES" - }, - "source": [ - "# Training on TPUS\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DXkBNITdv44d" - }, - "source": [ - "Another option for accelerating your training is using TPUs.\n", - "A TPU is a Tensor processing unit, designed specifically for deep learning. Each TPU has 8 cores where each core is optimized for 128x128 matrix multiplies. Google estimates that 8 TPU cores are about as fast as 4 V100 GPUs!\n", - "\n", - "A TPU pod hosts many TPUs on it. Currently, TPU pod v2 has 2048 cores! You can request a full pod from Google cloud or a “slice” which gives you some subset of those 2048 cores.\n", - "\n", - "At this moment, TPUs are available on Google Cloud (GCP), Google Colab and Kaggle Environments.\n", - "\n", - "Lightning supports training on TPUs without any code adjustments to your model. Just like when using GPUs, Lightning automatically inserts the correct samplers - no need to do this yourself!\n", - "\n", - "Under the hood, lightning uses the XLA framework developed jointly by the facebook and google XLA teams. And we want to recognize their efforts in advancing TPU adoption of PyTorch.\n", - "\n", - "## tpu_cores\n", - "To train on TPUs, set the tpu_cores flag.\n", - "\n", - "When using colab or kaggle, the allowed values are 1 or 8 cores. When using google cloud, any value above 8 is allowed.\n", - "\n", - "Your effective batch size is the batch size passed into a dataloader times the total number of tpu cores." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "itP9y70gmD9M" - }, - "source": [ - "# int: train on a single core\n", - "trainer = pl.Trainer(tpu_cores=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "NJKnzPb3mKEg" - }, - "source": [ - "# int: train on all cores few cores\n", - "trainer = pl.Trainer(tpu_cores=8)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8a4exfWUmOHq" - }, - "source": [ - "You can also choose which TPU core to train on, by passing a list [1-8]. This is not an officially supported use case but we are working with the XLA team to improve this user experience.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "S6OrjE_bmT-_" - }, - "source": [ - "# list: train on a single selected core\n", - "trainer = pl.Trainer(tpu_cores=[2])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Afqx3sFUmfWD" - }, - "source": [ - "To train on more than 8 cores (ie: a POD), submit this script using the xla_dist script.\n", - "\n", - "\n", - "\n", - "```\n", - "python -m torch_xla.distributed.xla_dist\n", - "--tpu=$TPU_POD_NAME\n", - "--conda-env=torch-xla-nightly\n", - "--env=XLA_USE_BF16=1\n", - "-- python your_trainer_file.py\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ncPvbUVQqKOh" - }, - "source": [ - "# Advanced distributed training\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4MP7bEgnv7qK" - }, - "source": [ - "\n", - "Lightning supports distributed training across multiple GPUs and TPUs out of the box by setting trainer flags, but it also allows you to control the way sampling is done if you need to." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wdHiTfAMepKH" - }, - "source": [ - "## replace_sampler_ddp\n", - "In PyTorch, you must use torch.utils.data.distributed.DistributedSampler for multi-node or multi-GPU training. The sampler makes sure each GPU sees the appropriate part of your data.\n", - "\n", - "```\n", - "# without lightning\n", - "def train_dataloader(self):\n", - " dataset = MNIST(...)\n", - " sampler = None\n", - "\n", - " if self.on_tpu:\n", - " sampler = DistributedSampler(dataset)\n", - "\n", - " return DataLoader(dataset, sampler=sampler)\n", - "```\n", - "Lightning adds the correct samplers when needed, so there is no need to add samplers explicitly. By default it will add `shuffle=True` for the train sampler and `shuffle=False` for the val/test samplers.\n", - "\n", - "If you want to customize this behaviour, you can set `replace_sampler_ddp=False` and add your own distributed sampler.\n", - "\n", - "(note: For iterable datasets, we don’t do this automatically.)\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZfmcB_e_7HbE" - }, - "source": [ - "sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)\n", - "dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)\n", - "\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, replace_sampler_ddp=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-IOhk1n0lL3_" - }, - "source": [ - "## prepare_data_per_node\n", - "\n", - "When doing multi NODE training, if your nodes share the same file system, then you don't want to download data more than once to avoid possible collisions. \n", - "\n", - "Lightning automatically calls the prepare_data hook on the root GPU of the master node (ie: only a single GPU).\n", - "\n", - "In some cases where your nodes don't share the same file system, you need to download the data on each node.
In this case you can set this flag to true and lightning will download the data on the root GPU of each node.\n", - "\n", - "This flag is defaulted to True." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "WFBMUR48lM04" - }, - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, prepare_data_per_node=False)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FKBwXqo4q-Vp" - }, - "source": [ - "## sync_batchnorm\n", - "\n", - "Batch norm is computed per GPU/TPU. This flag enables synchronization between batchnorm layers across all GPUs.\n", - "It is recommended if you have small batch sizes.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GhaCLTEZrAQi" - }, - "source": [ - "trainer = Trainer(gpus=4, sync_batchnorm=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XuFA7VTFMY9-" - }, - "source": [ - "# Debugging flags\n", - "\n", - "Lightning offers a couple of flags to make debugging your models easier:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AKoS3fdml4Jx" - }, - "source": [ - "## Fast Dev Run\n", - "\n", - "To help you save time debugging, your first run should use the fast_dev_run flag.\n", - "\n", - "This won't generate logs or save checkpoints but will touch every line of your code to make sure that it is working as intended.\n", - "\n", - "Think about this flag like a compiler. 
You make changes to your code, and run Trainer with this flag to verify that your changes are bug free.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "L5vuG7GSmhzK" - }, - "source": [ - "trainer = pl.Trainer(fast_dev_run=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HRP1qQR5nT4p" - }, - "source": [ - "## overfit_batches\n", - "\n", - "Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it.\n", - "\n", - "Useful for quickly debugging or trying to overfit on purpose." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NTM-dqGMnXms" - }, - "source": [ - "# use only 1% of the train set (and use the train set for val and test)\n", - "trainer = pl.Trainer(overfit_batches=0.01)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "c0LV0gC3nl1X" - }, - "source": [ - "# overfit on 10 of the same batches\n", - "trainer = pl.Trainer(overfit_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lt3UHU6WgtS_" - }, - "source": [ - "Or a float to represent percentage of data to run" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "K3yUqADhgnkf" - }, - "source": [ - "# run through only 25% of the test set each epoch\n", - "trainer = pl.Trainer(limit_test_batches=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ODN66NeVg_2o" - }, - "source": [ - "In the case of multiple test dataloaders, the limit applies to each 
dataloader individually.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8aQx5SLeMz1R" - }, - "source": [ - "# accumulate_grad_batches\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g8GczZXFwKC7" - }, - "source": [ - "The batch size controls the accuracy of the estimate of the gradients. Small batch size use less memory, but decrease accuracy. When training large models, such as NLP transformers, it is useful to accumulate gradients before calling backwards(). It allows for bigger batch sizes than what can actually fit on a GPU/TPU in a single step.\n", - "\n", - "Use accumulate_grad_batches to accumulate gradients every k batches or as set up in the dict. Trainer also calls optimizer.step() for the last indivisible step number.\n", - "\n", - "For example, set accumulate_grad_batches to 4 to accumulate every 4 batches. In this case the effective batch size is batch_size*4, so if your batch size is 32, effectively it will be 128." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2jB6-Z_yPhhf" - }, - "source": [ - "# accumulate every 4 batches (effective batch size is batch*4)\n", - "trainer = pl.Trainer(accumulate_grad_batches=4)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_Yi-bdTOgINC" - }, - "source": [ - "You can also pass a dictionary to specify different accumulation per epoch. We can set it to `{5: 3, 10: 20}` to have no accumulation for epochs 1 to 4, accumulate 3 batches for epoch 5 to 10, and 20 batches after that." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "X3xsoZ3YPgBv" - }, - "source": [ - "# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. 
accumulate 20 after that\n", - "trainer = pl.Trainer(accumulate_grad_batches={5: 3, 10: 20})\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "myzH8mV4M1_9" - }, - "source": [ - "# 16 bit precision\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v9EaFAonwOk6" - }, - "source": [ - "Most deep learning frameworks like PyTorch, train with 32-bit floating point arithmetic. \n", - "\n", - "But many models can still achieve full accuracy using half the precision.\n", - "\n", - "In 2017, NVIDIA researchers successfully used a combination of 32 and 16 bit precision (also known as mixed precision) and achieved the same accuracy as 32 bit precision training.\n", - "\n", - "The main two advantages are:\n", - "\n", - "- a reduction in memory requirements which enables larger batch sizes and models.\n", - "- and a speed up in compute. On ampere, turing and volta architectures 16 bit precision models can train at least 3 times faster.\n", - "\n", - "As of PyTorch 1.6, NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. \n", - "\n", - "This package supersedes the apex package developed by NVIDIA." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TjNypZPHnxvJ" - }, - "source": [ - "## precision\n", - "\n", - "Use precision flag to switch between full precision (32) to half precision (16). 
Can be used on CPU, GPU or TPUs.\n", - "\n", - "When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit.\n", - "\n", - "If used on TPU, Lightning will use torch.bfloat16, but tensor printing will still show torch.float32" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kBZKMVx1nw-D" - }, - "source": [ - "# 16-bit precision\n", - "trainer = pl.Trainer(gpus=1, precision=16)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VJGj3Jh7oQXU" - }, - "source": [ - "In earlier versions of Lightning, we used NVIDIA Apex for 16-bit precision. Apex was the first library to attempt 16-bit and the automatic mixed precision library (amp) has since been merged into core PyTorch as of 1.6.\n", - "\n", - "If you insist on using Apex, you can set the amp_backend flag to 'apex' and install Apex on your own." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BDV1trAUPc9h" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HK5c_aVfNV4e" - }, - "source": [ - "## amp_level\n", - "Apex includes 4 optimization levels:\n", - "O0 (FP32 training)\n", - "O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16.\n", - "O2 (Fast Mixed Precision): this is the standard mixed precision training. It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights.\n", - "O3 (FP16 training): full FP16.
Passing keep_batchnorm_fp32=True can speed things up as cudnn batchnorm is faster anyway.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FshMFPowNbWt" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex', amp_level='O2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y8KEr1YvNgkC" - }, - "source": [ - "# `auto_scale_batch_size`\n", - "\n", - " \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7F1pKFIuwSFl" - }, - "source": [ - "Lightning can help you improve your model by using auto_scale_batch_size flag, which tries to find the largest batch size that fits into memory, before you start your training.\n", - "Larger batch size often yields better estimates of gradients, but may also result in longer training time. \n", - "\n", - "Set it to True to initially run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9_jE-iyyheIv" - }, - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yaHsJvwFhNJt" - }, - "source": [ - "You can set the value to `power`. 
`power` scaling starts from a batch size of 1 and keeps doubling the batch size until an out-of-memory (OOM) error is encountered.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Qx0FbQrphgw1" - }, - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size='power')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8bwgVF9zhZ75" - }, - "source": [ - "You can also set it to `binsearch`, that continues to finetune the batch size by performing a binary search.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QObXNs3yNrg9" - }, - "source": [ - "# run batch size scaling, result overrides hparams.batch_size\n", - "trainer = pl.Trainer(auto_scale_batch_size='binsearch')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5OWdhSsZjqW7" - }, - "source": [ - "This feature expects that a batch_size field in the hparams of your model, i.e., model.hparams.batch_size should exist and will be overridden by the results of this algorithm. \n", - "\n", - "Additionally, your train_dataloader() method should depend on this field for this feature to work.\n", - "\n", - "The algorithm in short works by:\n", - "1. Dumping the current state of the model and trainer\n", - "\n", - "2. Iteratively until convergence or maximum number of tries max_trials (default 25) has been reached:\n", - "* Call fit() method of trainer. This evaluates steps_per_trial (default 3) number of training steps. Each training step can trigger an OOM error if the tensors (training batch, weights, gradients etc.) 
allocated during the steps have a too large memory footprint.\n", - " * If an OOM error is encountered, decrease the batch size\n", - " * Else increase it.\n", - "* How much the batch size is increased/decreased is determined by the chosen strategy.\n", - "\n", - "3. The found batch size is saved to model.hparams.batch_size\n", - "\n", - "4. Restore the initial state of model and trainer\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q4CvxfZmOWBd" - }, - "source": [ - "# `auto_lr_find`\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j85e8usNwdBV" - }, - "source": [ - "Selecting a good learning rate for your deep learning training is essential for both better performance and faster convergence.\n", - "\n", - "Even optimizers such as Adam that are self-adjusting the learning rate can benefit from more optimal choices.\n", - "\n", - "To reduce the amount of guesswork concerning choosing a good initial learning rate, you can use Lightning auto learning rate finder.\n", - "\n", - "The learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result of this is a lr vs. loss plot that can be used as guidance for choosing an optimal initial lr.\n", - "\n", - "\n", - "warning: For the moment, this feature only works with models having a single optimizer. LR support for DDP is not implemented yet, it is coming soon.\n", - "\n", - "\n", - "***auto_lr_find=***\n", - "\n", - "In the most basic use case, this feature can be enabled during trainer construction with Trainer(auto_lr_find=True).\n", - "When .fit(model) is called, the LR finder will automatically run before any training is done. The lr that is found and used will be written to the console and logged together with all other hyperparameters of the model." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "iuhve9RBOfFh" - }, - "source": [ - "# default used by the Trainer (no learning rate finder)\n", - "trainer = pl.Trainer(mnist_model, auto_lr_find=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BL-gjXNCPDXk" - }, - "source": [ - "This flag sets your learning rate which can be accessed via self.lr or self.learning_rate.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wEb-vIMmPJQf" - }, - "source": [ - "class LitModel(LightningModule):\n", - "\n", - " def __init__(self, learning_rate):\n", - " self.learning_rate = learning_rate\n", - "\n", - " def configure_optimizers(self):\n", - " return Adam(self.parameters(), lr=(self.lr or self.learning_rate))\n", - "\n", - "# finds learning rate automatically\n", - "# sets hparams.lr or hparams.learning_rate to that learning rate\n", - "trainer = pl.Trainer(mnist_model, auto_lr_find=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RweqvpnVPPSh" - }, - "source": [ - "To use an arbitrary value set it as auto_lr_find\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4LKI39IfPLJv" - }, - "source": [ - "trainer = pl.Trainer(mnist_model, auto_lr_find='my_value')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9VAhPRKbPX-m" - }, - "source": [ - "Under the hood, when you call tune it runs the learning rate finder.\n", - "\n", - "If you want to inspect the results of the learning rate finder before doing any actual training or just play around with the parameters of the algorithm, this can be done by invoking the lr_find method of the trainer. 
A typical example of this would look like\n", - "\n", - "\n", - "```\n", - "trainer = pl.Trainer(auto_lr_find=True)\n", - "\n", - "# Run learning rate finder\n", - "lr_finder = trainer.lr_find(model)\n", - "\n", - "# Results can be found in\n", - "lr_finder.results\n", - "\n", - "# Plot with\n", - "fig = lr_finder.plot(suggest=True)\n", - "fig.show()\n", - "\n", - "# Pick point based on plot, or get suggestion\n", - "new_lr = lr_finder.suggestion()\n", - "\n", - "# update hparams of the model\n", - "model.hparams.lr = new_lr\n", - "\n", - "# Fit model\n", - "trainer.fit(model)\n", - "```\n", - "\n", - "The figure produced by lr_finder.plot() should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). This is the point returned by lr_finder.suggestion().\n", - "\n", - "![image.png]()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tn1RV-jfOjt1" - }, - "source": [ - "# `benchmark`\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rsmTl5zfwjM3" - }, - "source": [ - "You can try to speed up your system by setting `benchmark=True`, which enables cudnn.benchmark. This flag is likely to increase the speed of your system if your input sizes don’t change. This flag makes the cudnn auto-tuner look for the optimal set of algorithms for the given hardware configuration. This usually leads to faster runtime.\n", - "But if your input sizes change at each iteration, then cudnn will benchmark every time a new size appears, possibly leading to worse runtime performance."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "dWr-OCBgQCeb" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, benchmark=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qwAvSKYGa24K" - }, - "source": [ - "# `deterministic`\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tl5mfmafwmat" - }, - "source": [ - "PyTorch does not guarantee reproducible results, even when using identical seeds. To guarantee reproducible results, you can remove most of the randomness from your process by setting the `deterministic` flag to True.\n", - "\n", - "Note that it might make your system slower." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Mhv5LZ3HbNCK" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, deterministic=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u_5eJSvTf60f" - }, - "source": [ - "# Exploding and vanishing gradients" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B6drjh4pq6Jv" - }, - "source": [ - "## track_grad_norm\n", - "\n", - "You can debug your grad norm to identify exploding or vanishing gradients using the `track_grad_norm` flag.\n", - "\n", - "Set the value to 2 to track the 2-norm, or to any p to track the p-norm." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2taHUir8rflR" - }, - "source": [ - "# track the 2-norm\n", - "trainer = pl.Trainer(track_grad_norm=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3vHKxmruk62f" - }, - "source": [ - "May also be set to ‘inf’ to track the infinity-norm."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "g7TbD6SxlAjP" - }, - "source": [ - "trainer = pl.Trainer(track_grad_norm='inf')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TcMlRe7ywpe6" - }, - "source": [ - "## Gradient clipping\n", - "\n", - "\n", - "Exploding gradients refer to the problem that the gradients get too large and overflow in training, making the model unstable. Gradient clipping will ‘clip’ the gradients or cap them to a Threshold value to prevent the gradients from getting too large. To avoid this, we can set `gradient_clip_val` (default is set to 0.0).\n", - "\n", - "[when to use it, what are relevant values]" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jF9JwmbOgOWF" - }, - "source": [ - "trainer = pl.Trainer(gradient_clip_val=0.1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ggb4MkkQrr1h" - }, - "source": [ - "# truncated_bptt_steps\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s1Iu6PyAw9_r" - }, - "source": [ - "If you have a large recurrent model, you can use truncated_bptt_steps flag to split up the backprop over portions of the sequence. 
This flag will automatically truncate your batches and the trainer will apply Truncated Backprop to it.\n", - "\n", - "Make sure your batches have a sequence dimension.\n", - "\n", - "Lightning takes care of splitting your batch along the time-dimension.\n", - "```\n", - "# we use the second as the time dimension\n", - "# (batch, time, ...)\n", - "sub_batch = batch[0, 0:t, ...]\n", - "Using this feature requires updating your LightningModule’s pytorch_lightning.core.LightningModule.training_step() to include a hiddens arg with the hidden\n", - "\n", - "# Truncated back-propagation through time\n", - "def training_step(self, batch, batch_idx, hiddens):\n", - " # hiddens are the hiddens from the previous truncated backprop step\n", - " out, hiddens = self.lstm(data, hiddens)\n", - "\n", - " return {\n", - " \"loss\": ...,\n", - " \"hiddens\": hiddens # remember to detach() this\n", - " }\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "WiTF1VMtruMU" - }, - "source": [ - "# backprop every 5 steps in a batch\n", - "trainer = pl.Trainer(truncated_bptt_steps=5)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8XI_kEWkS-nT" - }, - "source": [ - "To modify how the batch is split, override pytorch_lightning.core.LightningModule.tbptt_split_batch():\n", - "\n", - "```\n", - "class LitMNIST(LightningModule):\n", - " def tbptt_split_batch(self, batch, split_size):\n", - " # do your own splitting on the batch\n", - " return splits\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oLbEmbmupwQ8" - }, - "source": [ - "# reload_dataloaders_every_epoch\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLdNGVv9xD_L" - }, - "source": [ - "Set to True to reload dataloaders every epoch (instead of loading just once in the beginning of training).\n", - "\n", - "```\n", - "# if False (default)\n", - 
"train_loader = model.train_dataloader()\n", - "for epoch in epochs:\n", - " for batch in train_loader:\n", - " ...\n", - "\n", - "# if True\n", - "for epoch in epochs:\n", - " train_loader = model.train_dataloader()\n", - " for batch in train_loader:\n", - "\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "10AXthXxp311" - }, - "source": [ - "trainer = pl.Trainer(reload_dataloaders_every_epoch=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f513EYl0bmmL" - }, - "source": [ - "# Callbacks\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pt7iGh4xNs5" - }, - "source": [ - "\n", - "Lightning Callbacks are self-contained programs that can be reused across projects.\n", - "Callbacks should capture NON-ESSENTIAL logic that is NOT required for your LightningModule to run. Lightning includes some a few built-in callbacks that can be used with flags like early stopping and Model Checkpointing, but you can also create your own callbacks to add any functionality to your models.\n", - "\n", - "The callback API includes hooks that allow you to add logic at every point of your training:\n", - "setup, teardown, on_epoch_start, on_epoch_end, on_batch_start, on_batch_end, on_init_start, on_keyboard_interrupt etc. \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1t84gvDNsUuh" - }, - "source": [ - "## callbacks\n", - "\n", - "Use **callbacks=** to pass a list of user defined callbacks. These callbacks DO NOT replace the built-in callbacks (loggers or EarlyStopping). \n", - "\n", - "In this example, we create a dummy callback that prints a message when training starts and ends, using on_train_start and on_train_end hooks." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oIXZYabub3f0" - }, - "source": [ - "from pytorch_lightning.callbacks import Callback\n", - "\n", - "class PrintCallback(Callback):\n", - " def on_train_start(self, trainer, pl_module):\n", - " print(\"Training is started!\")\n", - " def on_train_end(self, trainer, pl_module):\n", - " print(\"Training is done.\")\n", - "\n", - "# a list of callbacks\n", - "callbacks = [PrintCallback()]\n", - "trainer = pl.Trainer(callbacks=callbacks)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cNF74CLYfJJu" - }, - "source": [ - "# Model checkpointing\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2blgquBrxLtS" - }, - "source": [ - "Checkpoints capture the exact value of all parameters used by a model.\n", - "\n", - "Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.\n", - "\n", - "Lightning automates saving and loading checkpoints so you can restore a training session, saving all the required parameters including: \n", - "* 16-bit scaling factor (apex)\n", - "* Current epoch\n", - "* Global step\n", - "* Model state_dict\n", - "* State of all optimizers\n", - "* State of all learning rate schedulers\n", - "* State of all callbacks\n", - "* The hyperparameters used for that model if passed in as hparams (argparse.Namespace)\n", - "\n", - "By default Lightning will save a checkpoint in the working directory, which will be updated every epoch.\n", - "\n", - "### Automatic saving\n", - "By default Lightning will save a checkpoint at the end of the first epoch in the working directory, which will be updated every epoch."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XGu0JULrg9l7" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(default_root_path=os.getcwd())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3s9OjkGuhq1W" - }, - "source": [ - "To change the checkpoint path pass in **default_root_dir=**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DgdxkrIQhvfw" - }, - "source": [ - "trainer = pl.Trainer(default_root_dir='/your/path/to/save/checkpoints')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qyvj_bkWrJiE" - }, - "source": [ - "\n", - "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. For example, if we want to save checkpoint based on the validation loss, logged as `val_loss`, you can pass:\n", - "\n", - "\n", - "```\n", - "checkpoint_callback = ModelCheckpoint(\n", - " filepath=os.getcwd(),\n", - " save_top_k=1,\n", - " verbose=True,\n", - " monitor='val_loss',\n", - " mode='min',\n", - " prefix=''\n", - ")\n", - "```\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YzYMivw1rO1O" - }, - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "trainer = pl.Trainer(checkpoint_callback=ModelCheckpoint(monitor='val_loss'))\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5hYs_FV8iDMn" - }, - "source": [ - "You can modify the behavior of checkpointing by creating your own callback, and passing it to the trainer. 
\n", - "You can control\n", - "* filepath- where logs are saved\n", - "* save_top_k- save k top models\n", - "* verbose\n", - "* monitor- the metric to monitor\n", - "* mode\n", - "* prefix\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Tb1K2VYDiNTu" - }, - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "# DEFAULTS used by the Trainer\n", - "checkpoint_callback = ModelCheckpoint(\n", - " filepath=os.getcwd(),\n", - " save_top_k=3,\n", - " verbose=True,\n", - " monitor='val_loss',\n", - " mode='min',\n", - " prefix='',\n", - ")\n", - "\n", - "trainer = Trainer(checkpoint_callback=checkpoint_callback)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YKhZ6xRojJcl" - }, - "source": [ - "You can disable checkpointing it by passing\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Yt8zd2ZFjOXX" - }, - "source": [ - "trainer = Trainer(checkpoint_callback=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HcLy8asCjrj9" - }, - "source": [ - "### Manual saving\n", - "\n", - "You can manually save checkpoints and restore your model from the checkpointed state.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kZSkMJf0jR4x" - }, - "source": [ - "trainer.fit(model)\n", - "trainer.save_checkpoint(\"example.ckpt\")\n", - "new_model = LitAutoEncoder.load_from_checkpoint(checkpoint_path=\"example.ckpt\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X2d9cjVPj7CP" - }, - "source": [ - "### Checkpoint Loading\n", - "To load a model along with its weights, biases and module_arguments use following method:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BpAFfg5zkFmH" - }, - "source": [ - "model = 
LitAutoEncoder.load_from_checkpoint(PATH)\n", - "\n", - "print(model.learning_rate)\n", - "# prints the learning_rate you used in this checkpoint\n", - "\n", - "model.eval()\n", - "y_hat = model(x)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTQ3mxSJkhFN" - }, - "source": [ - "But if you don’t want to use the values saved in the checkpoint, pass in your own here" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IoMcOh9-kfUP" - }, - "source": [ - "class LitAutoEncoder(LightningModule):\n", - "\n", - " def __init__(self, in_dim, out_dim):\n", - " super().__init__()\n", - " self.save_hyperparameters()\n", - " self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ITPVY8mNknut" - }, - "source": [ - "you can restore the model like this\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "H7XeRJzVkuY8" - }, - "source": [ - "# if you train and save the model like this it will use these values when loading\n", - "# the weights. 
But you can overwrite this\n", - "LitAutoEncoder(in_dim=32, out_dim=10)\n", - "\n", - "# uses in_dim=32, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "14WwGpnVk0a4" - }, - "source": [ - "# uses in_dim=128, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH, in_dim=128, out_dim=10)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bY5s6wP_k1CU" - }, - "source": [ - "\n", - "\n", - "## Restoring Training State (resume_from_checkpoint)\n", - "If your training was cut short for some reason, you can resume exactly from where you left off using the `resume_from_checkpoint` flag, which will automatically restore model, epoch, step, LR schedulers, apex, etc..." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9zfhHtyrk3rO" - }, - "source": [ - "model = LitAutoEncoder()\n", - "trainer = pl.Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')\n", - "\n", - "# automatically restores model, epoch, step, LR schedulers, apex, etc...\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xkKdvALFsmT2" - }, - "source": [ - "## weights_save_path\n", - "You can specify a directory for saving weights file using `weights_save_path`.\n", - "\n", - "(If you are using a custom checkpoint callback, the checkpoint callback will override this flag)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9OwHHFcCsrgT" - }, - "source": [ - "# save to your custom path\n", - "trainer = pl.Trainer(weights_save_path='my/path')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "PbNtlJ9Wsscf" - }, - "source": [ - "# if checkpoint callback used, then overrides the weights path\n", - "# **NOTE: this saves weights to some/path NOT my/path\n", - "checkpoint = ModelCheckpoint(filepath='some/path')\n", - "trainer = pl.Trainer(\n", - " checkpoint_callback=checkpoint,\n", - " weights_save_path='my/path'\n", - ")\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uDdxCuyHdWQt" - }, - "source": [ - "# Early stopping\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fqAy3ihRxTfR" - }, - "source": [ - "The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed, to help you avoid overfitting.\n", - "\n", - "To enable Early Stopping you can init the EarlyStopping callback, and pass it to `callbacks=` trainer flag. 
The callback will look for a logged metric to early stop on.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lFx976CheH93" - }, - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "trainer = pl.Trainer(callbacks=[EarlyStopping('val_loss')])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MwpJfTvjeOwF" - }, - "source": [ - "You can customize the callback using the following params:\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V6I9h6HteK2U" - }, - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "early_stop_callback = EarlyStopping(\n", - " monitor='val_accuracy',\n", - " min_delta=0.00,\n", - " patience=3,\n", - " verbose=False,\n", - " mode='max'\n", - ")\n", - "trainer = pl.Trainer(callbacks=[early_stop_callback])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7TAIerPYe_Q1" - }, - "source": [ - "The EarlyStopping callback runs at the end of every validation epoch, which, under the default configuration, happens after every training epoch. However, the frequency of validation can be modified by setting various parameters on the Trainer, for example check_val_every_n_epoch and val_check_interval. It must be noted that the patience parameter counts the number of validation epochs with no improvement, and not the number of training epochs. Therefore, with parameters check_val_every_n_epoch=10 and patience=3, the trainer will perform at least 40 training epochs before being stopped." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VoKrX2ENh9Fg" - }, - "source": [ - "# Logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-CQTPKd7iKLm" - }, - "source": [ - "Lightning has built in integration with various loggers such as TensorBoard, wandb, commet, etc.\n", - "\n", - "\n", - "You can pass any metrics you want to log during training to `self.log`, such as loss or accuracy. Similarly, pass in to self.log any metric you want to log during validation step.\n", - "\n", - "These values will be passed in to the logger of your choise. simply pass in any supported logger to logger trainer flag.\n", - "\n", - "\n", - "\n", - "Use the as`logger=` trainer flag to pass in a Logger, or iterable collection of Loggers, for experiment tracking.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ty5VPS3AiS8L" - }, - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger\n", - "\n", - "# default logger used by trainer\n", - "logger = TensorBoardLogger(\n", - " save_dir=os.getcwd(),\n", - " version=1,\n", - " name='lightning_logs'\n", - ")\n", - "trainer = pl.Trainer(logger=logger)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jc5oWNpoiuuc" - }, - "source": [ - "Lightning supports the use of multiple loggers, just pass a list to the Trainer.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BlYwMRRyivp_" - }, - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger\n", - "logger1 = TensorBoardLogger('tb_logs', name='my_model')\n", - "logger2 = TestTubeLogger('tb_logs', name='my_model')\n", - "trainer = pl.Trainer(logger=[logger1, logger2])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7EyspQPh7iQ" - }, - "source": [ - "## 
flush_logs_every_n_steps\n", - "\n", - "Use this flag to determine when logging to disc should happen." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Em_XvsmyiBbk" - }, - "source": [ - "trainer = pl.Trainer(flush_logs_every_n_steps=100)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_vDeKE98qsl1" - }, - "source": [ - "## log_every_n_steps\n", - "How often to add logging rows (does not write to disk)\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HkqD7D_0w1Tt" - }, - "source": [ - "trainer = pl.Trainer(log_every_n_steps=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9uw0gfe422CT" - }, - "source": [ - "# info logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dQXpt0aatDGo" - }, - "source": [ - "### default_root_dir\n", - "\n", - "---\n", - "\n", - "\n", - "\n", - "Default path for logs and weights when no logger or pytorch_lightning.callbacks.ModelCheckpoint callback passed. On certain clusters you might want to separate where logs and checkpoints are stored. If you don’t then use this argument for convenience. Paths can be local paths or remote paths such as s3://bucket/path or ‘hdfs://path/’. Credentials will need to be set up to use remote filepaths." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CMmID2Bts5W3" - }, - "source": [ - "## weights_summary\n", - "Prints a summary of the weights when training begins. Default is set to `top`- print summary of top level modules.\n", - "\n", - "Options: ‘full’, ‘top’, None." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "KTl6EdwDs6j2" - }, - "source": [ - "\n", - "# print full summary of all modules and submodules\n", - "trainer = pl.Trainer(weights_summary='full')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "R57cSLl9w9ma" - }, - "source": [ - "# don't print a summary\n", - "trainer = Trainer(weights_summary=None)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bSc2hU5AotAP" - }, - "source": [ - "# progress bar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgvbyDsBxcH6" - }, - "source": [ - "## process_position\n", - "\n", - "Orders the progress bar. Useful when running multiple trainers on the same node.\n", - "\n", - "(This argument is ignored if a custom callback is passed to callbacks)\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6ekz8Es8owDn" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(process_position=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "itivQFgEphBU" - }, - "source": [ - "## progress_bar_refresh_rate\n", - "\n", - "How often to refresh the progress bar (in steps). In notebooks, faster refresh rates (lower number) is known to crash them because of their screen refresh rates, so raise it to 50 or more." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GKe6eVxmplL5" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "8rDHJOJbxNtf" - }, - "source": [ - "# disable progress bar\n", - "trainer = Trainer(progress_bar_refresh_rate=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NCNvYLwjpWne" - }, - "source": [ - "# profiler" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pRknrG_zpY6M" - }, - "source": [ - "# to profile standard training events\n", - "trainer = pl.Trainer(profiler=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ji6aWpU73kMM" - }, - "source": [ - "You can also use Lightning AdvancedProfiler if you want more detailed information about time spent in each function call recorded during a given action. The output is quite verbose and you should only use this if you want very detailed reports.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "layG55pt316C" - }, - "source": [ - "from pytorch_lightning.profiler import AdvancedProfiler\n", - "\n", - "trainer = Trainer(profiler=AdvancedProfiler())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - } - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "goRmGIRI5cfC" + }, + "source": [ + "# Introduction to Lightning Flags ⚡🚩\n", + "\n", + "In this notebook, we'll go over the flags available in the `Trainer` object. Note that not everything will work in the Colab environment (multi-gpu, etc). 
This notebook accompanies the Trainer videos we'll be putting out.\n", + "\n", + "---\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jKj5lgdr5j48" + }, + "source": [ + "--- \n", + "### Setup \n", + "First thing first, we need to install Lightning. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UGjilEHk4vb7" + }, + "outputs": [], + "source": [ + "! pip install pytorch-lightning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zaVUShmQ5n8Y" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from argparse import ArgumentParser\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.data import random_split\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy\n", + "\n", + "from torchvision.datasets.mnist import MNIST\n", + "from torchvision import transforms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6tgkS8IYZwY_" + }, + "outputs": [], + "source": [ + "# ------------\n", + "# data\n", + "# ------------\n", + "pl.seed_everything(1234)\n", + "batch_size = 32\n", + "\n", + "# Init DataLoader from MNIST Dataset\n", + "\n", + "dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", + "mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())\n", + 
"mnist_train, mnist_val = random_split(dataset, [55000, 5000])\n", + "\n", + "train_loader = DataLoader(mnist_train, batch_size=batch_size)\n", + "val_loader = DataLoader(mnist_val, batch_size=batch_size)\n", + "test_loader = DataLoader(mnist_test, batch_size=batch_size)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gEulmrbxwaYL" + }, + "source": [ + "### Simple AutoEncoder Model\n", + "\n", + "Were gonna define a simple Lightning model so we can play with all the settings of the Lightning Trainer.\n", + "\n", + "LightningModule is simply pure Pytorch reorganized into hooks, that represents all the steps in the training process.\n", + "\n", + "You can use LightningModule hooks to control every part of your model, but for the purpose of this video we will use a very simple MNIST classifier, a model that takes 28*28 grayscale images of hand written images, and can predict the digit between 0-9.\n", + "\n", + "The LightningModule can encompass a single model, like an image classifier, or a deep learning system composed of multiple models, like this auto encoder that contains an encoder and a decoder.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x-34xKCI40yW" + }, + "outputs": [], + "source": [ + "class LitAutoEncoder(pl.LightningModule):\n", + "\n", + " def __init__(self, batch_size=32, lr=1e-3):\n", + " super().__init__()\n", + " self.encoder = nn.Sequential(\n", + " nn.Linear(28 * 28, 64),\n", + " nn.ReLU(),\n", + " nn.Linear(64, 3)\n", + " )\n", + " self.decoder = nn.Sequential(\n", + " nn.Linear(3, 64),\n", + " nn.ReLU(),\n", + " nn.Linear(64, 28 * 28)\n", + " )\n", + " self.batch_size=batch_size\n", + " self.learning_rate=lr\n", + "\n", + " def forward(self, x):\n", + " # in lightning, forward defines the prediction/inference actions\n", + " embedding = self.encoder(x)\n", + " return embedding\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " 
x = x.view(x.size(0), -1)\n", + " z = self.encoder(x)\n", + " x_hat = self.decoder(z)\n", + " loss = F.mse_loss(x_hat, x)\n", + " self.log('train_loss', loss)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " x = x.view(x.size(0), -1)\n", + " z = self.encoder(x)\n", + " x_hat = self.decoder(z)\n", + " loss = F.mse_loss(x_hat, x)\n", + " self.log('val_loss', loss)\n", + " \n", + " def test_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " x = x.view(x.size(0), -1)\n", + " z = self.encoder(x)\n", + " x_hat = self.decoder(z)\n", + " loss = F.mse_loss(x_hat, x)\n", + " self.log('test_loss', loss)\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)\n", + " return optimizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VbxcRCrxiYly" + }, + "source": [ + "You'll notice the LightningModule doesn't have epoch and batch loops, we're not calling model.train() and model.eval(), and no mentions of CUDA or hardware. That's because it is all automated by the Lightning Trainer. All the engineering boilerplate is automated by the trainer: \n", + "\n", + "* Training loops\n", + "* Evaluation and test loops\n", + "* Calling model.train(), model.eval(), no_grad at the right time\n", + "* CUDA or to_device calls\n", + "\n", + "It also allows you to train your models on different hardware like GPUs and TPUs without changing your code!\n", + "\n", + "\n", + "### To use the lightning trainer simply:\n", + "\n", + "1. init your LightningModule and datasets\n", + "\n", + "2. init lightning trainer\n", + "\n", + "3. call trainer.fit\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HOk9c4_35FKg" + }, + "outputs": [], + "source": [ + "#####################\n", + "# 1. Init Model\n", + "#####################\n", + "\n", + "model = LitAutoEncoder()\n", + "\n", + "#####################\n", + "# 2. 
Init Trainer\n", + "#####################\n", + "\n", + "# these 2 flags are explained in the later sections...but for short explanation:\n", + "# - progress_bar_refresh_rate: limits refresh rate of tqdm progress bar so Colab doesn't freak out\n", + "# - max_epochs: only run 2 epochs instead of default of 1000\n", + "trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=2)\n", + "\n", + "#####################\n", + "# 3. Train\n", + "#####################\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3meDako-Qa_6" + }, + "source": [ + "Our model is training just like that, using the Lightning defaults. The beauty of Lightning is that everything is easily configurable.\n", + "In our next videos we're going to show you all the ways you can control your Trainer to do things like controlling your training, validation and test loops, running on GPUs and TPUs, checkpointing, early stopping, and a lot more.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z_Wry2MckQkI" + }, + "source": [ + "# Training loop and eval loop Flags" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0MkI1xB2vsLj" + }, + "source": [ + "\n", + "To really scale up your networks, you can use accelerators like GPUs. GPUs, or Graphics Processing Units, parallelize matrix multiplications which enable speed ups of at least 100x over training on CPUs.\n", + "\n", + "Let's say you have a machine with 8 GPUs on it. You can set this flag to 1, 4, or 8 GPUs and lightning will automatically distribute your training for you.\n", + "\n", + "```\n", + "trainer = pl.Trainer(gpus=1)\n", + "```\n", + "\n", + "---------\n", + "\n", + "Lightning makes your code hardware agnostic... This means, you can switch between CPUs, GPUs without code changes.\n", + "\n", + "However, it requires forming good PyTorch habits:\n", + "\n", + "1. First, remove the .cuda() or .to() calls in your code.\n", + "2. 
Second, when you initialize a new tensor, set the device=self.device in the call since every LightningModule knows what gpu index or TPU core it is on.\n", + "\n", + "You can also use type_as, or you can register the tensor as a buffer in your module’s __init__ method with register_buffer().\n", + "\n", + "```\n", + "# before lightning\n", + "def forward(self, x):\n", + " z = torch.Tensor(2, 3)\n", + " z = z.cuda(0)\n", + "\n", + "# with lightning\n", + "def forward(self, x):\n", + " z = torch.Tensor(2, 3)\n", + " z = z.type_as(x)\n", + "```\n", + "\n", + "\n", + "```\n", + "class LitModel(LightningModule):\n", + "\n", + " def __init__(self):\n", + " ...\n", + " self.register_buffer(\"sigma\", torch.eye(3))\n", + " # you can now access self.sigma anywhere in your module\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hw6jJhhjvlSL" + }, + "source": [ + "Lightning Trainer automates all the engineering boilerplate like iterating over epochs and batches, training eval and test loops, CUDA and to(device) calls, calling model.train and model.eval.\n", + "\n", + "You still have full control over the loops, by using the following trainer flags:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pT5-ETH9eUg6" + }, + "source": [ + "## Calling validation steps\n", + "Sometimes, training an epoch may be pretty fast, like minutes per epoch. In this case, you might not need to validate on every epoch. 
Instead, you can actually validate after a few epochs.\n", + "\n", + "Use `check_val_every_n_epoch` flag to control the frequency of validation step:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z-EMVvKheu3D" + }, + "outputs": [], + "source": [ + "# run val loop every 10 training epochs\n", + "trainer = pl.Trainer(check_val_every_n_epoch=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UOzZr9S2UcSO" + }, + "source": [ + "## val_check_interval\n", + "\n", + "In some cases where your epoch is very long, you might want to check validation within an epoch.\n", + "\n", + "You can also run validation step within your training epochs, by setting `val_check_interval` flag.\n", + "\n", + "Set `val_check_interval` to a float between [0.0 to 1.0] to check your validation set within a training epoch. For example, setting it to 0.25 will check your validation set 4 times during a training epoch.\n", + "\n", + "Default is set to 1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9kbUbvrUVLrT" + }, + "outputs": [], + "source": [ + "# check validation set 4 times during a training epoch\n", + "trainer = pl.Trainer(val_check_interval=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Onm1gBsKVaw4" + }, + "source": [ + "When you have iterable data sets, or when streaming data for production use cases, it is useful to check the validation set every number of steps. 
\n", + "Set val_check_interval to an int:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "psn6DVb5Vi85" + }, + "outputs": [], + "source": [ + "# check validation set every 1000 training batches\n", + "# use this when using iterableDataset and your dataset has no length\n", + "# (ie: production cases with streaming data)\n", + "trainer = pl.Trainer(val_check_interval=1000)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QkoYonrWkb7-" + }, + "source": [ + "## num_sanity_val_steps \n", + "\n", + "You may have run into an issue, where you have a bug in your validation loop, but won't catch it until your training loop ends.\n", + "\n", + "and if your training loop takes hours or days, you will waste valuable compute.\n", + "\n", + "Instead, lightning automatically runs through 2 steps of validation in the beginning to catch these kinds of bugs up front.\n", + "\n", + "\n", + "The `num_sanity_val_steps` flag can help you run n batches of validation before starting the training routine.\n", + "\n", + "You can set it to 0 to turn it off" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOcT-ugSkiKW" + }, + "outputs": [], + "source": [ + "# turn it off\n", + "trainer = pl.Trainer(num_sanity_val_steps=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zS0ob1ZmTw56" + }, + "source": [ + "Set it to -1 to check all validation data before training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rzqvjA4UT263" + }, + "outputs": [], + "source": [ + "# check all validation data\n", + "trainer = pl.Trainer(num_sanity_val_steps=-1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMB41wq4T3Z2" + }, + "source": [ + "Or use any arbitrary 
number of validation steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGP78aQzT7VS" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(num_sanity_val_steps=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H-xaYRtd1rb-" + }, + "source": [ + "## Limit train, validation, and test batches\n", + "\n", + "You can set limits on how much of training, validation and test dataset you want your model to check. This is useful if you have really large validation or tests sets, for debugging or testing something that happens at the end of an epoch.\n", + "\n", + "Set the flag to int to specify the number of batches to run\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XiK5cFKL1rcA" + }, + "outputs": [], + "source": [ + "# run for only 10 batches\n", + "trainer = pl.Trainer(limit_test_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y4LK0g65RrBm" + }, + "source": [ + "For example, some metrics need to be computed on the entire validation results, such as AUC ROC. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8MmeRs2DR3dD" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(limit_val_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xmigcNa1A2Vy" + }, + "source": [ + "You can use a float to limit the batches be percentage of the set on every epoch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W7uGJt8nA4tv" + }, + "outputs": [], + "source": [ + "# run through only 25% of the test set each epoch\n", + "trainer = pl.Trainer(limit_test_batches=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YRI8THtUN7_e" + }, + "source": [ + "# Training on GPUs\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8FFkX_FwlfE" + }, + "source": [ + "To run on 1 GPU set the flag to 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nnzkf3KaOE27" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cxBg47s5PB1P" + }, + "source": [ + "to run on 2 or 4 GPUs, set the flag to 2 or 4." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cSEM4ihLrohT" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZE6ZgwtNudro" + }, + "source": [ + "You can also select which GPU devices to run on, using a list of indices like [1, 4] \n", + "\n", + "or a string containing a comma separated list of GPU ids like '1,2'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gQkJtq0urrjq" + }, + "outputs": [], + "source": [ + "# list: train on GPUs 1, 4 (by bus ordering)\n", + "# trainer = Trainer(gpus='1, 4') # equivalent\n", + "trainer = pl.Trainer(gpus=[1, 4])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XghDPad4us74" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=list(range(4)))\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6FVkKHpSPMTW" + }, + "source": [ + "You can use all the GPUs you have available by setting `gpus=-1`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r6cKQijYrtPe" + }, + "outputs": [], + "source": [ + "# trainer = Trainer(gpus='-1') - equivalent\n", + "trainer = pl.Trainer(gpus=-1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2C-fNLm3UGCV" + }, + "source": [ + "Lightning uses the PCI bus_id as the index for ordering GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_V75s7EhOFhE" + }, + "source": [ + "### `auto_select_gpus`\n", + "\n", + "You can save on GPUs by running in “exclusive mode”, meaning only one process at a time can access them. 
If you're not sure which GPUs you should use when running exclusive mode, Lightning can automatically find unoccupied GPUs for you. \n", + "\n", + "Simply specify the number of gpus as an integer `gpus=k`, and set the trainer flag `auto_select_gpus=True`. Lightning will automatically help you find k gpus that are not occupied by other processes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_Sd3XFsAOIwd" + }, + "outputs": [], + "source": [ + "# enable auto selection (will find two available gpus on system)\n", + "trainer = pl.Trainer(gpus=2, auto_select_gpus=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a5JGSBMQhJNp" + }, + "source": [ + "## analyzing GPU usage\n", + "\n", + "### log_gpu_memory\n", + "\n", + "This is useful to analyze the memory usage of your GPUs.\n", + "\n", + "To get the GPU memory usage for every GPU on the master node, set the flag to log_gpu_memory=all.\n", + "\n", + "Under the hood, lightning uses the nvidia-smi command which may slow your training down.\n", + "\n", + "Your logs can become overwhelmed if you log the usage from many GPUs at once. In this case, you can also set the flag to min_max which will log only the min and max usage across all the GPUs of the master node.\n", + "\n", + "Note that lightning is not logging the usage across all nodes for performance reasons." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "idus3ZGahOki" + }, + "outputs": [], + "source": [ + "# log all the GPUs (on master node only)\n", + "trainer = Trainer(log_gpu_memory='all')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-mevgiy_hkip" + }, + "source": [ + "To avoid the performance decrease you can also set `log_gpu_memory=min_max` to only log the min and max memory on the master node.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SlvLJnWyhs7J" + }, + "outputs": [], + "source": [ + "# log only the min and max memory on the master node\n", + "trainer = Trainer(log_gpu_memory='min_max')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K82FLLIJVQG3" + }, + "source": [ + "\n", + "But what if you want to train on multiple machines and not just one?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YViQ6PXesAue" + }, + "source": [ + "# Training on multiple GPUs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WacbBQUivxQq" + }, + "source": [ + "Lightning makes your models hardware agnostic, and you can run on GPUs with a flip of a flag. Lightning also supports training on multiple GPUs across many machines.\n", + "\n", + "You can do this by setting the num_nodes flag.\n", + "\n", + "The world size, or the total number of GPUs you are using, will be gpus*num_nodes.\n", + "\n", + "If I set gpus=8 and num_nodes=32 then I will be training on 256 GPUs."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5iKckmDvr8zZ" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=8, num_nodes=32)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GgcSbDjjlSTh" + }, + "source": [ + "## distributed backends\n", + "\n", + "Under the hood, Lightning uses distributed data parallel (or DDP) by default to distribute training across GPUs.\n", + "\n", + "This Lightning implementation of DDP calls your script under the hood multiple times with the correct environment variables.\n", + "\n", + "Under the hood it's as if you had called your script like this:\n", + "\n", + "1. Each GPU across each node gets its own process.\n", + "2. Each GPU gets visibility into a subset of the overall dataset. It will only ever see that subset.\n", + "3. Each process inits the model. (Make sure to set the random seed so that each model initializes with the same weights.)\n", + "4. Each process performs a full forward and backward pass in parallel.\n", + "5. The gradients are synced and averaged across all processes.\n", + "6. 
Each process updates its optimizer.\n", + "If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n_Brr7F5wdtj" + }, + "outputs": [], + "source": [ + "# ddp = DistributedDataParallel\n", + "# trainer = pl.Trainer(gpus=2, num_nodes=2) equivalent\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edxHyttC5J3e" + }, + "source": [ + "DDP is the fastest and recommended way to distribute your training, but you can pass in other backends to the `distributed_backend` trainer flag, when DDP is not supported.\n", + "\n", + "DDP isn't available in\n", + "* Jupyter Notebook, Google COLAB, Kaggle, etc.\n", + "* If you have a nested script without a root package\n", + "* or if your script needs to invoke .fit or .test multiple times" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZDh96mavxHxf" + }, + "source": [ + "### DDP_SPAWN\n", + "\n", + "In these cases, you can use `ddp_spawn` instead. `ddp_spawn` is exactly like DDP except that it uses `.spawn()` to start the training processes."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JM5TKtgLxo37" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp_spawn')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sebhVE3qrhKK" + }, + "source": [ + "We STRONGLY discourage this use because it has limitations (due to Python and PyTorch):\n", + "\n", + "* Since .spawn() trains the model in subprocesses, the model on the main process does not get updated.\n", + "\n", + "* Dataloader(num_workers=N), where N is large, bottlenecks training with DDP… ie: it will be VERY slow or won’t work at all. This is a PyTorch limitation.\n", + "\n", + "* Forces everything to be picklable.\n", + "\n", + "DDP is MUCH faster than DDP_spawn. To be able to use DDP we recommend you: \n", + "\n", + "1. Install a top-level module for your project using setup.py\n", + "\n", + "```\n", + "# setup.py\n", + "#!/usr/bin/env python\n", + "\n", + "from setuptools import setup, find_packages\n", + "\n", + "setup(name='src',\n", + " version='0.0.1',\n", + " description='Describe Your Cool Project',\n", + " author='',\n", + " author_email='',\n", + " url='https://github.com/YourSeed', # REPLACE WITH YOUR OWN GITHUB PROJECT LINK\n", + " install_requires=[\n", + " 'pytorch-lightning'\n", + " ],\n", + " packages=find_packages()\n", + " )\n", + "\n", + "```\n", + "\n", + "2. Setup your project like so:\n", + "\n", + "```\n", + "/project\n", + " /src\n", + " some_file.py\n", + " /or_a_folder\n", + " setup.py\n", + "```\n", + "3. Install as a root-level package\n", + "```\n", + "cd /project\n", + "pip install -e .\n", + "```\n", + "4. 
You can then call your scripts anywhere\n", + "```\n", + "cd /project/src\n", + "\n", + "python some_file.py --distributed_backend 'ddp' --gpus 8\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cmB3I_oyw7a8" + }, + "source": [ + "### DP\n", + "\n", + "If you're using Windows, DDP is not supported. You can use `dp` for DataParallel instead: DataParallel uses multithreading, instead of multiprocessing. It splits a batch across k GPUs. That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples, after which the root node will aggregate the results.\n", + "\n", + "DP use is discouraged by PyTorch and Lightning. Use DDP which is more stable and at least 3x faster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OO-J0ISvlVCg" + }, + "outputs": [], + "source": [ + "# dp = DataParallel\n", + "trainer = pl.Trainer(gpus=2, distributed_backend='dp')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y7E2eHZKwUn9" + }, + "source": [ + "### DDP2\n", + "\n", + "In certain cases, it’s advantageous to use ***all*** batches on the same machine, instead of a subset. For instance, in self-supervised learning, a common performance boost comes from increasing the number of negative samples.\n", + "\n", + "In this case, we can use DDP2 which behaves like DP in a machine and DDP across nodes. 
DDP2 does the following:\n", + "\n", + "* Copies a subset of the data to each node.\n", + "* Inits a model on each node.\n", + "* Runs a forward and backward pass using DP.\n", + "* Syncs gradients across nodes.\n", + "* Applies the optimizer updates.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y4xweqL3xHER" + }, + "outputs": [], + "source": [ + "# ddp2 = DistributedDataParallel + dp\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lhKNCnveeeq5" + }, + "source": [ + "- The second mode is ddp_spawn. This works like ddp, but instead of calling your script multiple times, lightning will use multiprocessing spawn to start a subprocess per GPU. \n", + "\n", + "However, you should be careful of mixing this mode with num_workers > 0 in your dataloaders because it will bottleneck your training. This is a current known limitation of PyTorch which is why we recommend using our ddp implementation instead.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUf9ANyQkFFO" + }, + "source": [ + "\n", + "### mocking ddp\n", + "\n", + "Testing or debugging DDP can be hard, so we have a distributed backend that simulates ddp on cpus to make it easier. Set `num_processes` to a number greater than 1 when using distributed_backend=\"ddp_cpu\" to mimic distributed training on a machine without GPUs. Note that while this is useful for debugging, it will not provide any speedup, since single-process Torch already makes efficient use of multiple CPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSal5Da9kHOf" + }, + "outputs": [], + "source": [ + "# Simulate DDP for debugging on your GPU-less laptop\n", + "trainer = Trainer(distributed_backend=\"ddp_cpu\", num_processes=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Br_btCy5lgES" + }, + "source": [ + "# Training on TPUs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DXkBNITdv44d" + }, + "source": [ + "Another option for accelerating your training is using TPUs.\n", + "A TPU is a Tensor Processing Unit, designed specifically for deep learning. Each TPU has 8 cores where each core is optimized for 128x128 matrix multiplies. Google estimates that 8 TPU cores are about as fast as 4 V100 GPUs!\n", + "\n", + "A TPU pod hosts many TPUs on it. Currently, TPU pod v2 has 2048 cores! You can request a full pod from Google Cloud or a “slice” which gives you some subset of those 2048 cores.\n", + "\n", + "At this moment, TPUs are available on Google Cloud (GCP), Google Colab and Kaggle Environments.\n", + "\n", + "Lightning supports training on TPUs without any code adjustments to your model. Just like when using GPUs, Lightning automatically inserts the correct samplers - no need to do this yourself!\n", + "\n", + "Under the hood, Lightning uses the XLA framework developed jointly by the Facebook and Google XLA teams. And we want to recognize their efforts in advancing TPU adoption of PyTorch.\n", + "\n", + "## tpu_cores\n", + "To train on TPUs, set the tpu_cores flag.\n", + "\n", + "When using Colab or Kaggle, the allowed values are 1 or 8 cores. When using Google Cloud, any value above 8 is allowed.\n", + "\n", + "Your effective batch size is the batch size passed into a dataloader times the total number of tpu cores."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "itP9y70gmD9M" + }, + "outputs": [], + "source": [ + "# int: train on a single core\n", + "trainer = pl.Trainer(tpu_cores=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NJKnzPb3mKEg" + }, + "outputs": [], + "source": [ + "# int: train on all cores few cores\n", + "trainer = pl.Trainer(tpu_cores=8)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8a4exfWUmOHq" + }, + "source": [ + "You can also choose which TPU core to train on, by passing a list [1-8]. This is not an officially supported use case but we are working with the XLA team to improve this user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "S6OrjE_bmT-_" + }, + "outputs": [], + "source": [ + "# list: train on a single selected core\n", + "trainer = pl.Trainer(tpu_cores=[2])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Afqx3sFUmfWD" + }, + "source": [ + "To train on more than 8 cores (ie: a POD), submit this script using the xla_dist script.\n", + "\n", + "\n", + "\n", + "```\n", + "python -m torch_xla.distributed.xla_dist\n", + "--tpu=$TPU_POD_NAME\n", + "--conda-env=torch-xla-nightly\n", + "--env=XLA_USE_BF16=1\n", + "-- python your_trainer_file.py\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ncPvbUVQqKOh" + }, + "source": [ + "# Advanced distributed training\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4MP7bEgnv7qK" + }, + "source": [ + "\n", + "Lightning supports distributed training across multiple GPUs and TPUs out of the box by setting trainer flags, but it also allows you to control the way sampling is done if you need to." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wdHiTfAMepKH" + }, + "source": [ + "## replace_sampler_ddp\n", + "In PyTorch, you must use torch.utils.data.distributed.DistributedSampler for multi-node or GPU training. The sampler makes sure each GPU sees the appropriate part of your data.\n", + "\n", + "```\n", + "# without lightning\n", + "def train_dataloader(self):\n", + " dataset = MNIST(...)\n", + " sampler = None\n", + "\n", + " if self.on_tpu:\n", + " sampler = DistributedSampler(dataset)\n", + "\n", + " return DataLoader(dataset, sampler=sampler)\n", + "```\n", + "Lightning adds the correct samplers when needed, so no need to explicitly add samplers. By default it will add `shuffle=True` for train sampler and `shuffle=False` for val/test sampler.\n", + "\n", + "If you want to customize this behaviour, you can set `replace_sampler_ddp=False` and add your own distributed sampler.\n", + "\n", + "(note: For iterable datasets, we don’t do this automatically.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZfmcB_e_7HbE" + }, + "outputs": [], + "source": [ + "sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)\n", + "dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)\n", + "\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, replace_sampler_ddp=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-IOhk1n0lL3_" + }, + "source": [ + "## prepare_data_per_node\n", + "\n", + "When doing multi NODE training, if your nodes share the same file system, then you don't want to download data more than once to avoid possible collisions. \n", + "\n", + "Lightning automatically calls the prepare_data hook on the root GPU of the master node (ie: only a single GPU).\n", + "\n", + "In some cases where your nodes don't share the same file system, you need to download the data on each node. 
In this case you can set this flag to true and lightning will download the data on the root GPU of each node.\n", + "\n", + "This flag is defaulted to True." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WFBMUR48lM04" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2, num_nodes=2, prepare_data_per_node=False)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FKBwXqo4q-Vp" + }, + "source": [ + "## sync_batchnorm\n", + "\n", + "Batch norm is computed per GPU/TPU. This flag enables synchronization between batchnorm layers across all GPUs.\n", + "It is recommended if you have small batch sizes.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GhaCLTEZrAQi" + }, + "outputs": [], + "source": [ + "trainer = Trainer(gpus=4, sync_batchnorm=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XuFA7VTFMY9-" + }, + "source": [ + "# Debugging flags\n", + "\n", + "Lightning offers a couple of flags to make debugging your models easier:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AKoS3fdml4Jx" + }, + "source": [ + "## Fast Dev Run\n", + "\n", + "To help you save time debugging, your first run should use the fast_dev_run flag.\n", + "\n", + "This won't generate logs or save checkpoints but will touch every line of your code to make sure that it is working as intended.\n", + "\n", + "Think about this flag like a compiler. 
You make changes to your code, and run Trainer with this flag to verify that your changes are bug free.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L5vuG7GSmhzK" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(fast_dev_run=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HRP1qQR5nT4p" + }, + "source": [ + "## overfit_batches\n", + "\n", + "Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it.\n", + "\n", + "Useful for quickly debugging or trying to overfit on purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NTM-dqGMnXms" + }, + "outputs": [], + "source": [ + "# use only 1% of the train set (and use the train set for val and test)\n", + "trainer = pl.Trainer(overfit_batches=0.01)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c0LV0gC3nl1X" + }, + "outputs": [], + "source": [ + "# overfit on 10 of the same batches\n", + "trainer = pl.Trainer(overfit_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lt3UHU6WgtS_" + }, + "source": [ + "Or a float to represent percentage of data to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K3yUqADhgnkf" + }, + "outputs": [], + "source": [ + "# run through only 25% of the test set each epoch\n", + "trainer = pl.Trainer(limit_test_batches=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ODN66NeVg_2o" + }, + "source": [ + "In the case of multiple test dataloaders, the limit applies to each 
dataloader individually.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8aQx5SLeMz1R" + }, + "source": [ + "# accumulate_grad_batches\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g8GczZXFwKC7" + }, + "source": [ + "The batch size controls the accuracy of the estimate of the gradients. Small batch sizes use less memory, but decrease accuracy. When training large models, such as NLP transformers, it is useful to accumulate gradients over several batches before calling optimizer.step(). It allows for bigger batch sizes than what can actually fit on a GPU/TPU in a single step.\n", + "\n", + "Use accumulate_grad_batches to accumulate gradients every k batches or as set up in the dict. Trainer also calls optimizer.step() for the last indivisible step number.\n", + "\n", + "For example, set accumulate_grad_batches to 4 to accumulate every 4 batches. In this case the effective batch size is batch_size*4, so if your batch size is 32, effectively it will be 128." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2jB6-Z_yPhhf" + }, + "outputs": [], + "source": [ + "# accumulate every 4 batches (effective batch size is batch*4)\n", + "trainer = pl.Trainer(accumulate_grad_batches=4)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_Yi-bdTOgINC" + }, + "source": [ + "You can also pass a dictionary to specify different accumulation per epoch. We can set it to `{5: 3, 10: 20}` to have no accumulation for epochs 1 to 4, accumulate 3 batches for epoch 5 to 10, and 20 batches after that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X3xsoZ3YPgBv" + }, + "outputs": [], + "source": [ + "# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. 
accumulate 20 after that\n", + "trainer = pl.Trainer(accumulate_grad_batches={5: 3, 10: 20})\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "myzH8mV4M1_9" + }, + "source": [ + "# 16 bit precision\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v9EaFAonwOk6" + }, + "source": [ + "Most deep learning frameworks like PyTorch, train with 32-bit floating point arithmetic. \n", + "\n", + "But many models can still achieve full accuracy using half the precision.\n", + "\n", + "In 2017, NVIDIA researchers successfully used a combination of 32 and 16 bit precision (also known as mixed precision) and achieved the same accuracy as 32 bit precision training.\n", + "\n", + "The main two advantages are:\n", + "\n", + "- a reduction in memory requirements which enables larger batch sizes and models.\n", + "- and a speed up in compute. On ampere, turing and volta architectures 16 bit precision models can train at least 3 times faster.\n", + "\n", + "As of PyTorch 1.6, NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. \n", + "\n", + "This package supersedes the apex package developed by NVIDIA." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TjNypZPHnxvJ" + }, + "source": [ + "## precision\n", + "\n", + "Use precision flag to switch between full precision (32) to half precision (16). 
Can be used on CPU, GPU or TPUs.\n", + "\n", + "When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit.\n", + "\n", + "If used on TPU, Lightning will use torch.bfloat16, but tensor printing will still show torch.float32" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kBZKMVx1nw-D" + }, + "outputs": [], + "source": [ + "# 16-bit precision\n", + "trainer = pl.Trainer(gpus=1, precision=16)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VJGj3Jh7oQXU" + }, + "source": [ + "In earlier versions of Lightning, we used NVIDIA Apex for 16-bit precision. Apex was the first library to attempt 16-bit, and the automatic mixed precision library (amp) has since been merged into core PyTorch as of 1.6.\n", + "\n", + "If you insist on using Apex, you can set the amp_backend flag to 'apex' and install Apex on your own." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BDV1trAUPc9h" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HK5c_aVfNV4e" + }, + "source": [ + "## amp_level\n", + "Apex includes 4 optimization levels:\n", + "O0 (FP32 training)\n", + "O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16.\n", + "O2 (Fast Mixed Precision): this is the standard mixed precision training. It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights.\n", + "O3 (FP16 training): full FP16. 
Passing keep_batchnorm_fp32=True can speed things up as cudnn batchnorm is faster anyway.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FshMFPowNbWt" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex', amp_level='O2')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y8KEr1YvNgkC" + }, + "source": [ + "# `auto_scale_batch_size`\n", + "\n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7F1pKFIuwSFl" + }, + "source": [ + "Lightning can help you improve your model by using auto_scale_batch_size flag, which tries to find the largest batch size that fits into memory, before you start your training.\n", + "Larger batch size often yields better estimates of gradients, but may also result in longer training time. \n", + "\n", + "Set it to True to initially run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9_jE-iyyheIv" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(auto_scale_batch_size=True)\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yaHsJvwFhNJt" + }, + "source": [ + "You can set the value to `power`. 
`power` scaling starts from a batch size of 1 and keeps doubling the batch size until an out-of-memory (OOM) error is encountered.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qx0FbQrphgw1" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(auto_scale_batch_size='power')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8bwgVF9zhZ75" + }, + "source": [ + "You can also set it to `binsearch`, that continues to finetune the batch size by performing a binary search.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QObXNs3yNrg9" + }, + "outputs": [], + "source": [ + "# run batch size scaling, result overrides hparams.batch_size\n", + "trainer = pl.Trainer(auto_scale_batch_size='binsearch')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5OWdhSsZjqW7" + }, + "source": [ + "This feature expects that a batch_size field in the hparams of your model, i.e., model.hparams.batch_size should exist and will be overridden by the results of this algorithm. \n", + "\n", + "Additionally, your train_dataloader() method should depend on this field for this feature to work.\n", + "\n", + "The algorithm in short works by:\n", + "1. Dumping the current state of the model and trainer\n", + "\n", + "2. Iteratively until convergence or maximum number of tries max_trials (default 25) has been reached:\n", + "* Call fit() method of trainer. This evaluates steps_per_trial (default 3) number of training steps. Each training step can trigger an OOM error if the tensors (training batch, weights, gradients etc.) 
allocated during the steps have a too large memory footprint.\n", + " * If an OOM error is encountered, decrease the batch size\n", + " * Else increase it.\n", + "* How much the batch size is increased/decreased is determined by the chosen strategy.\n", + "\n", + "3. The found batch size is saved to model.hparams.batch_size\n", + "\n", + "4. Restore the initial state of model and trainer\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q4CvxfZmOWBd" + }, + "source": [ + "# `auto_lr_find`\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j85e8usNwdBV" + }, + "source": [ + "Selecting a good learning rate for your deep learning training is essential for both better performance and faster convergence.\n", + "\n", + "Even optimizers such as Adam that are self-adjusting the learning rate can benefit from more optimal choices.\n", + "\n", + "To reduce the amount of guesswork concerning choosing a good initial learning rate, you can use Lightning auto learning rate finder.\n", + "\n", + "The learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result of this is a lr vs. loss plot that can be used as guidance for choosing an optimal initial lr.\n", + "\n", + "\n", + "warning: For the moment, this feature only works with models having a single optimizer. LR support for DDP is not implemented yet, it is coming soon.\n", + "\n", + "\n", + "***auto_lr_find=***\n", + "\n", + "In the most basic use case, this feature can be enabled during trainer construction with Trainer(auto_lr_find=True).\n", + "When .fit(model) is called, the LR finder will automatically run before any training is done. The lr that is found and used will be written to the console and logged together with all other hyperparameters of the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iuhve9RBOfFh" + }, + "outputs": [], + "source": [ + "# default used by the Trainer (no learning rate finder)\n", + "trainer = pl.Trainer(mnist_model, auto_lr_find=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BL-gjXNCPDXk" + }, + "source": [ + "This flag sets your learning rate which can be accessed via self.lr or self.learning_rate.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wEb-vIMmPJQf" + }, + "outputs": [], + "source": [ + "class LitModel(LightningModule):\n", + "\n", + " def __init__(self, learning_rate):\n", + " self.learning_rate = learning_rate\n", + "\n", + " def configure_optimizers(self):\n", + " return Adam(self.parameters(), lr=(self.lr or self.learning_rate))\n", + "\n", + "# finds learning rate automatically\n", + "# sets hparams.lr or hparams.learning_rate to that learning rate\n", + "trainer = pl.Trainer(mnist_model, auto_lr_find=True)\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RweqvpnVPPSh" + }, + "source": [ + "To use an arbitrary value set it as auto_lr_find\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4LKI39IfPLJv" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(mnist_model, auto_lr_find='my_value')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9VAhPRKbPX-m" + }, + "source": [ + "Under the hood, when you call tune it runs the learning rate finder.\n", + "\n", + "If you want to inspect the results of the learning rate finder before doing any actual training or just play around with the parameters of the algorithm, this can be done by invoking the lr_find method of the trainer. 
A typical example of this would look like\n", + "\n", + "\n", + "```\n", + "trainer = pl.Trainer(auto_lr_find=True)\n", + "\n", + "# Run learning rate finder\n", + "lr_finder = trainer.lr_find(model)\n", + "\n", + "# Results can be found in\n", + "lr_finder.results\n", + "\n", + "# Plot with\n", + "fig = lr_finder.plot(suggest=True)\n", + "fig.show()\n", + "\n", + "# Pick point based on plot, or get suggestion\n", + "new_lr = lr_finder.suggestion()\n", + "\n", + "# update hparams of the model\n", + "model.hparams.lr = new_lr\n", + "\n", + "# Fit model\n", + "trainer.fit(model)\n", + "```\n", + "\n", + "The figure produced by lr_finder.plot() should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). This is the point returned by lr_finder.suggestion().\n", + "\n", + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tn1RV-jfOjt1" + }, + "source": [ + "# `benchmark`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rsmTl5zfwjM3" + }, + "source": [ + "You can try to speed up your system by setting `benchmark=True`, which enables cudnn.benchmark. This flag is likely to increase the speed of your system if your input sizes don’t change. This flag makes cudnn auto-tuner look for the optimal set of algorithms for the given hardware configuration. This usually leads to faster runtime.\n", + "But if your input sizes change at each iteration, then cudnn will benchmark every time a new size appears, possibly leading to worse runtime performance."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dWr-OCBgQCeb" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, benchmark=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qwAvSKYGa24K" + }, + "source": [ + "# `deterministic`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tl5mfmafwmat" + }, + "source": [ + "PyTorch does not guarantee reproducible results, even when using identical seeds. To guarantee reproducible results, you can remove most of the randomness from your process by setting the `deterministic` flag to True.\n", + "\n", + "Note that it might make your system slower." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mhv5LZ3HbNCK" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, deterministic=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u_5eJSvTf60f" + }, + "source": [ + "# Exploding and vanishing gradients" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B6drjh4pq6Jv" + }, + "source": [ + "## track_grad_norm\n", + "\n", + "You can debug your grad norm to identify exploding or vanishing gradients using the `track_grad_norm` flag.\n", + "\n", + "Set the value to 2 to track the 2-norm, or to p to track any p-norm." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2taHUir8rflR" + }, + "outputs": [], + "source": [ + "# track the 2-norm\n", + "trainer = pl.Trainer(track_grad_norm=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3vHKxmruk62f" + }, + "source": [ + "May be set to ‘inf’ to track the infinity-norm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "g7TbD6SxlAjP" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(track_grad_norm='inf')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TcMlRe7ywpe6" + }, + "source": [ + "## Gradient clipping\n", + "\n", + "\n", + "Exploding gradients refer to the problem that the gradients get too large and overflow in training, making the model unstable. Gradient clipping will ‘clip’ the gradients or cap them to a Threshold value to prevent the gradients from getting too large. To avoid this, we can set `gradient_clip_val` (default is set to 0.0).\n", + "\n", + "[when to use it, what are relevant values]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jF9JwmbOgOWF" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gradient_clip_val=0.1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ggb4MkkQrr1h" + }, + "source": [ + "# truncated_bptt_steps\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s1Iu6PyAw9_r" + }, + "source": [ + "If you have a large recurrent model, you can use truncated_bptt_steps flag to split up the backprop over portions of the sequence. 
This flag will automatically truncate your batches and the trainer will apply Truncated Backprop to it.\n", + "\n", + "Make sure your batches have a sequence dimension.\n", + "\n", + "Lightning takes care of splitting your batch along the time-dimension.\n", + "```\n", + "# we use the second as the time dimension\n", + "# (batch, time, ...)\n", + "sub_batch = batch[0, 0:t, ...]\n", + "Using this feature requires updating your LightningModule’s pytorch_lightning.core.LightningModule.training_step() to include a hiddens arg with the hidden\n", + "\n", + "# Truncated back-propagation through time\n", + "def training_step(self, batch, batch_idx, hiddens):\n", + " # hiddens are the hiddens from the previous truncated backprop step\n", + " out, hiddens = self.lstm(data, hiddens)\n", + "\n", + " return {\n", + " \"loss\": ...,\n", + " \"hiddens\": hiddens # remember to detach() this\n", + " }\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WiTF1VMtruMU" + }, + "outputs": [], + "source": [ + "# backprop every 5 steps in a batch\n", + "trainer = pl.Trainer(truncated_bptt_steps=5)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8XI_kEWkS-nT" + }, + "source": [ + "To modify how the batch is split, override pytorch_lightning.core.LightningModule.tbptt_split_batch():\n", + "\n", + "```\n", + "class LitMNIST(LightningModule):\n", + " def tbptt_split_batch(self, batch, split_size):\n", + " # do your own splitting on the batch\n", + " return splits\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oLbEmbmupwQ8" + }, + "source": [ + "# reload_dataloaders_every_epoch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLdNGVv9xD_L" + }, + "source": [ + "Set to True to reload dataloaders every epoch (instead of loading just once in the beginning of training).\n", + "\n", + "```\n", + "# if False (default)\n", + 
"train_loader = model.train_dataloader()\n", + "for epoch in epochs:\n", + " for batch in train_loader:\n", + " ...\n", + "\n", + "# if True\n", + "for epoch in epochs:\n", + " train_loader = model.train_dataloader()\n", + " for batch in train_loader:\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "10AXthXxp311" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(reload_dataloaders_every_epoch=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f513EYl0bmmL" + }, + "source": [ + "# Callbacks\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pt7iGh4xNs5" + }, + "source": [ + "\n", + "Lightning Callbacks are self-contained programs that can be reused across projects.\n", + "Callbacks should capture NON-ESSENTIAL logic that is NOT required for your LightningModule to run. Lightning includes a few built-in callbacks that can be used with flags like early stopping and Model Checkpointing, but you can also create your own callbacks to add any functionality to your models.\n", + "\n", + "The callback API includes hooks that allow you to add logic at every point of your training:\n", + "setup, teardown, on_epoch_start, on_epoch_end, on_batch_start, on_batch_end, on_init_start, on_keyboard_interrupt etc. \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1t84gvDNsUuh" + }, + "source": [ + "## callbacks\n", + "\n", + "Use **callbacks=** to pass a list of user defined callbacks. These callbacks DO NOT replace the built-in callbacks (loggers or EarlyStopping). \n", + "\n", + "In this example, we create a dummy callback that prints a message when training starts and ends, using on_train_start and on_train_end hooks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oIXZYabub3f0" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import Callback\n", + "\n", + "class PrintCallback(Callback):\n", + " def on_train_start(self, trainer, pl_module):\n", + " print(\"Training is started!\")\n", + " def on_train_end(self, trainer, pl_module):\n", + " print(\"Training is done.\")\n", + "\n", + "# a list of callbacks\n", + "callbacks = [PrintCallback()]\n", + "trainer = pl.Trainer(callbacks=callbacks)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cNF74CLYfJJu" + }, + "source": [ + "# Model checkpointing\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2blgquBrxLtS" + }, + "source": [ + "Checkpoints capture the exact value of all parameters used by a model.\n", + "\n", + "Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.\n", + "\n", + "Lightning automates saving and loading checkpoints so you can restore a training session, saving all the required parameters including: \n", + "* 16-bit scaling factor (apex)\n", + "* Current epoch\n", + "* Global step\n", + "* Model state_dict\n", + "* State of all optimizers\n", + "* State of all learning rate schedulers\n", + "* State of all callbacks\n", + "* The hyperparameters used for that model if passed in as hparams (Argparse.Namespace)\n", + "\n", + "By default Lightning will save a checkpoint in the working directory, which will be updated every epoch.\n", + "\n", + "### Automatic saving\n", + "By default Lightning will save a checkpoint at the end of the first epoch in the working directory, which will be updated every epoch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XGu0JULrg9l7" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(default_root_path=os.getcwd())\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3s9OjkGuhq1W" + }, + "source": [ + "To change the checkpoint path pass in **default_root_dir=**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DgdxkrIQhvfw" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(default_root_dir='/your/path/to/save/checkpoints')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qyvj_bkWrJiE" + }, + "source": [ + "\n", + "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. For example, if we want to save checkpoint based on the validation loss, logged as `val_loss`, you can pass:\n", + "\n", + "\n", + "```\n", + "checkpoint_callback = ModelCheckpoint(\n", + " filepath=os.getcwd(),\n", + " save_top_k=1,\n", + " verbose=True,\n", + " monitor='val_loss',\n", + " mode='min',\n", + " prefix=''\n", + ")\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YzYMivw1rO1O" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import ModelCheckpoint\n", + "\n", + "trainer = pl.Trainer(callbacks=[ModelCheckpoint(monitor='val_loss')])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5hYs_FV8iDMn" + }, + "source": [ + "You can modify the behavior of checkpointing by creating your own callback, and passing it to the trainer. 
\n", + "You can control\n", + "* filepath- where checkpoints are saved\n", + "* save_top_k- save k top models\n", + "* verbose\n", + "* monitor- the metric to monitor\n", + "* mode\n", + "* prefix\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tb1K2VYDiNTu" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import ModelCheckpoint\n", + "\n", + "# DEFAULTS used by the Trainer\n", + "checkpoint_callback = ModelCheckpoint(\n", + " filepath=os.getcwd(),\n", + " save_top_k=3,\n", + " verbose=True,\n", + " monitor='val_loss',\n", + " mode='min',\n", + " prefix='',\n", + ")\n", + "\n", + "trainer = Trainer(callbacks=[checkpoint_callback])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YKhZ6xRojJcl" + }, + "source": [ + "You can disable checkpointing by passing\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Yt8zd2ZFjOXX" + }, + "outputs": [], + "source": [ + "trainer = Trainer(checkpoint_callback=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HcLy8asCjrj9" + }, + "source": [ + "### Manual saving\n", + "\n", + "You can manually save checkpoints and restore your model from the checkpointed state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kZSkMJf0jR4x" + }, + "outputs": [], + "source": [ + "trainer.fit(model)\n", + "trainer.save_checkpoint(\"example.ckpt\")\n", + "new_model = LitAutoEncoder.load_from_checkpoint(checkpoint_path=\"example.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X2d9cjVPj7CP" + }, + "source": [ + "### Checkpoint Loading\n", + "To load a model along with its weights, biases and module_arguments use the following method:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BpAFfg5zkFmH" + }, + "outputs": [], 
+ "source": 
[ + "model = LitAutoEncoder.load_from_checkpoint(PATH)\n", + "\n", + "print(model.learning_rate)\n", + "# prints the learning_rate you used in this checkpoint\n", + "\n", + "model.eval()\n", + "y_hat = model(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTQ3mxSJkhFN" + }, + "source": [ + "But if you don’t want to use the values saved in the checkpoint, pass in your own here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IoMcOh9-kfUP" + }, + "outputs": [], + "source": [ + "class LitAutoEncoder(LightningModule):\n", + "\n", + " def __init__(self, in_dim, out_dim):\n", + " super().__init__()\n", + " self.save_hyperparameters()\n", + " self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ITPVY8mNknut" + }, + "source": [ + "you can restore the model like this\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H7XeRJzVkuY8" + }, + "outputs": [], + "source": [ + "# if you train and save the model like this it will use these values when loading\n", + "# the weights. But you can overwrite this\n", + "LitAutoEncoder(in_dim=32, out_dim=10)\n", + "\n", + "# uses in_dim=32, out_dim=10\n", + "model = LitAutoEncoder.load_from_checkpoint(PATH)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "14WwGpnVk0a4" + }, + "outputs": [], + "source": [ + "# uses in_dim=128, out_dim=10\n", + "model = LitAutoEncoder.load_from_checkpoint(PATH, in_dim=128, out_dim=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bY5s6wP_k1CU" + }, + "source": [ + "\n", + "\n", + "## Restoring Training State (resume_from_checkpoint)\n", + "If your training was cut short for some reason, you can resume exactly from where you left off using the `resume_from_checkpoint` flag, which will automatically restore model, epoch, step, LR schedulers, apex, etc..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9zfhHtyrk3rO" + }, + "outputs": [], + "source": [ + "model = LitAutoEncoder()\n", + "trainer = pl.Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')\n", + "\n", + "# automatically restores model, epoch, step, LR schedulers, apex, etc...\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xkKdvALFsmT2" + }, + "source": [ + "## weights_save_path\n", + "You can specify a directory for saving weights file using `weights_save_path`.\n", + "\n", + "(If you are using a custom checkpoint callback, the checkpoint callback will override this flag)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9OwHHFcCsrgT" + }, + "outputs": [], + "source": [ + "# save to your custom path\n", + "trainer = pl.Trainer(weights_save_path='my/path')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PbNtlJ9Wsscf" + }, + "outputs": [], + "source": [ + "# if checkpoint callback used, then overrides the weights path\n", + "# **NOTE: this saves weights to some/path NOT my/path\n", + "checkpoint = ModelCheckpoint(filepath='some/path')\n", + "trainer = pl.Trainer(\n", + " callbacks=[checkpoint],\n", + " weights_save_path='my/path'\n", + ")\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uDdxCuyHdWQt" + }, + "source": [ + "# Early stopping\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fqAy3ihRxTfR" + }, + "source": [ + "The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed, to help you avoid overfitting.\n", + "\n", + "To enable Early Stopping you can init the EarlyStopping callback, and pass it to `callbacks=` trainer flag. 
The callback will look for a logged metric to early stop on.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lFx976CheH93" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", + "\n", + "trainer = pl.Trainer(callbacks=[EarlyStopping('val_loss')])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MwpJfTvjeOwF" + }, + "source": [ + "You can customize the callback using the following params:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V6I9h6HteK2U" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", + "\n", + "early_stop_callback = EarlyStopping(\n", + " monitor='val_accuracy',\n", + " min_delta=0.00,\n", + " patience=3,\n", + " verbose=False,\n", + " mode='max'\n", + ")\n", + "trainer = pl.Trainer(callbacks=[early_stop_callback])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7TAIerPYe_Q1" + }, + "source": [ + "The EarlyStopping callback runs at the end of every validation epoch, which, under the default configuration, happens after every training epoch. However, the frequency of validation can be modified by setting various parameters on the Trainer, for example check_val_every_n_epoch and val_check_interval. It must be noted that the patience parameter counts the number of validation epochs with no improvement, and not the number of training epochs. Therefore, with parameters check_val_every_n_epoch=10 and patience=3, the trainer will perform at least 40 training epochs before being stopped." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VoKrX2ENh9Fg" + }, + "source": [ + "# Logging" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-CQTPKd7iKLm" + }, + "source": [ + "Lightning has built-in integration with various loggers such as TensorBoard, wandb, Comet, etc.\n", + "\n", + "\n", + "You can pass any metrics you want to log during training to `self.log`, such as loss or accuracy. Similarly, pass in to self.log any metric you want to log during the validation step.\n", + "\n", + "These values will be passed in to the logger of your choice. Simply pass in any supported logger to the `logger` trainer flag.\n", + "\n", + "\n", + "\n", + "Use the `logger=` trainer flag to pass in a Logger, or iterable collection of Loggers, for experiment tracking.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ty5VPS3AiS8L" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.loggers import TensorBoardLogger\n", + "\n", + "# default logger used by trainer\n", + "logger = TensorBoardLogger(\n", + " save_dir=os.getcwd(),\n", + " version=1,\n", + " name='lightning_logs'\n", + ")\n", + "trainer = pl.Trainer(logger=logger)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jc5oWNpoiuuc" + }, + "source": [ + "Lightning supports the use of multiple loggers, just pass a list to the Trainer.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BlYwMRRyivp_" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger\n", + "logger1 = TensorBoardLogger('tb_logs', name='my_model')\n", + "logger2 = TestTubeLogger('tb_logs', name='my_model')\n", + "trainer = pl.Trainer(logger=[logger1, logger2])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a7EyspQPh7iQ" + }, + "source": [ + "## 
flush_logs_every_n_steps\n", + "\n", + "Use this flag to determine when logging to disc should happen." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Em_XvsmyiBbk" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(flush_logs_every_n_steps=100)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_vDeKE98qsl1" + }, + "source": [ + "## log_every_n_steps\n", + "How often to add logging rows (does not write to disk)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HkqD7D_0w1Tt" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(log_every_n_steps=1000)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9uw0gfe422CT" + }, + "source": [ + "# info logging" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dQXpt0aatDGo" + }, + "source": [ + "### default_root_dir\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "Default path for logs and weights when no logger or pytorch_lightning.callbacks.ModelCheckpoint callback passed. On certain clusters you might want to separate where logs and checkpoints are stored. If you don’t then use this argument for convenience. Paths can be local paths or remote paths such as s3://bucket/path or ‘hdfs://path/’. Credentials will need to be set up to use remote filepaths." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CMmID2Bts5W3" + }, + "source": [ + "## weights_summary\n", + "Prints a summary of the weights when training begins. Default is set to `top`- print summary of top level modules.\n", + "\n", + "Options: ‘full’, ‘top’, None." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KTl6EdwDs6j2" + }, + "outputs": [], + "source": [ + "\n", + "# print full summary of all modules and submodules\n", + "trainer = pl.Trainer(weights_summary='full')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R57cSLl9w9ma" + }, + "outputs": [], + "source": [ + "# don't print a summary\n", + "trainer = Trainer(weights_summary=None)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bSc2hU5AotAP" + }, + "source": [ + "# progress bar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GgvbyDsBxcH6" + }, + "source": [ + "## process_position\n", + "\n", + "Orders the progress bar. Useful when running multiple trainers on the same node.\n", + "\n", + "(This argument is ignored if a custom callback is passed to callbacks)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6ekz8Es8owDn" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(process_position=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "itivQFgEphBU" + }, + "source": [ + "## progress_bar_refresh_rate\n", + "\n", + "How often to refresh the progress bar (in steps). In notebooks, faster refresh rates (lower numbers) are known to crash them because of their screen refresh rates, so raise it to 50 or more." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GKe6eVxmplL5" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(progress_bar_refresh_rate=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8rDHJOJbxNtf" + }, + "outputs": [], + "source": [ + "# disable progress bar\n", + "trainer = Trainer(progress_bar_refresh_rate=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NCNvYLwjpWne" + }, + "source": [ + "# profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pRknrG_zpY6M" + }, + "outputs": [], + "source": [ + "# to profile standard training events\n", + "trainer = pl.Trainer(profiler=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ji6aWpU73kMM" + }, + "source": [ + "You can also use Lightning AdvancedProfiler if you want more detailed information about time spent in each function call recorded during a given action. The output is quite verbose and you should only use this if you want very detailed reports.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "layG55pt316C" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.profiler import AdvancedProfiler\n", + "\n", + "trainer = Trainer(profiler=AdvancedProfiler())\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". 
\n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "05-trainer-flags-overview.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md index 4dcf06a74bf92..18ae204396290 100644 --- a/pl_examples/basic_examples/README.md +++ b/pl_examples/basic_examples/README.md @@ -14,7 +14,15 @@ python mnist.py python mnist.py --gpus 2 --distributed_backend 'dp' ``` ---- +--- +#### MNIST with DALI +The MNIST example above using [NVIDIA DALI](https://developer.nvidia.com/DALI). +Requires NVIDIA DALI to be installed based on your CUDA version, see [here](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html). 
+```bash +python mnist_dali.py +``` + +--- #### Image classifier Generic image classifier with an arbitrary backbone (ie: a simple system) ```bash diff --git a/pl_examples/basic_examples/mnist_dali.py b/pl_examples/basic_examples/mnist_dali.py new file mode 100644 index 0000000000000..649198053a01b --- /dev/null +++ b/pl_examples/basic_examples/mnist_dali.py @@ -0,0 +1,204 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC +from argparse import ArgumentParser +from random import shuffle +from warnings import warn + +import numpy as np +import torch +from torch.nn import functional as F +from torch.utils.data import random_split + +import pytorch_lightning as pl + +try: + from torchvision.datasets.mnist import MNIST + from torchvision import transforms +except Exception: + from tests.base.datasets import MNIST + +try: + import nvidia.dali.ops as ops + import nvidia.dali.types as types + from nvidia.dali.pipeline import Pipeline + from nvidia.dali.plugin.pytorch import DALIClassificationIterator +except (ImportError, ModuleNotFoundError): + warn('NVIDIA DALI is not available') + ops, types, Pipeline, DALIClassificationIterator = ..., ..., ABC, ABC + + +class ExternalMNISTInputIterator(object): + """ + This iterator class wraps torchvision's MNIST dataset and returns the images and labels in batches + """ + + def __init__(self, mnist_ds, batch_size): + self.batch_size = batch_size + self.mnist_ds = mnist_ds + 
self.indices = list(range(len(self.mnist_ds))) + shuffle(self.indices) + + def __iter__(self): + self.i = 0 + self.n = len(self.mnist_ds) + return self + + def __next__(self): + batch = [] + labels = [] + for _ in range(self.batch_size): + index = self.indices[self.i] + img, label = self.mnist_ds[index] + batch.append(img.numpy()) + labels.append(np.array([label], dtype=np.uint8)) + self.i = (self.i + 1) % self.n + return (batch, labels) + + +class ExternalSourcePipeline(Pipeline): + """ + This DALI pipeline class just contains the MNIST iterator + """ + + def __init__(self, batch_size, eii, num_threads, device_id): + super(ExternalSourcePipeline, self).__init__(batch_size, num_threads, device_id, seed=12) + self.source = ops.ExternalSource(source=eii, num_outputs=2) + self.build() + + def define_graph(self): + images, labels = self.source() + return images, labels + + +class DALIClassificationLoader(DALIClassificationIterator): + """ + This class extends DALI's original DALIClassificationIterator with the __len__() function so that we can call len() on it + """ + + def __init__( + self, + pipelines, + size=-1, + reader_name=None, + auto_reset=False, + fill_last_batch=True, + dynamic_shape=False, + last_batch_padded=False, + ): + super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, dynamic_shape, last_batch_padded) + + def __len__(self): + batch_count = self._size // (self._num_gpus * self.batch_size) + last_batch = 1 if self._fill_last_batch else 0 + return batch_count + last_batch + + +class LitClassifier(pl.LightningModule): + def __init__(self, hidden_dim=128, learning_rate=1e-3): + super().__init__() + self.save_hyperparameters() + + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + def split_batch(self, batch): + return batch[0]["data"], 
batch[0]["label"].squeeze().long() + + def training_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + self.log('valid_loss', loss) + + def test_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + self.log('test_loss', loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--hidden_dim', type=int, default=128) + parser.add_argument('--learning_rate', type=float, default=0.0001) + return parser + + +def cli_main(): + pl.seed_everything(1234) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--batch_size', default=32, type=int) + parser = pl.Trainer.add_argparse_args(parser) + parser = LitClassifier.add_model_specific_args(parser) + args = parser.parse_args() + + # ------------ + # data + # ------------ + dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor()) + mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor()) + mnist_train, mnist_val = random_split(dataset, [55000, 5000]) + + eii_train = ExternalMNISTInputIterator(mnist_train, args.batch_size) + eii_val = ExternalMNISTInputIterator(mnist_val, args.batch_size) + eii_test = ExternalMNISTInputIterator(mnist_test, args.batch_size) + + pipe_train = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_train, num_threads=2, device_id=0) + train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=False) + + pipe_val = ExternalSourcePipeline(batch_size=args.batch_size, 
eii=eii_val, num_threads=2, device_id=0) + val_loader = DALIClassificationLoader(pipe_val, size=len(mnist_val), auto_reset=True, fill_last_batch=False) + + pipe_test = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_test, num_threads=2, device_id=0) + test_loader = DALIClassificationLoader(pipe_test, size=len(mnist_test), auto_reset=True, fill_last_batch=False) + + # ------------ + # model + # ------------ + model = LitClassifier(args.hidden_dim, args.learning_rate) + + # ------------ + # training + # ------------ + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, train_loader, val_loader) + + # ------------ + # testing + # ------------ + trainer.test(test_dataloaders=test_loader) + + +if __name__ == "__main__": + cli_main() diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 27ecf774623af..dbea2013d1110 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -105,7 +105,14 @@ def configure_optimizers(self): return [optimizer], [lr_scheduler] +# NOTE: If you are using a cmd line to run your script, +# provide the cmd line as below. 
+# opt = "--max_epochs 1 --limit_train_batches 1".split(" ") +# parser = ArgumentParser() +# args = parser.parse_args(opt) + def run_test(): + class TestModel(BoringModel): def on_train_epoch_start(self) -> None: diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 7fe5d4ed604dc..60f10a637e583 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -1,6 +1,15 @@ +import platform from unittest import mock -import torch + import pytest +import torch + +try: + from nvidia.dali import ops, types, pipeline, plugin +except (ImportError, ModuleNotFoundError): + DALI_AVAILABLE = False +else: + DALI_AVAILABLE = True dp_16_args = """ --max_epochs 1 \ @@ -28,7 +37,7 @@ --precision 16 \ """ - +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_mnist(cli_args): @@ -38,6 +47,7 @@ # cli_main() +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_image_classifier(cli_args): @@ -45,8 +55,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_autoencoder(cli_args): @@ -56,6 +67,7 @@ # cli_main() +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [ddp_args]) # def test_examples_ddp_mnist(cli_args): @@ -63,8 +75,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', 
[ddp_args]) # def test_examples_ddp_image_classifier(cli_args): @@ -72,8 +85,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [ddp_args]) # def test_examples_ddp_autoencoder(cli_args): @@ -92,3 +106,14 @@ def test_examples_cpu(cli_args): for cli_cmd in [mnist_cli, ic_cli, ae_cli]: with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): cli_cmd() + + +@pytest.mark.skipif(not DALI_AVAILABLE, reason="Nvidia DALI required") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(platform.system() != 'Linux', reason='Only applies to Linux platform.') +@pytest.mark.parametrize('cli_args', [cpu_args]) +def test_examples_mnist_dali(cli_args): + from pl_examples.basic_examples.mnist_dali import cli_main + + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): + cli_main() diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 9c476680ceba9..d28c67030ad30 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.0.4rc1' +__version__ = '1.0.4' __author__ = 'William Falcon et al.' 
__author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 65f3161675e4f..ec6e90caa6452 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -20,8 +20,6 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator -from pytorch_lightning.accelerators.ddp_slurm_accelerator import DDPSLURMAccelerator -from pytorch_lightning.accelerators.ddp_torchelastic_accelerator import DDPTorchElasticAccelerator -from pytorch_lightning.accelerators.ddp_cpu_torchelastic_accelerator import DDPCPUTorchElasticAccelerator -from pytorch_lightning.accelerators.ddp_cpu_slurm_accelerator import DDPCPUSLURMAccelerator +from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator +from pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator from pytorch_lightning.accelerators.accelerator import Accelerator diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8e1969cc9368e..3b762e08ed5e6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -14,7 +14,7 @@ import os import math from enum import Enum -from typing import Any, Optional +from typing import Any, Optional, Union import torch @@ -30,6 +30,12 @@ except ImportError: amp = None +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + class ReduceOp: + SUM = None + EPSILON = 1e-6 EPSILON_FP16 = 1e-5 @@ -103,27 +109,29 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): model_ref = self.trainer.get_model() 
is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + using_native_amp = self.trainer.amp_backend == AMPType.NATIVE + automatic_optimization = self.trainer.train_loop.automatic_optimization # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: + if using_native_amp and is_lbfgs: raise MisconfigurationException( 'native PyTorch amp and lbfgs are not compatible.' ' To request, please file a Github issue in PyTorch and tag @mcarilli') # model hook model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, - optimizer, - opt_idx, - lambda_closure, - using_native_amp=native_amp, + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True + using_native_amp=using_native_amp, using_lbfgs=is_lbfgs ) # scale when native amp - if native_amp: + if automatic_optimization and using_native_amp: self.trainer.scaler.update() def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): @@ -131,11 +139,6 @@ def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): - - if self.trainer.amp_backend == AMPType.NATIVE: - self.trainer.scaler.unscale_(optimizer) - - # apply clip gradients # TODO: separate TPU case from here self._clip_gradients(optimizer, clip_val) @@ -213,6 +216,24 @@ def init_ddp_connection( torch_backend, rank=global_rank, world_size=world_size ) + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. 
Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + raise NotImplementedError() + def __getstate__(self): return { 'trainer': self.trainer, diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index df70d90755632..f8d90945e9e77 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -220,28 +220,28 @@ def select_accelerator(self): ) elif use_ddp_cpu_slurm: - accelerator_backend = accelerators.DDPCPUSLURMAccelerator( + accelerator_backend = accelerators.DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_slurm_ddp: - accelerator_backend = accelerators.DDPSLURMAccelerator( + accelerator_backend = accelerators.DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_ddp_cpu_torch_elastic: - accelerator_backend = accelerators.DDPCPUTorchElasticAccelerator( + accelerator_backend = accelerators.DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_torchelastic_ddp: - accelerator_backend = accelerators.DDPTorchElasticAccelerator( + accelerator_backend = accelerators.DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index 3ce6f7315feb8..083b5193ff8f3 100644 --- a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -21,6 +21,15 @@ class CPUAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None): + """ + Runs training on CPU + + Example:: + + # default + trainer = Trainer(accelerator=CPUAccelerator()) + + """ 
super().__init__(trainer, cluster_environment) self.nickname = None diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 452c15ba3bc38..2da9747a9be92 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -11,22 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License - import os import torch import torch.distributed as torch_distrib -from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import Result from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available from torch.nn.parallel import DistributedDataParallel -from typing import List, Optional +from typing import List, Optional, Union, Any try: from hydra.utils import to_absolute_path, get_original_cwd @@ -40,25 +38,23 @@ class DDP2Accelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP2 strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDP2Accelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self.dist = LightningDistributed() self.nickname = 'ddp2' def setup(self, model): - self._resolve_task_idx() self.trainer.model = model - - def _resolve_task_idx(self): - if 
self.trainer.is_slurm_managing_tasks: - self.task_idx = int(os.environ['SLURM_LOCALID']) - else: - # torchelastic or general non_slurm ddp2 - try: - self.task_idx = int(os.environ['LOCAL_RANK']) - except Exception as exp: - m = 'ddp2 only works in SLURM or via torchelastic with the WORLD_SIZE, LOCAL_RANK, GROUP_RANK flags' - raise MisconfigurationException(m) from exp + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -214,3 +210,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index b9f01b5ddc167..f99cd1149e5ae 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -18,17 +18,18 @@ import sys from os.path import abspath from time import sleep -from typing import Optional, List +from typing import Any, Optional, List, Union import numpy as np from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import find_free_network_port from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from 
pytorch_lightning.utilities.seed import seed_everything from torch.nn.parallel import DistributedDataParallel @@ -46,6 +47,15 @@ class DDPAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP strategy on a single machine (manually, not via cluster start) + + Example:: + + # default + trainer = Trainer(accelerator=DDPAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self._has_spawned_children = False @@ -298,3 +308,12 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + + """ + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py new file mode 100644 index 0000000000000..7b43dc9f6b68a --- /dev/null +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -0,0 +1,46 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator + + +try: + from hydra.utils import to_absolute_path, get_original_cwd + from hydra.core.hydra_config import HydraConfig +except ImportError: + HYDRA_AVAILABLE = False +else: + HYDRA_AVAILABLE = True + + +class DDPCPUHPCAccelerator(DDPHPCAccelerator): + + def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP (with CPUs) strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.nickname = 'ddp_cpu' + + def model_to_device(self, model, process_idx): + model.cpu() + + def get_device_ids(self): + device_ids = None + return device_ids diff --git a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py deleted file mode 100644 index 2aad005a07847..0000000000000 --- a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import List, Optional - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.distributed.dist import LightningDistributed - - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - - -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE TE FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! 
-# ------------------------------------------- -class DDPCPUSLURMAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() - self.nickname = 'ddp_cpu' - - def setup(self, model): - self.trainer.model = model - self.task_idx = int(os.environ['SLURM_LOCALID']) - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - # 
determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, 
device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index f1813361c5eec..221ed5769c35e 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -21,11 +21,11 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn -from pytorch_lightning.utilities.distributed import find_free_network_port +from pytorch_lightning.utilities.distributed import find_free_network_port, sync_ddp_if_available from pytorch_lightning.distributed.dist import LightningDistributed try: @@ -40,6 +40,15 @@ 
class DDPCPUSpawnAccelerator(Accelerator): def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.mp_queue = None self.nprocs = nprocs @@ -229,3 +238,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py deleted file mode 100644 index 6b27e7da330ea..0000000000000 --- a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import List, Optional - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - - -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE TE FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! 
-# ------------------------------------------- -class DDPCPUTorchElasticAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() - self.nickname = 'ddp_cpu' - - def setup(self, model): - self.trainer.model = model - self.task_idx = int(os.environ['LOCAL_RANK']) - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - # 
determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, 
device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model diff --git a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py similarity index 89% rename from pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py rename to pytorch_lightning/accelerators/ddp_hpc_accelerator.py index 8a9e6ac77e574..b6d813f978943 100644 --- a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -20,11 +20,11 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import 
rank_zero_only, sync_ddp_if_available try: @@ -36,14 +36,18 @@ HYDRA_AVAILABLE = True -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE SLURM FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# ------------------------------------------- -class DDPTorchElasticAccelerator(Accelerator): +class DDPHPCAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP on an HPC cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPHPCAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self._has_spawned_children = False @@ -52,7 +56,7 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): def setup(self, model): self.trainer.model = model - self.task_idx = int(os.environ['LOCAL_RANK']) + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -119,7 +123,7 @@ def ddp_train(self, process_idx, model): self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank @@ -141,7 +145,7 @@ def ddp_train(self, process_idx, model): # on world_size=0 let everyone know training is starting if self.trainer.is_global_zero and not torch.distributed.is_initialized(): log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (on SLURM)') + log.info(f'distributed_backend={self.trainer.distributed_backend}') log.info(f'All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes') log.info('-' * 100) @@ -201,3 +205,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py deleted file mode 100644 index 8a6326d3d5cb8..0000000000000 --- a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import List - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.utilities.seed import seed_everything - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - - -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE SLURM FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! 
-# ------------------------------------------- -class DDPSLURMAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() - self.nickname = 'ddp' - - def setup(self, model): - self.trainer.model = model - self.task_idx = int(os.environ['SLURM_LOCALID']) - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - self.trainer.root_gpu = process_idx - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - def get_device_ids(self): - device_ids = [self.trainer.root_gpu] - return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: str = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - 
process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (on SLURM)') - log.info(f'All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index b204494773362..a30d266ec1b2f 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -13,7 +13,7 @@ # limitations under the License import os import re -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.multiprocessing as mp @@ -22,11 +22,12 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, find_free_network_port +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning.distributed.dist import LightningDistributed @@ -42,6 +43,15 @@ class DDPSpawnAccelerator(Accelerator): def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP using mp.spawn via manual launch (not cluster launch) + + Example:: + + # default + trainer = Trainer(accelerator=DDPSpawnAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.mp_queue = None self.nprocs = nprocs @@ -254,3 +264,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = 
torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 0a6eac607d79c..2f6c5dce97c46 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -26,6 +26,15 @@ class DataParallelAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None): + """ + Runs training using DP via manual start (not HPC cluster) + + Example:: + + # default + trainer = Trainer(accelerator=DataParallelAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.model_autocast_original_forward = None self.dist = LightningDistributed() diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index e5611767547a1..e66f5bcb8b48c 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -23,6 +23,15 @@ class GPUAccelerator(Accelerator): amp_backend: AMPType def __init__(self, trainer, cluster_environment=None): + """ + Runs training using a single GPU + + Example:: + + # default + trainer = Trainer(accelerator=GPUAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.dist = LightningDistributed() self.nickname = None diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 91a5400999f6e..3d9191914566d 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the 
License. from contextlib import ExitStack -from typing import Optional +from typing import Any, Optional, Union import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only @@ -33,6 +33,15 @@ class HorovodAccelerator(Accelerator): amp_backend: AMPType def __init__(self, trainer, cluster_environment=None): + """ + Runs training using horovod + + Example:: + + # default + trainer = Trainer(accelerator=HorovodAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.nickname = 'horovod' @@ -161,3 +170,41 @@ def barrier(self, name: Optional[str] = None): def broadcast(self, obj, src=0): obj = hvd.broadcast_object(obj, src) return obj + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " + "Unset `group`." + ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " + "Unset `group`." 
+ ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(tensor, op=reduce_op) diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 1988f83601b8c..5f4e6cc22cacd 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -39,6 +39,15 @@ class TPUAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None): + """ + Runs training using TPUs (colab, single machine or pod) + + Example:: + + # default + trainer = Trainer(accelerator=TPUAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.start_method = None self.mp_queue = None @@ -242,11 +251,13 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): # model hook model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, optimizer, - opt_idx, - lambda_closure, + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, on_tpu=True, + using_native_amp=False, using_lbfgs=is_lbfgs ) @@ -268,8 +279,6 @@ def early_stopping_should_stop(self, pl_module): def save_spawn_weights(self, model): """ Dump a temporary checkpoint after ddp ends to get weights out of the process - :param model: - :return: """ if self.trainer.is_global_zero: path = os.path.join(self.trainer.default_root_dir, '__temp_weight_distributed_end.ckpt') @@ -280,8 +289,6 @@ def load_spawn_weights(self, original_model): """ Load the temp weights saved in the process To recover the trained model from the ddp process we load the saved weights - :param model: - :return: """ loaded_model = original_model diff --git 
a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index 591703c245543..004aa6d737b4a 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -166,3 +166,15 @@ def on_save_checkpoint(self, trainer, pl_module): def on_load_checkpoint(self, checkpointed_state): """Called when loading a model checkpoint, use to reload state.""" pass + + def on_after_backward(self, trainer, pl_module): + """ + Called after loss.backward() and before optimizers do anything. + """ + pass + + def on_before_zero_grad(self, trainer, pl_module, optimizer): + """ + Called after optimizer.step() and before optimizer.zero_grad(). + """ + pass diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index b6de6107b0bc9..7502829044200 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -36,6 +36,8 @@ class LearningRateMonitor(Callback): logging_interval: set to `epoch` or `step` to log `lr` of all optimizers at the same interval, set to `None` to log at individual interval according to the `interval` key of each scheduler. Defaults to ``None``. + log_momentum: option to also log the momentum values of the optimizer, if the optimizer + has the `momentum` attribute. Defaults to ``False``. Example:: @@ -59,13 +61,14 @@ def configure_optimizer(self): return [optimizer], [lr_scheduler] """ - def __init__(self, logging_interval: Optional[str] = None): + def __init__(self, logging_interval: Optional[str] = None, log_momentum: bool = False): if logging_interval not in (None, 'step', 'epoch'): raise MisconfigurationException( 'logging_interval should be `step` or `epoch` or `None`.' 
) self.logging_interval = logging_interval + self.log_momentum = log_momentum self.lrs = None self.lr_sch_names = [] @@ -92,6 +95,7 @@ def on_train_start(self, trainer, *args, **kwargs): # Initialize for storing values self.lrs = {name: [] for name in names} + self.last_momentum_values = {name + "-momentum": None for name in names} def on_train_batch_start(self, trainer, *args, **kwargs): if not self._should_log(trainer): @@ -99,7 +103,7 @@ def on_train_batch_start(self, trainer, *args, **kwargs): if self.logging_interval != 'epoch': interval = 'step' if self.logging_interval is None else 'any' - latest_stat = self._extract_lr(trainer, interval) + latest_stat = self._extract_stats(trainer, interval) if trainer.logger is not None and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) @@ -107,12 +111,12 @@ def on_train_batch_start(self, trainer, *args, **kwargs): def on_train_epoch_start(self, trainer, *args, **kwargs): if self.logging_interval != 'step': interval = 'epoch' if self.logging_interval is None else 'any' - latest_stat = self._extract_lr(trainer, interval) + latest_stat = self._extract_stats(trainer, interval) if trainer.logger is not None and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.current_epoch) - def _extract_lr(self, trainer, interval: str) -> Dict[str, float]: + def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: latest_stat = {} for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): @@ -120,15 +124,33 @@ def _extract_lr(self, trainer, interval: str) -> Dict[str, float]: param_groups = scheduler['scheduler'].optimizer.param_groups if len(param_groups) != 1: for i, pg in enumerate(param_groups): - lr, key = pg['lr'], f'{name}/pg{i + 1}' - self.lrs[key].append(lr) - latest_stat[key] = lr + lr = self._extract_lr(param_group=pg, name=f'{name}/pg{i + 1}') + latest_stat.update(lr) + momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum/pg{i + 1}') + 
latest_stat.update(momentum) + else: - self.lrs[name].append(param_groups[0]['lr']) - latest_stat[name] = param_groups[0]['lr'] + pg = param_groups[0] + lr = self._extract_lr(param_group=pg, name=name) + latest_stat.update(lr) + momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum') + latest_stat.update(momentum) return latest_stat + def _extract_lr(self, param_group, name: str) -> Dict[str, float]: + lr = param_group.get('lr') + self.lrs[name].append(lr) + return {name: lr} + + def _extract_momentum(self, param_group, name: str) -> Dict[str, float]: + if not self.log_momentum: + return {} + + momentum = param_group.get('momentum') + self.last_momentum_values[name] = momentum + return {name: momentum} + def _find_names(self, lr_schedulers) -> List[str]: # Create uniqe names in the case we have multiple of the same learning # rate schduler + multiple parameter groups diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 6c6a1741c31c5..d257e1ea7cc0d 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -78,7 +78,7 @@ class ModelCheckpoint(Callback): dirpath: directory to save the model file. - Example: + Example:: # custom path # saves a file like: my/path/epoch=0.ckpt @@ -92,7 +92,7 @@ class ModelCheckpoint(Callback): filename: checkpoint filename. Can contain named formatting options to be auto-filled. - Example: + Example:: # save any arbitrary metrics like `val_loss`, etc. in name # saves a file like: my/path/epoch=2-val_loss=0.02-other_metric=0.03.ckpt @@ -101,17 +101,17 @@ class ModelCheckpoint(Callback): ... filename='{epoch}-{val_loss:.2f}-{other_metric:.2f}' ... ) - By default, filename is ``None`` and will be set to ``'{epoch}'``. + By default, filename is ``None`` and will be set to ``'{epoch}-{step}'``. 
- Example: + Example:: >>> from pytorch_lightning import Trainer >>> from pytorch_lightning.callbacks import ModelCheckpoint # saves checkpoints to 'my/path/' at every epoch >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - >>> trainer = Trainer(checkpoint_callback=checkpoint_callback) + >>> trainer = Trainer(callbacks=[checkpoint_callback]) # save epoch and val_loss in name # saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt @@ -123,7 +123,7 @@ class ModelCheckpoint(Callback): # retrieve the best checkpoint after training checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - trainer = Trainer(checkpoint_callback=checkpoint_callback) + trainer = Trainer(callbacks=[checkpoint_callback]) model = ... trainer.fit(model) checkpoint_callback.best_model_path @@ -131,8 +131,6 @@ class ModelCheckpoint(Callback): CHECKPOINT_JOIN_CHAR = "-" CHECKPOINT_NAME_LAST = "last" - CHECKPOINT_STATE_BEST_SCORE = "checkpoint_callback_best_model_score" - CHECKPOINT_STATE_BEST_PATH = "checkpoint_callback_best_model_path" def __init__( self, @@ -187,6 +185,7 @@ def on_validation_end(self, trainer, pl_module): def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]: return { + "monitor": self.monitor, "best_model_score": self.best_model_score, "best_model_path": self.best_model_path, } @@ -223,16 +222,16 @@ def save_checkpoint(self, trainer, pl_module): monitor_candidates = self._monitor_candidates(trainer) # ie: path/val_loss=0.5.ckpt - filepath = self._get_metric_interpolated_filepath_name(epoch, monitor_candidates) + filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step) # callback supports multiple simultaneous modes # here we call each mode sequentially # Mode 1: save all checkpoints OR only the top k if self.save_top_k: - self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, epoch, filepath) + self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath) # Mode 2: 
save the last checkpoint - self._save_last_checkpoint(trainer, pl_module, epoch, monitor_candidates, filepath) + self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath) def __validate_init_configuration(self): if self.save_top_k is not None and self.save_top_k < -1: @@ -247,9 +246,9 @@ def __validate_init_configuration(self): ' configuration. No quantity for top_k to track.' ) if self.save_last: - raise MisconfigurationException( - 'ModelCheckpoint(save_last=True, monitor=None) is not a valid configuration.' - ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None)' + rank_zero_warn( + 'ModelCheckpoint(save_last=True, monitor=None) is a redundant configuration.' + ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None).' ) def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): @@ -292,7 +291,7 @@ def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): if dirpath and self._fs.protocol == 'file': dirpath = os.path.realpath(dirpath) - self.dirpath = dirpath or None + self.dirpath: Union[str, None] = dirpath or None self.filename = filename or None def __init_monitor_mode(self, monitor, mode): @@ -361,16 +360,18 @@ def _format_checkpoint_name( cls, filename: Optional[str], epoch: int, + step: int, metrics: Dict[str, Any], prefix: str = "", ) -> str: if not filename: # filename is not set, use default name - filename = "{epoch}" + filename = "{epoch}" + cls.CHECKPOINT_JOIN_CHAR + "{step}" + # check and parse user passed keys in the string groups = re.findall(r"(\{.*?)[:\}]", filename) if len(groups) >= 0: - metrics["epoch"] = epoch + metrics.update({"epoch": epoch, 'step': step}) for group in groups: name = group[1:] filename = filename.replace(group, name + "={" + name) @@ -380,7 +381,7 @@ def _format_checkpoint_name( return cls.CHECKPOINT_JOIN_CHAR.join([txt for txt in (prefix, filename) if txt]) def format_checkpoint_name( - self, epoch: int, metrics: Dict[str, 
Any], ver: Optional[int] = None + self, epoch: int, step: int, metrics: Dict[str, Any], ver: Optional[int] = None ) -> str: """Generate a filename according to the defined template. @@ -388,24 +389,24 @@ def format_checkpoint_name( >>> tmpdir = os.path.dirname(__file__) >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(0, 1, metrics={})) 'epoch=0.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch:03d}') - >>> os.path.basename(ckpt.format_checkpoint_name(5, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(5, 2, metrics={})) 'epoch=005.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}-{val_loss:.2f}') - >>> os.path.basename(ckpt.format_checkpoint_name(2, dict(val_loss=0.123456))) + >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456))) 'epoch=2-val_loss=0.12.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{missing:d}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(0, 4, metrics={})) 'missing=0.ckpt' - >>> ckpt = ModelCheckpoint(filename='{epoch}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) - 'epoch=0.ckpt' + >>> ckpt = ModelCheckpoint(filename='{step}') + >>> os.path.basename(ckpt.format_checkpoint_name(0, 0, {})) + 'step=0.ckpt' """ filename = self._format_checkpoint_name( - self.filename, epoch, metrics, prefix=self.prefix + self.filename, epoch, step, metrics, prefix=self.prefix ) if ver is not None: filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}")) @@ -480,13 +481,11 @@ def _validate_monitor_key(self, trainer): ) raise MisconfigurationException(m) - def _get_metric_interpolated_filepath_name(self, epoch, ckpt_name_metrics): - filepath = self.format_checkpoint_name(epoch, ckpt_name_metrics) + def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: 
Dict[str, Any], epoch: int, step: int): + filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics) version_cnt = 0 while self._fs.exists(filepath): - filepath = self.format_checkpoint_name( - epoch, ckpt_name_metrics, ver=version_cnt - ) + filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt) # this epoch called before version_cnt += 1 return filepath @@ -495,9 +494,10 @@ def _monitor_candidates(self, trainer): ckpt_name_metrics = deepcopy(trainer.logger_connector.logged_metrics) ckpt_name_metrics.update(trainer.logger_connector.callback_metrics) ckpt_name_metrics.update(trainer.logger_connector.progress_bar_metrics) + ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch}) return ckpt_name_metrics - def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, filepath): + def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath): should_save_last = self.monitor is None or self.save_last if not should_save_last: return @@ -507,7 +507,11 @@ def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, fi # when user ALSO asked for the 'last.ckpt' change the name if self.save_last: last_filepath = self._format_checkpoint_name( - self.CHECKPOINT_NAME_LAST, epoch, ckpt_name_metrics, prefix=self.prefix + self.CHECKPOINT_NAME_LAST, + trainer.current_epoch, + trainer.global_step, + ckpt_name_metrics, + prefix=self.prefix ) last_filepath = os.path.join(self.dirpath, f"{last_filepath}.ckpt") @@ -524,17 +528,19 @@ def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, fi if self.monitor is None: self.best_model_path = self.last_model_path - def _save_top_k_checkpoints(self, metrics, trainer, pl_module, epoch, filepath): + def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): current = metrics.get(self.monitor) + epoch = metrics.get("epoch") + step = metrics.get("step") if not isinstance(current, 
torch.Tensor) and current is not None: current = torch.tensor(current, device=pl_module.device) if self.check_monitor_top_k(current): - self._update_best_and_save(filepath, current, epoch, trainer, pl_module) + self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module) elif self.verbose: rank_zero_info( - f"Epoch {epoch:d}: {self.monitor} was not in top {self.save_top_k}" + f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}" ) def _is_valid_monitor_key(self, metrics): @@ -545,11 +551,11 @@ def _update_best_and_save( filepath: str, current: torch.Tensor, epoch: int, + step: int, trainer, pl_module, ): - - k = epoch + 1 if self.save_top_k == -1 else self.save_top_k + k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k del_list = [] if len(self.best_k_models) == k and k > 0: @@ -576,9 +582,8 @@ def _update_best_and_save( if self.verbose: rank_zero_info( - f"Epoch {epoch:d}: {self.monitor} reached" - f" {current:0.5f} (best {self.best_model_score:0.5f})," - f" saving model to {filepath} as top {k}" + f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}" + f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}' ) self._save_model(filepath, trainer, pl_module) diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index ff3436e66204c..08fbbf4095ca3 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ class ClusterEnvironment: def __init__(self): @@ -25,3 +26,6 @@ def master_port(self): def world_size(self): return self._world_size + + def local_rank(self): + pass diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 44cdc2207899c..6df1cf680c57f 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -67,6 +67,9 @@ def master_port(self): def world_size(self): return self._world_size + def local_rank(self): + return int(os.environ['SLURM_LOCALID']) + def _resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index d50a10a782dbb..a4d769518d252 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -46,3 +46,6 @@ def master_port(self): def world_size(self): return os.environ.get('WORLD_SIZE') + + def local_rank(self): + return int(os.environ['LOCAL_RANK']) diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index 2cdeaf4e59010..4ba1acf5689a7 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -46,11 +46,11 @@ def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]: continue param_norm = float(p.grad.data.norm(norm_type)) - norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3) + norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 4) all_norms.append(param_norm) total_norm = float(torch.tensor(all_norms).norm(norm_type)) - norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3) + norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 4) return norms diff --git 
a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index c185180991648..7500d1a11d440 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -132,7 +132,7 @@ def on_train_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloade Called in the training loop after the batch. Args: - outputs: The outputs of validation_step_end(validation_step(x)) + outputs: The outputs of training_step_end(training_step(x)) batch: The batched data as it is returned by the training DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader @@ -156,7 +156,7 @@ def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: Called in the validation loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -168,7 +168,7 @@ def on_validation_batch_end(self, outputs: Any, batch: Any, batch_idx: int, data Args: outputs: The outputs of validation_step_end(validation_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -179,7 +179,7 @@ def on_test_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) - Called in the test loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -191,7 +191,7 @@ def on_test_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloader Args: outputs: The outputs of test_step_end(test_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 065b29c75da37..a332c0dcaa99a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -11,16 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import os +import tempfile import collections import copy import inspect -import os import re -import tempfile from abc import ABC from argparse import Namespace -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, Mapping import torch from pytorch_lightning import _logger as log @@ -28,16 +27,17 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO +from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities import rank_zero_warn, AMPType from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities.parsing import ( AttributeDict, collect_init_args, 
get_init_args, ) +from pytorch_lightning.callbacks import Callback from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer @@ -111,6 +111,9 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' + self._running_manual_backward = False + self._current_hook_fx_name = None + self._current_dataloader_idx = None def optimizers(self): opts = self.trainer.optimizers @@ -244,6 +247,20 @@ def log( on_step = self.__auto_choose_log_on_step(on_step) on_epoch = self.__auto_choose_log_on_epoch(on_epoch) + if self._current_hook_fx_name is not None: + self.trainer.logger_connector.check_logging_in_callbacks( + self._current_hook_fx_name, + on_step=on_step, + on_epoch=on_epoch + ) + + # make sure user doesn't introduce logic for multi-dataloaders + if "/dataloader_idx_" in name: + raise MisconfigurationException( + f"Logged key: {name} should not contain information about dataloader_idx.") + + accelerator = self.trainer.accelerator_backend + self._results.log( name, value, @@ -257,7 +274,9 @@ def log( enable_graph, sync_dist, sync_dist_op, - sync_dist_group + sync_dist_group, + accelerator.sync_tensor, + self._current_dataloader_idx, ) def log_dict( @@ -1067,6 +1086,9 @@ def manual_backward(self, loss: Tensor, optimizer: Optimizer, *args, **kwargs) - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set + and you use `model.manual_optimizer_step(optimizer)` + Example:: def training_step(...): @@ -1074,12 +1096,55 @@ def training_step(...): loss = ... # automatically applies scaling, etc... 
self.manual_backward(loss, opt_a) + self.manual_optimizer_step(opt_a) """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') # backward + self._running_manual_backward = True self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) + self._running_manual_backward = False + + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: + """ + Call this directly from your training_step when doing optimizations manually. + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. + + Args: + optimizer: Optimizer used to perform `.step()` call + + force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers + and one should use accumulated gradients but not the other one. + One could put its own logic to force an optimizer step. + + Example:: + + def training_step(...): + (opt_a, opt_b) = self.optimizers() + loss = ... + # automatically applies scaling, etc... + self.manual_backward(loss, opt_a) + # This will force an opt.step() even if accumulate_grad_batches is set. 
+ self.manual_optimizer_step(opt_a, force_optimizer_step=True) + + """ + # make sure we're using manual opt + self._verify_is_manual_optimization('manual_optimizer_step') + + if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: + + # mock closure function as the user is responsible to call `manual_backward` + def mock_optimizer_closure(): + return + + self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) + + # update will be called after every optimizer_step call + if self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ @@ -1100,8 +1165,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - loss.backward(*args, **kwargs) - self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) + if self.trainer.train_loop.automatic_optimization or self._running_manual_backward: + loss.backward(*args, **kwargs) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ @@ -1129,10 +1194,10 @@ def optimizer_step( batch_idx: int, optimizer: Optimizer, optimizer_idx: int, - optimizer_closure: Optional[Callable] = None, - on_tpu: bool = False, - using_native_amp: bool = False, - using_lbfgs: bool = False, + optimizer_closure: Optional[Callable], + on_tpu: bool, + using_native_amp: bool, + using_lbfgs: bool, ) -> None: r""" Override this method to adjust the default way the @@ -1140,6 +1205,12 @@ def optimizer_step( By default, Lightning calls ``step()`` and ``zero_grad()`` as shown in the example once per optimizer. + Warning: + If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter + to ``optimizer.step()`` function as shown in the examples. This ensures that + ``train_step_and_backward_closure`` is called within + :meth:`~pytorch_lightning.trainer.training_loop.TrainLoop.run_training_batch`. 
+ Args: epoch: Current epoch batch_idx: Index of current batch @@ -1154,23 +1225,23 @@ def optimizer_step( .. code-block:: python # DEFAULT - def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): - optimizer.step() + optimizer.step(closure=optimizer_closure) # Alternating schedule for optimizer steps (i.e.: GANs) - def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): # update generator opt every 2 steps if optimizer_idx == 0: if batch_idx % 2 == 0 : - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() # update discriminator opt every 4 steps if optimizer_idx == 1: if batch_idx % 4 == 0 : - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() # ... @@ -1183,8 +1254,8 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, .. code-block:: python # learning rate warm-up - def optimizer_step(self, current_epoch, batch_idx, optimizer, - optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, + optimizer_closure, on_tpu, using_native_amp, using_lbfgs): # warm up lr if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) 
@@ -1192,7 +1263,7 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, pg['lr'] = lr_scale * self.learning_rate # update params - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() Note: @@ -1278,11 +1349,11 @@ def tbptt_split_batch(self, batch, split_size): batch_split = [] for i, x in enumerate(batch): if isinstance(x, torch.Tensor): - split_x = x[:, t : t + split_size] + split_x = x[:, t: t + split_size] elif isinstance(x, collections.Sequence): split_x = [None] * len(x) for batch_idx in range(len(x)): - split_x[batch_idx] = x[batch_idx][t : t + split_size] + split_x[batch_idx] = x[batch_idx][t: t + split_size] batch_split.append(split_x) @@ -1376,7 +1447,6 @@ def _verify_is_manual_optimization(self, fn_name): @classmethod def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: - """""" """ Collect all module arguments in the current constructor and all child constructors. The child constructors are all the ``__init__`` methods that reach the current class through @@ -1395,7 +1465,7 @@ def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: frame_args = collect_init_args(frame.f_back, []) self_arguments = frame_args[-1] - # set module_arguments in child + # set hyper_parameters in child self_arguments = self_arguments parents_arguments = {} @@ -1539,7 +1609,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg def to_torchscript( self, file_path: Optional[str] = None, method: Optional[str] = 'script', - example_inputs: Optional[torch.Tensor] = None, **kwargs + example_inputs: Optional[Union[torch.Tensor, Tuple[torch.Tensor]]] = None, **kwargs ) -> Union[ScriptModule, Dict[str, ScriptModule]]: """ By default compiles the whole model to a :class:`~torch.jit.ScriptModule`. 
@@ -1576,6 +1646,9 @@ def to_torchscript( >>> model = SimpleModel() >>> torch.jit.save(model.to_torchscript(), "model.pt") # doctest: +SKIP >>> os.path.isfile("model.pt") # doctest: +SKIP + >>> torch.jit.save(model.to_torchscript(file_path="model_trace.pt", method='trace', # doctest: +SKIP + ... example_inputs=torch.randn(1, 64))) # doctest: +SKIP + >>> os.path.isfile("model_trace.pt") # doctest: +SKIP True Return: @@ -1592,8 +1665,8 @@ def to_torchscript( if example_inputs is None: example_inputs = self.example_input_array # automatically send example inputs to the right device and use trace - torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs.to(self.device), - **kwargs) + example_inputs = self.transfer_batch_to_device(example_inputs, device=self.device) + torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs) else: raise ValueError(f"The 'method' parameter only supports 'script' or 'trace', but value given was:" f"{method}") diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 7124007cd3f2a..2662aa6758332 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -60,9 +60,9 @@ def load_from_checkpoint( ): r""" Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint - it stores the arguments passed to `__init__` in the checkpoint under `module_arguments` + it stores the arguments passed to `__init__` in the checkpoint under `hyper_parameters` - Any arguments specified through \*args and \*\*kwargs will override args stored in `hparams`. + Any arguments specified through \*args and \*\*kwargs will override args stored in `hyper_parameters`. Args: checkpoint_path: Path to checkpoint. This can also be a URL, or file-like object @@ -89,8 +89,8 @@ def load_from_checkpoint( `hparams` as :class:`~dict`. 
strict: Whether to strictly enforce that the keys in :attr:`checkpoint_path` match the keys returned by this module's state dict. Default: `True`. - hparam_overrides: A dictionary with keys to override in the hparams - kwargs: Any keyword args needed to init the model. + kwargs: Any extra keyword args needed to init the model. Can also be used to override saved + hyperparameter values. Return: :class:`LightningModule` with loaded weights and hyperparameters (if available). diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 650c1876d0cd0..8f8a517d544f0 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -124,14 +124,17 @@ def log( sync_dist: bool = False, sync_dist_op: Union[Any, str] = 'mean', sync_dist_group: Optional[Any] = None, + sync_fn: Callable = None, + dataloader_idx: Optional[int] = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): value = value.detach() - # sync across ddp + # sync across workers when using distributed training + sync_fn = sync_fn or sync_ddp_if_available if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): - value = sync_ddp_if_available(value, group=sync_dist_group, reduce_op=sync_dist_op) + value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: self.__setitem__('meta', {}) @@ -144,6 +147,7 @@ def log( # set step version step_name = f'{name}_step' + self.__set_meta( step_name, value, @@ -154,12 +158,15 @@ def log( reduce_fx=reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=False + forked=False, + dataloader_idx=dataloader_idx, ) + self.__setitem__(step_name, value) # set epoch version epoch_name = f'{name}_epoch' + self.__set_meta( epoch_name, value, @@ -170,7 +177,8 @@ def log( reduce_fx=reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=False + forked=False, + 
dataloader_idx=dataloader_idx, ) self.__setitem__(epoch_name, value) @@ -185,7 +193,8 @@ def log( reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=was_forked + forked=was_forked, + dataloader_idx=dataloader_idx, ) # set the value @@ -202,7 +211,8 @@ def __set_meta( reduce_fx: Callable, tbptt_pad_token: int, tbptt_reduce_fx: Callable, - forked: bool + forked: bool, + dataloader_idx: Union[int, None] ): # set the meta for the item meta_value = value @@ -215,7 +225,8 @@ def __set_meta( value=meta_value, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=forked + forked=forked, + dataloader_idx=dataloader_idx, ) self['meta'][name] = meta @@ -225,13 +236,22 @@ def __set_meta( _internal['_reduce_on_epoch'] = max(_internal['_reduce_on_epoch'], on_epoch) def track_batch_size(self, batch): + batch_size = Result.extract_batch_size(batch) + Result.attach_batch_size(batch_size, self) + + @staticmethod + def extract_batch_size(batch): try: batch_size = Result.unpack_batch_size(batch) except RecursionError as re: batch_size = 1 + return batch_size - meta = self['meta'] - meta['_internal']['batch_sizes'].append(batch_size) + @staticmethod + def attach_batch_size(batch_size: Union[int, None], result: 'Result') -> None: + if batch_size is not None: + meta = result['meta'] + meta['_internal']['batch_sizes'].append(batch_size) def get_batch_sizes(self): meta = self['meta'] @@ -242,7 +262,12 @@ def get_callback_metrics(self) -> dict: return result - def get_batch_log_metrics(self, include_forked_originals=True) -> dict: + def _add_dataloader_idx(self, k: str, dataloader_idx: Union[int, None], add_dataloader_idx: bool) -> str: + if dataloader_idx is not None and add_dataloader_idx: + return f"{k}/dataloader_idx_{dataloader_idx}" + return k + + def get_batch_log_metrics(self, include_forked_originals=True, add_dataloader_idx=False) -> dict: """ Gets the metrics to log at the end of the batch step @@ -257,15 +282,17 @@ def 
get_batch_log_metrics(self, include_forked_originals=True) -> dict: if options['forked'] and not include_forked_originals: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['logger'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache + result[dl_key] = self[k]._forward_cache.detach() else: - result[k] = self[k] + result[dl_key] = self[k] return result - def get_epoch_log_metrics(self) -> dict: + def get_epoch_log_metrics(self, add_dataloader_idx=False) -> dict: """ Gets the metrics to log at the end of epoch """ @@ -279,11 +306,13 @@ def get_epoch_log_metrics(self) -> dict: if options['forked']: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['logger'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[dl_key] = self[k].compute().detach() else: - result[k] = self[k] + result[dl_key] = self[k] if k in self and not options['on_epoch'] and isinstance(self[k], Metric): # compute metric on epoch anyway so state does not accumulate @@ -291,7 +320,7 @@ def get_epoch_log_metrics(self) -> dict: return result - def get_epoch_pbar_metrics(self): + def get_epoch_pbar_metrics(self, add_dataloader_idx=False): """ Gets the metrics to log at the end of epoch """ @@ -305,11 +334,13 @@ def get_epoch_pbar_metrics(self): if options['forked']: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['prog_bar'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[dl_key] = self[k].compute().detach() else: - result[k] = self[k] + result[dl_key] = self[k] if k in self and not options['on_epoch'] and isinstance(self[k], Metric): # compute metric on epoch anyway so state does not accumulate @@ -317,7 +348,7 @@ def get_epoch_pbar_metrics(self): return result - def get_forked_metrics(self): + 
def get_forked_metrics(self, add_dataloader_idx=False): """ Gets the metrics to log at the end of epoch """ @@ -328,12 +359,14 @@ def get_forked_metrics(self): if k == '_internal': continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['forked']: - result[k] = self[k] + result[dl_key] = self[k] return result - def get_batch_pbar_metrics(self, include_forked_originals=True): + def get_batch_pbar_metrics(self, include_forked_originals=True, add_dataloader_idx=False): """ Gets the metrics to log at the end of the batch step """ @@ -347,11 +380,13 @@ def get_batch_pbar_metrics(self, include_forked_originals=True): if options['forked'] and not include_forked_originals: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['prog_bar'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache + result[dl_key] = self[k]._forward_cache else: - result[k] = self[k] + result[dl_key] = self[k] return result @@ -360,6 +395,12 @@ def detach(self): if isinstance(v, torch.Tensor): self.__setitem__(k, v.detach()) + def cpu(self): + """Move all self attributes to CPU.""" + for k, v in self.items(): + if isinstance(v, torch.Tensor): + self.__setitem__(k, v.cpu()) + def __repr__(self): self_copy = self.copy() @@ -473,6 +514,8 @@ def reduce_on_epoch_end(cls, outputs): if option['on_epoch']: fx = option['reduce_fx'] if fx == torch.mean: + if isinstance(result[k], list): + result[k] = torch.tensor(result[k]).float() try: reduced_val = weighted_mean(result[k], batch_sizes) except Exception as e: diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index cf0b22d7d446f..2246d02bc9bcb 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -188,7 +188,7 @@ def _sanitize_callable(val): return val.__name__ return _val except Exception: - return val.__name__ + return getattr(val, "__name__", 
None) return val return {key: _sanitize_callable(val) for key, val in params.items()} diff --git a/pytorch_lightning/loggers/comet.py b/pytorch_lightning/loggers/comet.py index 553d5186a979d..40cff1bc7e819 100644 --- a/pytorch_lightning/loggers/comet.py +++ b/pytorch_lightning/loggers/comet.py @@ -187,7 +187,6 @@ def experiment(self): if self._future_experiment_key is not None: os.environ["COMET_EXPERIMENT_KEY"] = self._future_experiment_key - self._future_experiment_key = None try: if self.mode == "online": @@ -212,7 +211,9 @@ def experiment(self): **self._kwargs, ) finally: - os.environ.pop("COMET_EXPERIMENT_KEY", None) + if self._future_experiment_key is not None: + os.environ.pop("COMET_EXPERIMENT_KEY") + self._future_experiment_key = None if self._experiment_name: self._experiment.set_name(self._experiment_name) @@ -278,6 +279,9 @@ def version(self) -> str: if self._experiment_key is not None: return self._experiment_key + if "COMET_EXPERIMENT_KEY" in os.environ: + return os.environ["COMET_EXPERIMENT_KEY"] + if self._future_experiment_key is not None: return self._future_experiment_key diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py index c22f46eb03f5a..2d478855eb1b5 100644 --- a/pytorch_lightning/loggers/csv_logs.py +++ b/pytorch_lightning/loggers/csv_logs.py @@ -52,7 +52,7 @@ def __init__(self, log_dir: str) -> None: self.metrics = [] self.log_dir = log_dir - if os.path.exists(self.log_dir): + if os.path.exists(self.log_dir) and os.listdir(self.log_dir): rank_zero_warn( f"Experiment logs directory {self.log_dir} exists and is not empty." " Previous log files in this directory will be deleted when the new ones are saved!" 
diff --git a/pytorch_lightning/loggers/mlflow.py b/pytorch_lightning/loggers/mlflow.py index de915785dcb45..ee9f8f86cf247 100644 --- a/pytorch_lightning/loggers/mlflow.py +++ b/pytorch_lightning/loggers/mlflow.py @@ -16,6 +16,8 @@ MLflow ------ """ +import re +import warnings from argparse import Namespace from time import time from typing import Any, Dict, Optional, Union @@ -151,6 +153,13 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> if isinstance(v, str): log.warning(f'Discarding metric with string value {k}={v}.') continue + + new_k = re.sub("[^a-zA-Z0-9_/. -]+", "", k) + if k != new_k: + warnings.warn(("MLFlow only allows '_', '/', '.' and ' ' special characters in metric name.\n", + f"Replacing {k} with {new_k}.")) + k = new_k + self.experiment.log_metric(self.run_id, k, v, timestamp_ms, step) @rank_zero_only diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index e6ce264d597bf..5786a52a8e371 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -94,6 +94,8 @@ def __init__( self._offline = offline self._log_model = log_model self._kwargs = kwargs + # logging multiple Trainer on a single W&B run (k-fold, etc) + self._step_offset = 0 def __getstate__(self): state = self.__dict__.copy() @@ -141,8 +143,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0' - - self.experiment.log({'global_step': step, **metrics} if step is not None else metrics) + self.experiment.log(metrics, step=(step + self._step_offset) if step is not None else None) @property def save_dir(self) -> Optional[str]: @@ -159,6 +160,10 @@ def version(self) -> Optional[str]: return self._experiment.id if self._experiment else self._id def finalize(self, status: str) -> None: + # 
offset future training logged on same W&B run + if self._experiment is not None: + self._step_offset = self._experiment.step + # upload all checkpoints from saving dir if self._log_model: wandb.save(os.path.join(self.save_dir, "*.ckpt")) diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py index 3e42c73e70f84..6be7fbdea7c30 100644 --- a/pytorch_lightning/metrics/__init__.py +++ b/pytorch_lightning/metrics/__init__.py @@ -17,7 +17,8 @@ Accuracy, Precision, Recall, - Fbeta + Fbeta, + ConfusionMatrix ) from pytorch_lightning.metrics.regression import ( diff --git a/pytorch_lightning/metrics/classification/__init__.py b/pytorch_lightning/metrics/classification/__init__.py index 7b342235085b7..e440edc2ebfb9 100644 --- a/pytorch_lightning/metrics/classification/__init__.py +++ b/pytorch_lightning/metrics/classification/__init__.py @@ -14,3 +14,4 @@ from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.metrics.classification.precision_recall import Precision, Recall from pytorch_lightning.metrics.classification.f_beta import Fbeta +from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix diff --git a/pytorch_lightning/metrics/classification/accuracy.py b/pytorch_lightning/metrics/classification/accuracy.py index 70332331f681f..0f01fb9813407 100644 --- a/pytorch_lightning/metrics/classification/accuracy.py +++ b/pytorch_lightning/metrics/classification/accuracy.py @@ -21,6 +21,7 @@ import torch from torch import nn from pytorch_lightning.metrics.metric import Metric +from pytorch_lightning.metrics.utils import _input_format_classification class Accuracy(Metric): @@ -49,6 +50,9 @@ class Accuracy(Metric): before returning the value at the step. default: False process_group: Specify the process group on which synchronization is called. default: None (which selects the entire world) + dist_sync_fn: + Callback that performs the allgather operation on the metric state. 
When `None`, DDP + will be used to perform the allgather. default: None Example: @@ -60,18 +64,19 @@ class Accuracy(Metric): tensor(0.5000) """ - def __init__( self, threshold: float = 0.5, compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") @@ -79,21 +84,6 @@ def __init__( self.threshold = threshold - def _input_format(self, preds: torch.Tensor, target: torch.Tensor): - if not (len(preds.shape) == len(target.shape) or len(preds.shape) == len(target.shape) + 1): - raise ValueError( - "preds and target must have same number of dimensions, or one additional dimension for preds" - ) - - if len(preds.shape) == len(target.shape) + 1: - # multi class probabilites - preds = torch.argmax(preds, dim=1) - - if len(preds.shape) == len(target.shape) and preds.dtype == torch.float: - # binary or multilabel probablities - preds = (preds >= self.threshold).long() - return preds, target - def update(self, preds: torch.Tensor, target: torch.Tensor): """ Update state with predictions and targets. @@ -102,7 +92,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor): preds: Predictions from model target: Ground truth values """ - preds, target = self._input_format(preds, target) + preds, target = _input_format_classification(preds, target, self.threshold) assert preds.shape == target.shape self.correct += torch.sum(preds == target) diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py new file mode 100644 index 0000000000000..5a825d8f191cc --- /dev/null +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -0,0 +1,111 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional + +import torch + +from pytorch_lightning.metrics.metric import Metric +from pytorch_lightning.metrics.functional.confusion_matrix import ( + _confusion_matrix_update, + _confusion_matrix_compute +) + + +class ConfusionMatrix(Metric): + """ + Computes the confusion matrix. Works with binary, multiclass, and multilabel data. + Accepts logits from a model output or integer class values in prediction. + Works with multi-dimensional preds and target. + + Forward accepts + + - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes + - ``target`` (long tensor): ``(N, ...)`` + + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. + This is the case for binary and multi-label logits. + + If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. + + Args: + num_classes: Number of classes in the dataset. + normalize: Normalization mode for confusion matrix. Choose from + + - ``None``: no normalization (default) + - ``'true'``: normalization over the targets (most commonly used) + - ``'pred'``: normalization over the predictions + - ``'all'``: normalization over the whole matrix + + threshold: + Threshold value for binary or multi-label logits. default: 0.5 + compute_on_step: + Forward only calls ``update()`` and return None if this is set to False. 
default: True + dist_sync_on_step: + Synchronize metric state across processes at each ``forward()`` + before returning the value at the step. default: False + process_group: + Specify the process group on which synchronization is called. default: None (which selects the entire world) + + Example: + + >>> from pytorch_lightning.metrics import ConfusionMatrix + >>> target = torch.tensor([1, 1, 0, 0]) + >>> preds = torch.tensor([0, 1, 0, 0]) + >>> confmat = ConfusionMatrix(num_classes=2) + >>> confmat(preds, target) + tensor([[2., 0.], + [1., 1.]]) + + """ + def __init__( + self, + num_classes: int, + normalize: Optional[str] = None, + threshold: float = 0.5, + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + ): + + super().__init__( + compute_on_step=compute_on_step, + dist_sync_on_step=dist_sync_on_step, + process_group=process_group, + ) + self.num_classes = num_classes + self.normalize = normalize + self.threshold = threshold + + allowed_normalize = ('true', 'pred', 'all', None) + assert self.normalize in allowed_normalize, \ + f"Argument average needs to one of the following: {allowed_normalize}" + + self.add_state("confmat", default=torch.zeros(num_classes, num_classes), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor): + """ + Update state with predictions and targets. 
+ + Args: + preds: Predictions from model + target: Ground truth values + """ + confmat = _confusion_matrix_update(preds, target, self.num_classes, self.threshold) + self.confmat += confmat + + def compute(self) -> torch.Tensor: + """ + Computes confusion matrix + """ + return _confusion_matrix_compute(self.confmat, self.normalize) diff --git a/pytorch_lightning/metrics/functional/__init__.py b/pytorch_lightning/metrics/functional/__init__.py index 1af3db85b0de3..620072b44a2da 100644 --- a/pytorch_lightning/metrics/functional/__init__.py +++ b/pytorch_lightning/metrics/functional/__init__.py @@ -16,12 +16,12 @@ auc, auroc, average_precision, - confusion_matrix, dice_score, f1_score, fbeta_score, multiclass_precision_recall_curve, multiclass_roc, + multiclass_auroc, precision, precision_recall, precision_recall_curve, @@ -44,3 +44,4 @@ from pytorch_lightning.metrics.functional.mean_squared_log_error import mean_squared_log_error from pytorch_lightning.metrics.functional.psnr import psnr from pytorch_lightning.metrics.functional.ssim import ssim +from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py index a831611fb9593..aec1b47096e26 100644 --- a/pytorch_lightning/metrics/functional/classification.py +++ b/pytorch_lightning/metrics/functional/classification.py @@ -301,48 +301,6 @@ def _confmat_normalize(cm): return cm -def confusion_matrix( - pred: torch.Tensor, - target: torch.Tensor, - normalize: bool = False, - num_classes: Optional[int] = None -) -> torch.Tensor: - """ - Computes the confusion matrix C where each entry C_{i,j} is the number of observations - in group i that were predicted in group j. 
- - Args: - pred: estimated targets - target: ground truth labels - normalize: normalizes confusion matrix - num_classes: number of classes - - Return: - Tensor, confusion matrix C [num_classes, num_classes ] - - Example: - - >>> x = torch.tensor([1, 2, 3]) - >>> y = torch.tensor([0, 2, 3]) - >>> confusion_matrix(x, y) - tensor([[0., 1., 0., 0.], - [0., 0., 0., 0.], - [0., 0., 1., 0.], - [0., 0., 0., 1.]]) - """ - num_classes = get_num_classes(pred, target, num_classes) - - unique_labels = (target.view(-1) * num_classes + pred.view(-1)).to(torch.int) - - bins = torch.bincount(unique_labels, minlength=num_classes ** 2) - cm = bins.reshape(num_classes, num_classes).squeeze().float() - - if normalize: - cm = _confmat_normalize(cm) - - return cm - - def precision_recall( pred: torch.Tensor, target: torch.Tensor, @@ -859,13 +817,14 @@ def new_func(*args, **kwargs) -> torch.Tensor: def multiclass_auc_decorator(reorder: bool = True) -> Callable: def wrapper(func_to_decorate: Callable) -> Callable: + @wraps(func_to_decorate) def new_func(*args, **kwargs) -> torch.Tensor: results = [] for class_result in func_to_decorate(*args, **kwargs): x, y = class_result[:2] results.append(auc(x, y, reorder=reorder)) - return torch.cat(results) + return torch.stack(results) return new_func @@ -900,7 +859,7 @@ def auroc( if any(target > 1): raise ValueError('AUROC metric is meant for binary classification, but' ' target tensor contains value different from 0 and 1.' 
- ' Multiclass is currently not supported.') + ' Use `multiclass_auroc` for multi class classification.') @auc_decorator(reorder=True) def _auroc(pred, target, sample_weight, pos_label): @@ -909,6 +868,62 @@ def _auroc(pred, target, sample_weight, pos_label): return _auroc(pred=pred, target=target, sample_weight=sample_weight, pos_label=pos_label) +def multiclass_auroc( + pred: torch.Tensor, + target: torch.Tensor, + sample_weight: Optional[Sequence] = None, + num_classes: Optional[int] = None, +) -> torch.Tensor: + """ + Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from multiclass + prediction scores + + Args: + pred: estimated probabilities, with shape [N, C] + target: ground-truth labels, with shape [N,] + sample_weight: sample weights + num_classes: number of classes (default: None, computes automatically from data) + + Return: + Tensor containing ROCAUC score + + Example: + + >>> pred = torch.tensor([[0.85, 0.05, 0.05, 0.05], + ... [0.05, 0.85, 0.05, 0.05], + ... [0.05, 0.05, 0.85, 0.05], + ... [0.05, 0.05, 0.05, 0.85]]) + >>> target = torch.tensor([0, 1, 3, 2]) + >>> multiclass_auroc(pred, target) # doctest: +NORMALIZE_WHITESPACE + tensor(0.6667) + """ + if not torch.allclose(pred.sum(dim=1), torch.tensor(1.0)): + raise ValueError( + "Multiclass AUROC metric expects the target scores to be" + " probabilities, i.e. they should sum up to 1.0 over classes") + + if torch.unique(target).size(0) != pred.size(1): + raise ValueError( + f"Number of classes found in in 'target' ({torch.unique(target).size(0)})" + f" does not equal the number of columns in 'pred' ({pred.size(1)})." 
+ " Multiclass AUROC is not defined when all of the classes do not" + " occur in the target labels.") + + if num_classes is not None and num_classes != pred.size(1): + raise ValueError( + f"Number of classes deduced from 'pred' ({pred.size(1)}) does not equal" + f" the number of classes passed in 'num_classes' ({num_classes}).") + + @multiclass_auc_decorator(reorder=False) + def _multiclass_auroc(pred, target, sample_weight, num_classes): + return multiclass_roc(pred, target, sample_weight, num_classes) + + class_aurocs = _multiclass_auroc(pred=pred, target=target, + sample_weight=sample_weight, + num_classes=num_classes) + return torch.mean(class_aurocs) + + def average_precision( pred: torch.Tensor, target: torch.Tensor, diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py new file mode 100644 index 0000000000000..143d237b3b2c6 --- /dev/null +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -0,0 +1,96 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional + +import torch + +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.metrics.utils import _input_format_classification + + +def _confusion_matrix_update(preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5) -> torch.Tensor: + preds, target = _input_format_classification(preds, target, threshold) + unique_mapping = (target.view(-1) * num_classes + preds.view(-1)).to(torch.long) + bins = torch.bincount(unique_mapping, minlength=num_classes ** 2) + confmat = bins.reshape(num_classes, num_classes) + return confmat + + +def _confusion_matrix_compute(confmat: torch.Tensor, + normalize: Optional[str] = None) -> torch.Tensor: + allowed_normalize = ('true', 'pred', 'all', None) + assert normalize in allowed_normalize, \ + f"Argument average needs to one of the following: {allowed_normalize}" + confmat = confmat.float() + if normalize is not None: + if normalize == 'true': + cm = confmat / confmat.sum(axis=1, keepdim=True) + elif normalize == 'pred': + cm = confmat / confmat.sum(axis=0, keepdim=True) + elif normalize == 'all': + cm = confmat / confmat.sum() + nan_elements = cm[torch.isnan(cm)].nelement() + if nan_elements != 0: + cm[torch.isnan(cm)] = 0 + rank_zero_warn(f'{nan_elements} nan values found in confusion matrix have been replaced with zeros.') + return cm + return confmat + + +def confusion_matrix( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + normalize: Optional[str] = None, + threshold: float = 0.5 +) -> torch.Tensor: + """ + Computes the confusion matrix. Works with binary, multiclass, and multilabel data. + Accepts logits from a model output or integer class values in prediction. + Works with multi-dimensional preds and target. + + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. + This is the case for binary and multi-label logits. 
+ + If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. + + Args: + preds: (float or long tensor), Either a ``(N, ...)`` tensor with labels or + ``(N, C, ...)`` where C is the number of classes, tensor with logits/probabilities + target: ``target`` (long tensor), tensor with shape ``(N, ...)`` with ground true labels + num_classes: Number of classes in the dataset. + normalize: Normalization mode for confusion matrix. Choose from + + - ``None``: no normalization (default) + - ``'true'``: normalization over the targets (most commonly used) + - ``'pred'``: normalization over the predictions + - ``'all'``: normalization over the whole matrix + + threshold: + Threshold value for binary or multi-label logits. default: 0.5 + + Example: + + >>> from pytorch_lightning.metrics.functional import confusion_matrix + >>> target = torch.tensor([1, 1, 0, 0]) + >>> preds = torch.tensor([0, 1, 0, 0]) + >>> confusion_matrix(preds, target, num_classes=2) + tensor([[2., 0.], + [1., 1.]]) + """ + confmat = _confusion_matrix_update(preds, target, num_classes, threshold) + return _confusion_matrix_compute(confmat, normalize) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 3a853be0ebdd5..9fa479dfb567a 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -24,7 +24,7 @@ from torch import nn from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.distributed import gather_all_tensors_if_available +from pytorch_lightning.utilities.distributed import gather_all_tensors from pytorch_lightning.metrics.utils import _flatten, dim_zero_cat, dim_zero_mean, dim_zero_sum @@ -53,21 +53,26 @@ class Metric(nn.Module, ABC): Forward only calls ``update()`` and returns None if this is set to False. 
default: True dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False + before returning the value at the step. process_group: Specify the process group on which synchronization is called. default: None (which selects the entire world) + dist_sync_fn: + Callback that performs the allgather operation on the metric state. When `None`, DDP + will be used to perform the allgather. default: None """ def __init__( self, compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__() self.dist_sync_on_step = dist_sync_on_step self.compute_on_step = compute_on_step self.process_group = process_group + self.dist_sync_fn = dist_sync_fn self._to_sync = True self.update = self._wrap_update(self.update) @@ -76,8 +81,9 @@ def __init__( self._forward_cache = None # initialize state - self._reductions = {} self._defaults = {} + self._persistent = {} + self._reductions = {} def add_state( self, name: str, default, dist_reduce_fx: Optional[Union[str, Callable]] = None, persistent: bool = True @@ -133,24 +139,20 @@ def add_state( "`dist_reduce_fx` must be callable or one of ['mean', 'sum', 'cat', None]" ) - if isinstance(default, torch.Tensor): - if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): - # persistent keyword is only supported in torch >= 1.6.0 - self.register_buffer(name, default, persistent=persistent) - else: - self.register_buffer(name, default) - else: - setattr(self, name, default) + setattr(self, name, default) self._defaults[name] = deepcopy(default) + self._persistent[name] = persistent self._reductions[name] = dist_reduce_fx + @torch.jit.unused def forward(self, *args, **kwargs): """ Automatically calls ``update()``. Returns the metric value over inputs if ``compute_on_step`` is True. 
""" # add current step - self.update(*args, **kwargs) + with torch.no_grad(): + self.update(*args, **kwargs) self._forward_cache = None if self.compute_on_step: @@ -172,12 +174,12 @@ def forward(self, *args, **kwargs): return self._forward_cache - def _sync_dist(self): + def _sync_dist(self, dist_sync_fn=gather_all_tensors): input_dict = {attr: getattr(self, attr) for attr in self._reductions.keys()} output_dict = apply_to_collection( input_dict, torch.Tensor, - gather_all_tensors_if_available, + dist_sync_fn, group=self.process_group, ) @@ -206,12 +208,15 @@ def wrapped_func(*args, **kwargs): if self._computed is not None: return self._computed - if ( - self._to_sync - and torch.distributed.is_available() # noqa: W503 - and torch.distributed.is_initialized() # noqa: W503 - ): - self._sync_dist() + dist_sync_fn = self.dist_sync_fn + if (dist_sync_fn is None + and torch.distributed.is_available() + and torch.distributed.is_initialized()): + # User provided a bool, so we assume DDP if available + dist_sync_fn = gather_all_tensors + + if self._to_sync and dist_sync_fn is not None: + self._sync_dist(dist_sync_fn) self._computed = compute(*args, **kwargs) self.reset() @@ -255,3 +260,36 @@ def __setstate__(self, state): self.__dict__.update(state) self.update = self._wrap_update(self.update) self.compute = self._wrap_compute(self.compute) + + def _apply(self, fn): + """ Overwrite _apply function such that we can also move metric states + to the correct device when `.to`, `.cuda`, etc methods are called + """ + self = super()._apply(fn) + # Also apply fn to metric states + for key in self._defaults.keys(): + current_val = getattr(self, key) + if isinstance(current_val, torch.Tensor): + setattr(self, key, fn(current_val)) + elif isinstance(current_val, Sequence): + setattr(self, key, [fn(cur_v) for cur_v in current_val]) + else: + raise TypeError('Expected metric state to be either a torch.Tensor' + f'or a list of torch.Tensor, but encountered {current_val}') + return self 
+ + def persistent(self, mode: bool = True): + """ Method for post-init to change if metric states should be saved to + its state_dict + """ + for key in self._persistent.keys(): + self._persistent[key] = mode + + def state_dict(self, *args, **kwargs): + # Register metric states to be part of the state_dict + state_dict = super().state_dict() + for key in self._defaults.keys(): + if self._persistent[key]: + current_val = getattr(self, key) + state_dict.update({key: current_val}) + return state_dict diff --git a/pytorch_lightning/metrics/regression/explained_variance.py b/pytorch_lightning/metrics/regression/explained_variance.py index 79fc8b4c4e183..f59ce0b67de62 100644 --- a/pytorch_lightning/metrics/regression/explained_variance.py +++ b/pytorch_lightning/metrics/regression/explained_variance.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.utilities import rank_zero_warn @@ -74,11 +74,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) allowed_multioutput = ('raw_values', 'uniform_average', 'variance_weighted') if multioutput not in allowed_multioutput: diff --git a/pytorch_lightning/metrics/regression/mean_absolute_error.py b/pytorch_lightning/metrics/regression/mean_absolute_error.py index 89cb56d431ad4..ba6d2c6d79a08 100644 --- a/pytorch_lightning/metrics/regression/mean_absolute_error.py +++ b/pytorch_lightning/metrics/regression/mean_absolute_error.py @@ -49,11 +49,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: 
Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_abs_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/regression/mean_squared_error.py b/pytorch_lightning/metrics/regression/mean_squared_error.py index 87c1fddf2674c..6da6d55d5dd1c 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_error.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.metrics.functional.mean_squared_error import ( @@ -50,11 +50,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_squared_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/regression/mean_squared_log_error.py b/pytorch_lightning/metrics/regression/mean_squared_log_error.py index 256fac20365af..696ad01ca829d 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_log_error.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.metrics.functional.mean_squared_log_error import ( @@ -50,11 +50,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_squared_log_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py index 886c070c2fd18..e9419cf70172b 100644 --- a/pytorch_lightning/metrics/utils.py +++ b/pytorch_lightning/metrics/utils.py @@ -67,3 +67,31 @@ def _check_same_shape(pred: torch.Tensor, target: torch.Tensor): """ Check that predictions and target have the same shape, else raise error """ if pred.shape != target.shape: raise RuntimeError('Predictions and targets are expected to have the same shape') + + +def _input_format_classification(preds: torch.Tensor, target: torch.Tensor, threshold: float): + """ Convert preds and target tensors into label tensors + + Args: + preds: either tensor with labels, tensor with probabilities/logits or + multilabel tensor + target: tensor with ground true labels + threshold: float used for thresholding multilabel input + + Returns: + preds: tensor with labels + target: tensor with labels + """ + if not (len(preds.shape) == len(target.shape) or len(preds.shape) == len(target.shape) + 1): + raise ValueError( + "preds and target must have same number of dimensions, or one additional dimension for preds" + ) + + if len(preds.shape) == len(target.shape) + 1: + # multi class probabilites + preds = torch.argmax(preds, dim=1) + + if len(preds.shape) == len(target.shape) and preds.dtype == torch.float: + # binary or multilabel probablities + preds 
= (preds >= threshold).long() + return preds, target diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 27deeeddfdb45..4c4fdc8f0d368 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -1,12 +1,16 @@ -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from typing import List, Dict, Any + from pytorch_lightning.core.lightning import LightningModule -from typing import List +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel class DDPPlugin(object): """ Plugin to link a custom ddp implementation to any arbitrary accelerator. + This plugin forwards all constructor arguments to `LightningDistributedDataParallel`, + which in turn forwards all args to `DistributedDataParallel`. + Example:: class MyDDP(DDPPlugin): @@ -17,11 +21,16 @@ def configure_ddp(self, model, device_ids): my_ddp = MyDDP() trainer = Trainer(accelerator='ddp_x', plugins=[my_ddp]) - """ - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: + def __init__(self, **kwargs): + self._ddp_kwargs: Dict[str, Any] = kwargs + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> LightningDistributedDataParallel: """ + Pass through all customizations from constructor to `LightningDistributedDataParallel`. Override to define a custom DDP implementation. .. 
note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel @@ -43,5 +52,13 @@ def configure_ddp(self, model, device_ids): the model wrapped in LightningDistributedDataParallel """ - model = LightningDistributedDataParallel(model, device_ids=device_ids, find_unused_parameters=True) + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( + "find_unused_parameters", True + ) + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + **self._ddp_kwargs, + ) return model diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 6506540bde6e1..98bc8dfc87d25 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -29,8 +29,10 @@ def connect(self, model, optimizers): def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = self.trainer.scaler.scale(closure_loss) + automatic_optimization = self.trainer.train_loop.automatic_optimization + # do backward pass - if self.trainer.train_loop.automatic_optimization: + if automatic_optimization: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: @@ -38,6 +40,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # once backward has been applied, release graph closure_loss = closure_loss.detach() + + # unscale gradient to allow analyze within `on_after_backward` + if not self.trainer.train_loop.should_accumulate() and automatic_optimization: + self.trainer.scaler.unscale_(optimizer) + return closure_loss def training_step(self, fx, args): diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py index fc684d143e4b8..fe14bb5751161 100644 --- a/pytorch_lightning/profiler/__init__.py +++ b/pytorch_lightning/profiler/__init__.py @@ -22,12 +22,12 @@ Enable simple profiling ----------------------- -If you 
only wish to profile the standard actions, you can set `profiler=True` when constructing -your `Trainer` object. +If you only wish to profile the standard actions, you can set `profiler="simple"` +when constructing your `Trainer` object. .. code-block:: python - trainer = Trainer(..., profiler=True) + trainer = Trainer(..., profiler="simple") The profiler's results will be printed at the completion of a training `fit()`. @@ -59,6 +59,10 @@ .. code-block:: python + trainer = Trainer(..., profiler="advanced") + + or + profiler = AdvancedProfiler() trainer = Trainer(..., profiler=profiler) diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 3f073e726f7eb..98abd994b531d 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -12,1555 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -.. testsetup:: * - - import os - from pytorch_lightning.trainer.trainer import Trainer - from pytorch_lightning.core.lightning import LightningModule - from pytorch_lightning.utilities.seed import seed_everything - - -Once you've organized your PyTorch code into a LightningModule, -the Trainer automates everything else. - -.. raw:: html - - - -| - -This abstraction achieves the following: - -1. You maintain control over all aspects via PyTorch code without an added abstraction. - -2. The trainer uses best practices embedded by contributors and users - from top AI labs such as Facebook AI Research, NYU, MIT, Stanford, etc... - -3. The trainer allows overriding any key part that you don't want automated. - -| - ------------ - -Basic use ---------- - -This is the basic use of the trainer: - -.. 
code-block:: python - - model = MyLightningModule() - - trainer = Trainer() - trainer.fit(model, train_dataloader, val_dataloader) - - --------- - -Trainer in Python scripts -------------------------- -In Python scripts, it's recommended you use a main function to call the Trainer. - -.. code-block:: python - - from argparse import ArgumentParser - - def main(hparams): - model = LightningModule() - trainer = Trainer(gpus=hparams.gpus) - trainer.fit(model) - - if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument('--gpus', default=None) - args = parser.parse_args() - - main(args) - -So you can run it like so: - -.. code-block:: bash - - python main.py --gpus 2 - -.. note:: - - Pro-tip: You don't need to define all flags manually. Lightning can add them automatically - -.. code-block:: python - - from argparse import ArgumentParser - - def main(args): - model = LightningModule() - trainer = Trainer.from_argparse_args(args) - trainer.fit(model) - - if __name__ == '__main__': - parser = ArgumentParser() - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - main(args) - -So you can run it like so: - -.. code-block:: bash - - python main.py --gpus 2 --max_steps 10 --limit_train_batches 10 --any_trainer_arg x - -.. note:: - If you want to stop a training run early, you can press "Ctrl + C" on your keyboard. - The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including - running callbacks such as `on_train_end`. The trainer object will also set an attribute - `interrupted` to `True` in such cases. If you have a callback which shuts down compute - resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. - ------------- - -Testing -------- -Once you're done training, feel free to run the test set! -(Only right before publishing your paper or pushing to production) - -.. 
code-block:: python - - trainer.test(test_dataloader=test_dataloader) - ------------- - -Deployment / prediction ------------------------ -You just trained a LightningModule which is also just a torch.nn.Module. -Use it to do whatever! - -.. code-block:: python - - # load model - pretrained_model = LightningModule.load_from_checkpoint(PATH) - pretrained_model.freeze() - - # use it for finetuning - def forward(self, x): - features = pretrained_model(x) - classes = classifier(features) - - # or for prediction - out = pretrained_model(x) - api_write({'response': out} - - -You may wish to run the model on a variety of devices. Instead of moving the data -manually to the correct device, decorate the forward method (or any other method you use for inference) -with :func:`~pytorch_lightning.core.decorators.auto_move_data` and Lightning will take care of the rest. - ------------- - -Reproducibility ---------------- - -To ensure full reproducibility from run to run you need to set seeds for pseudo-random generators, -and set ``deterministic`` flag in ``Trainer``. - -Example:: - - from pytorch_lightning import Trainer, seed_everything - - seed_everything(42) - # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. - model = Model() - trainer = Trainer(deterministic=True) - - -------- - -Trainer flags -------------- - -accelerator -^^^^^^^^^^^ - -.. raw:: html - - - -| - -The accelerator backend to use (previously known as distributed_backend). - -- (```dp```) is DataParallel (split batch among GPUs of same machine) -- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) -- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. - Useful for multi-node CPU training or single-node debugging. Note that this will **not** give - a speedup on a single node, since Torch already makes effient use of multiple CPUs on a single - machine.) -- (```ddp2```) dp on node, ddp across nodes. 
Useful for things like increasing - the number of negative samples - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(distributed_backend=None) - -Example:: - - # dp = DataParallel - trainer = Trainer(gpus=2, distributed_backend='dp') - - # ddp = DistributedDataParallel - trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp') - - # ddp2 = DistributedDataParallel + dp - trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2') - -.. note:: this option does not apply to TPU. TPUs use ```ddp``` by default (over each core) - -You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. - -Example:: - - class MyOwnDDP(DDPAccelerator): - ... - - Trainer(accelerator=MyOwnDDP()) - -.. warning:: Passing in custom accelerators is experimental but work is in progress to enable full compatibility. - -accumulate_grad_batches -^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Accumulates grads every k batches or as set up in the dict. -Trainer also calls ``optimizer.step()`` for the last indivisible step number. - -.. testcode:: - - # default used by the Trainer (no accumulation) - trainer = Trainer(accumulate_grad_batches=1) - -Example:: - - # accumulate every 4 batches (effective batch size is batch*4) - trainer = Trainer(accumulate_grad_batches=4) - - # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that - trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20}) - -amp_backend -^^^^^^^^^^^ - -.. raw:: html - - - -| - -Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex'). - -.. testcode:: - - # using PyTorch built-in AMP, default used by the Trainer - trainer = Trainer(amp_backend='native') - - # using NVIDIA Apex - trainer = Trainer(amp_backend='apex') - -amp_level -^^^^^^^^^ - -.. raw:: html - - - -| - -The optimization level to use (O1, O2, etc...) -for 16-bit GPU precision (using NVIDIA apex under the hood). 
- -Check `NVIDIA apex docs `_ for level - -Example:: - - # default used by the Trainer - trainer = Trainer(amp_level='O2') - -automatic_optimization -^^^^^^^^^^^^^^^^^^^^^^ -When set to False, Lightning does not automate the optimization process. This means you are responsible for your own -optimizer behavior - -Example:: - - def training_step(self, batch, batch_idx): - opt = self.optimizers() - - loss = ... - self.manual_backward(loss, opt) - opt.step() - opt.zero_grad() - -This is not recommended when using a single optimizer, instead it's recommended when using 2+ optimizers -AND you are an expert user. Most useful for research like RL, sparse coding and GAN research. - -In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizers directly - -Example:: - - def training_step(self, batch, batch_idx, optimizer_idx): - (opt_a, opt_b) = self.optimizers() - - gen_loss = ... - self.manual_backward(gen_loss, opt_a) - opt_a.step() - opt_a.zero_grad() - - disc_loss = ... - self.manual_backward(disc_loss, opt_b) - opt_b.step() - opt_b.zero_grad() - -auto_scale_batch_size -^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Automatically tries to find the largest batch size that fits into memory, -before any training. - -.. code-block:: - - # default used by the Trainer (no scaling of batch size) - trainer = Trainer(auto_scale_batch_size=None) - - # run batch size scaling, result overrides hparams.batch_size - trainer = Trainer(auto_scale_batch_size='binsearch') - - # call tune to find the batch size - trainer.tune(model) - -auto_select_gpus -^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -If enabled and `gpus` is an integer, pick available gpus automatically. -This is especially useful when GPUs are configured to be in "exclusive mode", -such that only one process at a time can access them. 
- -Example:: - - # no auto selection (picks first 2 gpus on system, may fail if other process is occupying) - trainer = Trainer(gpus=2, auto_select_gpus=False) - - # enable auto selection (will find two available gpus on system) - trainer = Trainer(gpus=2, auto_select_gpus=True) - -auto_lr_find -^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Runs a learning rate finder algorithm (see this `paper `_) -when calling trainer.tune(), to find optimal initial learning rate. - -.. code-block:: python - - # default used by the Trainer (no learning rate finder) - trainer = Trainer(auto_lr_find=False) - -Example:: - - # run learning rate finder, results override hparams.learning_rate - trainer = Trainer(auto_lr_find=True) - - # call tune to find the lr - trainer.tune(model) - -Example:: - - # run learning rate finder, results override hparams.my_lr_arg - trainer = Trainer(auto_lr_find='my_lr_arg') - - # call tune to find the lr - trainer.tune(model) - -.. note:: - See the :ref:`learning rate finder guide `. - -benchmark -^^^^^^^^^ - -.. raw:: html - - - -| - -If true enables cudnn.benchmark. -This flag is likely to increase the speed of your system if your -input sizes don't change. However, if it does, then it will likely -make your system slower. - -The speedup comes from allowing the cudnn auto-tuner to find the best -algorithm for the hardware `[see discussion here] -`_. - -Example:: - - # default used by the Trainer - trainer = Trainer(benchmark=False) - -deterministic -^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -If true enables cudnn.deterministic. -Might make your system slower, but ensures reproducibility. -Also sets ``$HOROVOD_FUSION_THRESHOLD=0``. - -For more info check `[pytorch docs] -`_. - -Example:: - - # default used by the Trainer - trainer = Trainer(deterministic=False) - -callbacks -^^^^^^^^^ - -.. raw:: html - - - -| - -Add a list of :class:`~pytorch_lightning.callbacks.Callback`. 
These callbacks DO NOT replace the explicit callbacks -(loggers or :class:`~pytorch_lightning.callbacks.ModelCheckpoint`). - -.. note:: Only user defined callbacks (ie: Not :class:`~pytorch_lightning.callbacks.ModelCheckpoint`) - -.. code-block:: python - - # a list of callbacks - callbacks = [PrintCallback()] - trainer = Trainer(callbacks=callbacks) - -Example:: - - from pytorch_lightning.callbacks import Callback - - class PrintCallback(Callback): - def on_train_start(self, trainer, pl_module): - print("Training is started!") - def on_train_end(self, trainer, pl_module): - print("Training is done.") - -check_val_every_n_epoch -^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Check val every n train epochs. - -Example:: - - # default used by the Trainer - trainer = Trainer(check_val_every_n_epoch=1) - - # run val loop every 10 training epochs - trainer = Trainer(check_val_every_n_epoch=10) - -checkpoint_callback -^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Pass in a callback for checkpointing. Checkpoints capture the exact value of all parameters used by a model. -By default Lightning saves a checkpoint for you in your current working directory, with the state of your last training epoch, -but you can override the default behavior by Initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` callback, -and passing it to :class:`~pytorch_lightning.trainer.Trainer` `checkpoint_callback` flag. - -.. code-block:: python - - from pytorch_lightning.callbacks import ModelCheckpoint - - # default used by the Trainer - checkpoint_callback = ModelCheckpoint( - dirpath=os.getcwd(), - save_top_k=True, - verbose=True, - monitor='checkpoint_on', - mode='min', - prefix='' - ) - - trainer = Trainer(checkpoint_callback=checkpoint_callback) - -To disable automatic checkpointing, set this to `False`. - -.. code-block:: python - - trainer = Trainer(checkpoint_callback=False) - -See also :ref:`Saving and Loading Weights `. 
- -default_root_dir -^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Default path for logs and weights when no logger or -:class:`pytorch_lightning.callbacks.ModelCheckpoint` callback passed. On -certain clusters you might want to separate where logs and checkpoints are -stored. If you don't then use this argument for convenience. Paths can be local -paths or remote paths such as `s3://bucket/path` or 'hdfs://path/'. Credentials -will need to be set up to use remote filepaths. - -Example:: - - # default used by the Trainer - trainer = Trainer(default_root_dir=os.getcwd()) - -distributed_backend -^^^^^^^^^^^^^^^^^^^ -This has been renamed "accelerator". - -fast_dev_run -^^^^^^^^^^^^ - -.. raw:: html - - - -| - -.. raw:: html - - - -| - -Runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test). - -Under the hood the pseudocode looks like this: - -.. code-block:: python - - # loading - __init__() - prepare_data - - # test training step - training_batch = next(train_dataloader) - training_step(training_batch) - - # test val step - val_batch = next(val_dataloader) - out = validation_step(val_batch) - validation_epoch_end([out]) - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(fast_dev_run=False) - - # runs 1 train, val, test batch and program ends - trainer = Trainer(fast_dev_run=True) - -gpus -^^^^ - -.. raw:: html - - - -| - -- Number of GPUs to train on (int) -- or which GPUs to train on (list) -- can handle strings - -..
testcode:: - - # default used by the Trainer (ie: train on CPU) - trainer = Trainer(gpus=None) - - # equivalent - trainer = Trainer(gpus=0) - -Example:: - - # int: train on 2 gpus - trainer = Trainer(gpus=2) - - # list: train on GPUs 1, 4 (by bus ordering) - trainer = Trainer(gpus=[1, 4]) - trainer = Trainer(gpus='1, 4') # equivalent - - # -1: train on all gpus - trainer = Trainer(gpus=-1) - trainer = Trainer(gpus='-1') # equivalent - - # combine with num_nodes to train on multiple GPUs across nodes - # uses 8 gpus in total - trainer = Trainer(gpus=2, num_nodes=4) - - # train only on GPUs 1 and 4 across nodes - trainer = Trainer(gpus=[1, 4], num_nodes=4) - -See Also: - - :ref:`Multi-GPU training guide `. - -gradient_clip_val -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Gradient clipping value - -- 0 means don't clip. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(gradient_clip_val=0.0) - - -limit_test_batches -^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -How much of test dataset to check. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(limit_test_batches=1.0) - - # run through only 25% of the test set each epoch - trainer = Trainer(limit_test_batches=0.25) - - # run for only 10 batches - trainer = Trainer(limit_test_batches=10) - -In the case of multiple test dataloaders, the limit applies to each dataloader individually. - -limit_val_batches -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -How much of validation dataset to check. -Useful when debugging or testing something that happens at the end of an epoch. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(limit_val_batches=1.0) - - # run through only 25% of the validation set each epoch - trainer = Trainer(limit_val_batches=0.25) - - # run for only 10 batches - trainer = Trainer(limit_val_batches=10) - -In the case of multiple validation dataloaders, the limit applies to each dataloader individually. - -log_gpu_memory -^^^^^^^^^^^^^^ - -.. 
raw:: html - - - -| - -Options: - -- None -- 'min_max' -- 'all' - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(log_gpu_memory=None) - - # log all the GPUs (on master node only) - trainer = Trainer(log_gpu_memory='all') - - # log only the min and max memory on the master node - trainer = Trainer(log_gpu_memory='min_max') - -.. note:: Might slow performance because it uses the output of nvidia-smi. - -flush_logs_every_n_steps -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Writes logs to disk this often. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(flush_logs_every_n_steps=100) - -See Also: - - :ref:`logging` - -logger -^^^^^^ - -.. raw:: html - - - -| - -:ref:`Logger ` (or iterable collection of loggers) for experiment tracking. - -.. testcode:: - - from pytorch_lightning.loggers import TensorBoardLogger - - # default logger used by trainer - logger = TensorBoardLogger( - save_dir=os.getcwd(), - version=1, - name='lightning_logs' - ) - Trainer(logger=logger) - -max_epochs -^^^^^^^^^^ - -.. raw:: html - - - -| - -Stop training once this number of epochs is reached - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(max_epochs=1000) - -min_epochs -^^^^^^^^^^ - -.. raw:: html - - - -| - -Force training for at least these many epochs - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(min_epochs=1) - -max_steps -^^^^^^^^^ - -.. raw:: html - - - -| - -Stop training after this number of steps -Training will stop if max_steps or max_epochs have reached (earliest). - -.. testcode:: - - # Default (disabled) - trainer = Trainer(max_steps=None) - - # Stop after 100 steps - trainer = Trainer(max_steps=100) - -min_steps -^^^^^^^^^ - -.. raw:: html - - - -| - -Force training for at least these number of steps. -Trainer will train model for at least min_steps or min_epochs (latest). - -.. 
testcode:: - - # Default (disabled) - trainer = Trainer(min_steps=None) - - # Run at least for 100 steps (disable min_epochs) - trainer = Trainer(min_steps=100, min_epochs=0) - -num_nodes -^^^^^^^^^ - -.. raw:: html - - - -| - -Number of GPU nodes for distributed training. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(num_nodes=1) - - # to train on 8 nodes - trainer = Trainer(num_nodes=8) - -num_processes -^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Number of processes to train with. Automatically set to the number of GPUs -when using ``distributed_backend="ddp"``. Set to a number greater than 1 when -using ``distributed_backend="ddp_cpu"`` to mimic distributed training on a -machine without GPUs. This is useful for debugging, but **will not** provide -any speedup, since single-process Torch already makes efficient use of multiple -CPUs. - -.. testcode:: - - # Simulate DDP for debugging on your GPU-less laptop - trainer = Trainer(distributed_backend="ddp_cpu", num_processes=2) - -num_sanity_val_steps -^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Sanity check runs n batches of val before starting the training routine. -This catches any bugs in your validation without having to wait for the first validation check. -The Trainer uses 2 steps by default. Turn it off or modify it here. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(num_sanity_val_steps=2) - - # turn it off - trainer = Trainer(num_sanity_val_steps=0) - - # check all validation data - trainer = Trainer(num_sanity_val_steps=-1) - -Example:: - - python -m torch_xla.distributed.xla_dist - --tpu=$TPU_POD_NAME - --conda-env=torch-xla-nightly - --env=XLA_USE_BF16=1 - -- python your_trainer_file.py - -plugins -^^^^^^^ - -.. raw:: html - - - -| - -Plugins allow you to connect arbitrary backends, precision libraries, SLURM, etc... For example: - -- DDP -- SLURM -- TorchElastic -- Apex - -To define your own behavior, subclass the relevant class and pass it in.
Here's an example linking up your own cluster. - -.. code-block:: python - - from pytorch_lightning.cluster_environments import cluster_environment - - class MyCluster(ClusterEnvironment): - - def master_address(self): - return your_master_address - - def master_port(self): - return your_master_port - - def world_size(self): - return the_world_size - - trainer = Trainer(cluster_environment=cluster_environment()) - -prepare_data_per_node -^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -If True will call `prepare_data()` on LOCAL_RANK=0 for every node. -If False will only call from NODE_RANK=0, LOCAL_RANK=0 - -.. testcode:: - - # default - Trainer(prepare_data_per_node=True) - - # use only NODE_RANK=0, LOCAL_RANK=0 - Trainer(prepare_data_per_node=False) - -tpu_cores -^^^^^^^^^ - -.. raw:: html - - - -| - -- How many TPU cores to train on (1 or 8). -- Which TPU core to train on [1-8] - -A single TPU v2 or v3 has 8 cores. A TPU pod has -up to 2048 cores. A slice of a POD means you get as many cores -as you request. - -Your effective batch size is batch_size * total tpu cores. - -.. note:: No need to add a DistributedDataSampler, Lightning automatically does it for you. - -This parameter can be either 1 or 8. - -.. testcode:: - - # your_trainer_file.py - - # default used by the Trainer (ie: train on CPU) - trainer = Trainer(tpu_cores=None) - - # int: train on a single core - trainer = Trainer(tpu_cores=1) - - # list: train on a single selected core - trainer = Trainer(tpu_cores=[2]) - - # int: train on all cores few cores - trainer = Trainer(tpu_cores=8) - - # for 8+ cores must submit via xla script with - # a max of 8 cores specified. The XLA script - # will duplicate script onto each TPU in the POD - trainer = Trainer(tpu_cores=8) - -To train on more than 8 cores (ie: a POD), -submit this script using the xla_dist script. 
- -Example:: - - python -m torch_xla.distributed.xla_dist - --tpu=$TPU_POD_NAME - --conda-env=torch-xla-nightly - --env=XLA_USE_BF16=1 - -- python your_trainer_file.py - -overfit_batches -^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. -If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. - -Useful for quickly debugging or trying to overfit on purpose. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(overfit_batches=0.0) - - # use only 1% of the train set (and use the train set for val and test) - trainer = Trainer(overfit_batches=0.01) - - # overfit on 10 of the same batches - trainer = Trainer(overfit_batches=10) - -precision -^^^^^^^^^ - -.. raw:: html - - - -| - -Full precision (32), half precision (16). -Can be used on CPU, GPU or TPUs. - -If used on TPU will use torch.bfloat16 but tensor printing -will still show torch.float32. - -.. testcode:: - :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE - - # default used by the Trainer - trainer = Trainer(precision=32) - - # 16-bit precision - trainer = Trainer(precision=16) - -Example:: - - # one day - trainer = Trainer(precision=8|4|2) - -process_position -^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Orders the progress bar. Useful when running multiple trainers on the same node. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(process_position=0) - -Note: - This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. - -profiler -^^^^^^^^ - -.. raw:: html - - - -| - -To profile individual steps during training and assist in identifying bottlenecks. - -See the :ref:`profiler documentation `. for more details. - -.. 
testcode:: - - from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler - - # default used by the Trainer - trainer = Trainer(profiler=None) - - # to profile standard training events - trainer = Trainer(profiler=True) - - # equivalent to profiler=True - trainer = Trainer(profiler=SimpleProfiler()) - - # advanced profiler for function-level stats - trainer = Trainer(profiler=AdvancedProfiler()) - -progress_bar_refresh_rate -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -How often to refresh progress bar (in steps). -In notebooks, faster refresh rates (lower number) is known to crash them -because of their screen refresh rates, so raise it to 50 or more. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(progress_bar_refresh_rate=1) - - # disable progress bar - trainer = Trainer(progress_bar_refresh_rate=0) - -Note: - This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. - -reload_dataloaders_every_epoch -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Set to True to reload dataloaders every epoch. - -.. code-block:: python - - # if False (default) - train_loader = model.train_dataloader() - for epoch in epochs: - for batch in train_loader: - ... - - # if True - for epoch in epochs: - train_loader = model.train_dataloader() - for batch in train_loader: - -replace_sampler_ddp -^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Enables auto adding of distributed sampler. By default it will add ``shuffle=True`` -for train sampler and ``shuffle=False`` for val/test sampler. If you want to customize -it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. -If ``replace_sampler_ddp=True`` and a distributed sampler was already added, -Lightning will not replace the existing one. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(replace_sampler_ddp=True) - -By setting to False, you have to add your own distributed sampler: - -.. 
code-block:: python - - # default used by the Trainer - sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) - dataloader = DataLoader(dataset, batch_size=32, sampler=sampler) - -resume_from_checkpoint -^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -To resume training from a specific checkpoint pass in the path here. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(resume_from_checkpoint=None) - - # resume from a specific checkpoint - trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt') - -log_every_n_steps -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - - -How often to add logging rows (does not write to disk) - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(log_every_n_steps=50) - -See Also: - - :ref:`logging` - - -sync_batchnorm -^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Enable synchronization between batchnorm layers across all GPUs. - -.. testcode:: - - trainer = Trainer(sync_batchnorm=True) - -track_grad_norm -^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -- no tracking (-1) -- Otherwise tracks that norm (2 for 2-norm) - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(track_grad_norm=-1) - - # track the 2-norm - trainer = Trainer(track_grad_norm=2) - -limit_train_batches -^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -How much of training dataset to check. -Useful when debugging or testing something that happens at the end of an epoch. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) - -Example:: - - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) - - # run through only 25% of the training set each epoch - trainer = Trainer(limit_train_batches=0.25) - - # run through only 10 batches of the training set each epoch - trainer = Trainer(limit_train_batches=10) - -truncated_bptt_steps -^^^^^^^^^^^^^^^^^^^^ - -.. 
raw:: html - - - -| - -Truncated backprop performs backprop every k steps of -a much longer sequence. - -If this is enabled, your batches will automatically get truncated -and the trainer will apply Truncated Backprop to it. - -(`Williams et al. "An efficient gradient-based algorithm for on-line training of -recurrent network trajectories." -`_) - -.. testcode:: - - # default used by the Trainer (ie: disabled) - trainer = Trainer(truncated_bptt_steps=None) - - # backprop every 5 steps in a batch - trainer = Trainer(truncated_bptt_steps=5) - -.. note:: Make sure your batches have a sequence dimension. - -Lightning takes care to split your batch along the time-dimension. - -.. code-block:: python - - # we use the second as the time dimension - # (batch, time, ...) - sub_batch = batch[0, 0:t, ...] - -Using this feature requires updating your LightningModule's -:meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg -with the hidden - -.. code-block:: python - - # Truncated back-propagation through time - def training_step(self, batch, batch_idx, hiddens): - # hiddens are the hiddens from the previous truncated backprop step - out, hiddens = self.lstm(data, hiddens) - - return { - "loss": ..., - "hiddens": hiddens # remember to detach() this - } - -To modify how the batch is split, -override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`: - -.. testcode:: - - class LitMNIST(LightningModule): - def tbptt_split_batch(self, batch, split_size): - # do your own splitting on the batch - return splits - -val_check_interval -^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -How often within one training epoch to check the validation set. -Can specify as float or int. - -- use (float) to check within a training epoch -- use (int) to check every n steps (batches) - -..
testcode:: - - # default used by the Trainer - trainer = Trainer(val_check_interval=1.0) - - # check validation set 4 times during a training epoch - trainer = Trainer(val_check_interval=0.25) - - # check validation set every 1000 training batches - # use this when using iterableDataset and your dataset has no length - # (ie: production cases with streaming data) - trainer = Trainer(val_check_interval=1000) - - -weights_save_path -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Directory of where to save weights if specified. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(weights_save_path=os.getcwd()) - - # save to your custom path - trainer = Trainer(weights_save_path='my/path') - -Example:: - - # if checkpoint callback used, then overrides the weights path - # **NOTE: this saves weights to some/path NOT my/path - checkpoint = ModelCheckpoint(dirpath='some/path') - trainer = Trainer( - checkpoint_callback=checkpoint, - weights_save_path='my/path' - ) - -weights_summary -^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Prints a summary of the weights when training begins. -Options: 'full', 'top', None. - -.. testcode:: - - # default used by the Trainer (ie: print summary of top level modules) - trainer = Trainer(weights_summary='top') - - # print full summary of all modules and submodules - trainer = Trainer(weights_summary='full') - - # don't print a summary - trainer = Trainer(weights_summary=None) - -Trainer class API ------------------ """ diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 46f2a32c0a8f1..8f3885c20fcdc 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -209,3 +209,17 @@ def on_load_checkpoint(self, checkpoint): if state: state = deepcopy(state) callback.on_load_checkpoint(state) + + def on_after_backward(self): + """ + Called after loss.backward() and before optimizers do anything. 
+ """ + for callback in self.callbacks: + callback.on_after_backward(self, self.get_model()) + + def on_before_zero_grad(self, optimizer): + """ + Called after optimizer.step() and before optimizer.zero_grad(). + """ + for callback in self.callbacks: + callback.on_before_zero_grad(self, self.get_model(), optimizer) diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 187ff237056a2..c9ef4ae32be77 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import os + +from typing import Union, Optional + from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, ProgressBarBase, ProgressBar from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -44,25 +47,31 @@ def on_trainer_init( # configure checkpoint callback # it is important that this is the last callback to run # pass through the required args to figure out defaults - checkpoint_callback = self.init_default_checkpoint_callback(checkpoint_callback) - if checkpoint_callback: - self.trainer.callbacks.append(checkpoint_callback) - - # TODO refactor codebase (tests) to not directly reach into these callbacks - self.trainer.checkpoint_callback = checkpoint_callback + self.configure_checkpoint_callbacks(checkpoint_callback) # init progress bar self.trainer._progress_bar_callback = self.configure_progress_bar( progress_bar_refresh_rate, process_position ) - def init_default_checkpoint_callback(self, checkpoint_callback): - if checkpoint_callback is True: - checkpoint_callback = ModelCheckpoint(dirpath=None, filename=None) - elif checkpoint_callback is False: - checkpoint_callback = None + def configure_checkpoint_callbacks(self, 
checkpoint_callback: Union[ModelCheckpoint, bool]): + if isinstance(checkpoint_callback, ModelCheckpoint): + # TODO: deprecated, remove this block in v1.3.0 + rank_zero_warn( + "Passing a ModelCheckpoint instance to Trainer(checkpoint_callbacks=...)" + " is deprecated since v1.1 and will no longer be supported in v1.3.", + DeprecationWarning + ) + self.trainer.callbacks.append(checkpoint_callback) + + if self._trainer_has_checkpoint_callbacks() and checkpoint_callback is False: + raise MisconfigurationException( + "Trainer was configured with checkpoint_callback=False but found ModelCheckpoint" + " in callbacks list." + ) - return checkpoint_callback + if not self._trainer_has_checkpoint_callbacks() and checkpoint_callback is True: + self.trainer.callbacks.append(ModelCheckpoint(dirpath=None, filename=None)) def configure_progress_bar(self, refresh_rate=1, process_position=0): progress_bars = [c for c in self.trainer.callbacks if isinstance(c, ProgressBarBase)] @@ -83,3 +92,6 @@ def configure_progress_bar(self, refresh_rate=1, process_position=0): progress_bar_callback = None return progress_bar_callback + + def _trainer_has_checkpoint_callbacks(self): + return len(self.trainer.checkpoint_callbacks) > 0 diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 1bbdb4abac282..3b44ce96c02ad 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -54,10 +54,10 @@ def __init__(self, trainer): def restore_weights(self, model: LightningModule): """ - We attempt to restore weights in this order: - 1. HPC weights. - 2. if no HPC weights restore checkpoint_path weights - 3. otherwise don't restore weights + Attempt to restore a checkpoint (e.g. weights) in this priority: + 1. from HPC weights + 2. from `resume_from_checkpoint` file + 3. 
don't restore """ # clear cache before restore if self.trainer.on_gpu: @@ -83,45 +83,50 @@ def restore_weights(self, model: LightningModule): def restore(self, checkpoint_path: str, on_gpu: bool): """ - Restore training state from checkpoint. + Load model/training states from the checkpoint file through file-read and state-restore. Also restores all training state like: - epoch - callbacks - schedulers - optimizer + In detail, check return value description of `dump_checkpoint` """ # if on_gpu: # checkpoint = torch.load(checkpoint_path) # else: # load on CPU first + # read a checkpoint dictionary object from the checkpoint file at `checkpoint_path` checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) + # restore states from the checkpoint dictionary object # load model state model = self.trainer.get_model() + # restore model and datamodule state + self.restore_model_state(model, checkpoint) + + if on_gpu: + model.cuda(self.trainer.root_gpu) + + # restore training state + self.restore_training_state(checkpoint) + + def restore_model_state(self, model: LightningModule, checkpoint) -> None: + """ + Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object + """ + # give the datamodule a chance to load something if self.trainer.datamodule is not None: self.trainer.datamodule.on_load_checkpoint(checkpoint) - # give model a chance to load something + # give model a chance to restore something model.on_load_checkpoint(checkpoint) - # load the state_dict on the model automatically + # restore the state_dict on the model model.load_state_dict(checkpoint['state_dict']) - if on_gpu: - model.cuda(self.trainer.root_gpu) - - # restore amp scaling - if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: - self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) - elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: - 
amp.load_state_dict(checkpoint['amp_scaling_state']) - - # load training state (affects trainer only) - self.restore_training_state(checkpoint) - def restore_training_state(self, checkpoint): """ Restore trainer state. @@ -129,6 +134,7 @@ def restore_training_state(self, checkpoint): :param checkpoint: :return: """ + # validation if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint: raise KeyError( 'Trying to restore training state but checkpoint contains only the model.' @@ -143,7 +149,13 @@ def restore_training_state(self, checkpoint): " where `model.ckpt` is your checkpoint file." ) - # load callback states + # restore amp scaling + if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: + self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) + + # restore callback states self.trainer.on_load_checkpoint(checkpoint) self.trainer.global_step = checkpoint['global_step'] @@ -234,30 +246,45 @@ def hpc_save(self, folderpath: str, logger): if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] rank_zero_warn( - 'warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}' + 'warning, `hyper_parameters` dropped from checkpoint.' f' An attribute is not picklable {err}' ) atomic_save(checkpoint, filepath) return filepath def dump_checkpoint(self, weights_only: bool = False) -> dict: - """Creating model checkpoint. + """Creating a model checkpoint dictionary object from various component states. 
Args: weights_only: saving model weights only Return: - structured dictionary + structured dictionary: { + 'epoch': training epoch + 'global_step': training global step + 'pytorch-lightning_version': PyTorch Lightning's version + 'callbacks': "callback specific state"[] # if not weights_only + 'optimizer_states': "PT optim's state_dict"[] # if not weights_only + 'lr_schedulers': "PT sched's state_dict"[] # if not weights_only + 'native_amp_scaling_state': PT amp's state_dict # if not weights_only and use native amp + 'amp_scaling_state': Apex's state_dict # if not weights_only and use apex amp + 'state_dict': Model's state_dict (e.g. network weights) + CHECKPOINT_HYPER_PARAMS_NAME: + CHECKPOINT_HYPER_PARAMS_KEY: + CHECKPOINT_HYPER_PARAMS_TYPE: + something_cool_i_want_to_save: anything you define through model.on_save_checkpoint + LightningDataModule.__class__.__name__: pl DataModule's state + } """ + # dump epoch/global_step/pytorch-lightning_version current_epoch = self.trainer.current_epoch global_step = self.trainer.global_step has_reached_max_steps = self.trainer.max_steps and self.trainer.max_steps <= global_step global_step += 1 - if self.has_trained: - if not has_reached_max_steps: - current_epoch += 1 + if not has_reached_max_steps: + current_epoch += 1 checkpoint = { 'epoch': current_epoch, @@ -267,37 +294,38 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: if not weights_only: - # save callbacks + # dump callbacks callback_states = self.trainer.on_save_checkpoint() checkpoint['callbacks'] = callback_states - # save optimizers + # dump optimizers optimizer_states = [] for i, optimizer in enumerate(self.trainer.optimizers): optimizer_states.append(optimizer.state_dict()) checkpoint['optimizer_states'] = optimizer_states - # save lr schedulers + # dump lr schedulers lr_schedulers = [] for scheduler in self.trainer.lr_schedulers: lr_schedulers.append(scheduler['scheduler'].state_dict()) checkpoint['lr_schedulers'] = lr_schedulers - # save 
native amp scaling + # dump amp scaling if self.trainer.amp_backend == AMPType.NATIVE and not self.trainer.use_tpu and self.trainer.scaler is not None: checkpoint['native_amp_scaling_state'] = self.trainer.scaler.state_dict() elif self.trainer.amp_backend == AMPType.APEX: checkpoint['amp_scaling_state'] = amp.state_dict() - # add the module_arguments and state_dict from the model + # add the hyper_parameters and state_dict from the model model = self.trainer.get_model() + # dump the module_arguments and state_dict from the model checkpoint['state_dict'] = model.state_dict() if model.hparams: if hasattr(model, '_hparams_name'): checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name - # add arguments to the checkpoint + # dump arguments if OMEGACONF_AVAILABLE: checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams if isinstance(model.hparams, Container): @@ -305,7 +333,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: else: checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams) - # give the model a chance to add a few things + # give the model a chance to dump a few things model.on_save_checkpoint(checkpoint) if self.trainer.datamodule is not None: self.trainer.datamodule.on_save_checkpoint(checkpoint) @@ -316,19 +344,13 @@ def hpc_load(self, folderpath, on_gpu): filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath)) # load on CPU first - checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) + checkpoint = pl_load(filepath, map_location=lambda storage, loc: storage) # load model state model = self.trainer.get_model() - # load the state_dict on the model automatically - model.load_state_dict(checkpoint['state_dict']) - - # restore amp scaling - if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: - self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) - elif 
self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: - amp.load_state_dict(checkpoint['amp_scaling_state']) + # restore states from 'PyTorch-Lightning checkpoint' dictionary object + self.restore_model_state(model, checkpoint) if self.trainer.root_gpu is not None: model.cuda(self.trainer.root_gpu) @@ -357,16 +379,23 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'): return max(ckpt_vs) def save_checkpoint(self, filepath, weights_only: bool = False): + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + # dump states as a checkpoint dictionary object checkpoint = self.dump_checkpoint(weights_only) if self.trainer.is_global_zero: - # do the actual save + # write the checkpoint dictionary on the file try: atomic_save(checkpoint, filepath) except AttributeError as err: if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] rank_zero_warn( - 'Warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}' + 'Warning, `hyper_parameters` dropped from checkpoint.' 
f' An attribute is not picklable {err}' ) atomic_save(checkpoint, filepath) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/__init__.py b/pytorch_lightning/trainer/connectors/logger_connector/__init__.py new file mode 100644 index 0000000000000..4034840a09b97 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/__init__.py @@ -0,0 +1 @@ +from pytorch_lightning.trainer.connectors.logger_connector.logger_connector import LoggerConnector diff --git a/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py new file mode 100644 index 0000000000000..e9c33cea70b8a --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py @@ -0,0 +1,220 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class CallbackHookNameValidator: + + @staticmethod + def check_logging_in_callbacks(current_hook_fx_name: str = None, on_step: bool = None, + on_epoch: bool = None) -> None: + if current_hook_fx_name is None: + return + + internal_func = getattr(CallbackHookNameValidator, f"_{current_hook_fx_name}_log", None) + + if internal_func is None: + return + + current_callback_hook_auth_args = internal_func() + + if current_callback_hook_auth_args is not None: + m = "{} function supports only {} in {}. Provided {}" + if on_step not in current_callback_hook_auth_args["on_step"]: + msg = m.format(current_hook_fx_name, "on_step", current_callback_hook_auth_args["on_step"], on_step) + raise MisconfigurationException(msg) + + if on_epoch not in current_callback_hook_auth_args["on_epoch"]: + msg = m.format(current_hook_fx_name, "on_epoch", current_callback_hook_auth_args["on_epoch"], on_epoch) + raise MisconfigurationException(msg) + else: + raise MisconfigurationException( + f"{current_hook_fx_name} function doesn't support logging using self.log() yet." 
+ ) + + @staticmethod + def _setup_log(): + """Called when fit or test begins""" + return None + + @staticmethod + def _teardown_log(): + """Called at the end of fit and test""" + return None + + @staticmethod + def _on_init_start_log(): + """Called when the trainer initialization begins, model has not yet been set.""" + return None + + @staticmethod + def _on_init_end_log(): + """Called when the trainer initialization ends, model has not yet been set.""" + return None + + @staticmethod + def _on_fit_start_log(): + """Called when the trainer initialization begins, model has not yet been set.""" + return None + + @staticmethod + def _on_fit_end_log(): + """Called when the trainer initialization begins, model has not yet been set.""" + return None + + @staticmethod + def _on_sanity_check_start_log(): + """Called when the validation sanity check starts.""" + return None + + @staticmethod + def _on_sanity_check_end_log(): + """Called when the validation sanity check ends.""" + return None + + @staticmethod + def _on_train_epoch_start_log(): + """Called when the epoch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_train_epoch_end_log(): + """Called when the epoch ends.""" + return {"on_step": [False], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_epoch_start_log(): + """Called when the epoch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_epoch_end_log(): + """Called when the epoch ends.""" + return {"on_step": [False], "on_epoch": [False, True]} + + @staticmethod + def _on_test_epoch_start_log(): + """Called when the epoch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_test_epoch_end_log(): + """Called when the epoch ends.""" + return {"on_step": [False], "on_epoch": [False, True]} + + @staticmethod + def _on_epoch_start_log(): + """Called when the epoch begins.""" + return 
{"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_epoch_end_log(): + """Called when the epoch ends.""" + return {"on_step": [False], "on_epoch": [False, True]} + + @staticmethod + def _on_train_start_log(): + """Called when the train begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_train_end_log(): + """Called when the train ends.""" + return None + + @staticmethod + def _on_pretrain_routine_start_log(): + """Called when the train begins.""" + return None + + @staticmethod + def _on_pretrain_routine_end_log(): + """Called when the train ends.""" + return None + + @staticmethod + def _on_batch_start_log(): + """Called when the training batch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_batch_end_log(): + """Called when the training batch ends.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_train_batch_start_log(): + """Called when the training batch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_train_batch_end_log(): + """Called when the training batch ends.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_batch_start_log(): + """Called when the validation batch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_batch_end_log(): + """Called when the validation batch ends.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_test_batch_start_log(): + """Called when the test batch begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_test_batch_end_log(): + """Called when the test batch ends.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_start_log(): + """Called when 
the validation loop begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_validation_end_log(): + """Called when the validation loop ends.""" + return None + + @staticmethod + def _on_test_start_log(): + """Called when the test begins.""" + return {"on_step": [False, True], "on_epoch": [False, True]} + + @staticmethod + def _on_test_end_log(): + """Called when the test ends.""" + return None + + @staticmethod + def _on_keyboard_interrupt_log(): + """Called when the training is interrupted by KeyboardInterrupt.""" + return None + + @staticmethod + def _on_save_checkpoint_log(): + """Called when saving a model checkpoint.""" + return None + + @staticmethod + def _on_load_checkpoint_log(): + """Called when loading a model checkpoint.""" + return None diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py new file mode 100644 index 0000000000000..9f8d029d9bef4 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -0,0 +1,618 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from collections import defaultdict, ChainMap +from enum import Enum +from typing import Union, Tuple, Any, Dict, Optional, List +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.core.step_result import Result + + +# used to map boolean to right LoggerStage values +class FrozenDict(dict): + def __init__(self, *args, **kwargs): + self._hash = None + super(FrozenDict, self).__init__(*args, **kwargs) + + def __hash__(self): + if self._hash is None: + self._hash = hash(tuple(sorted(self.items()))) # iteritems() on py2 + return self._hash + + def _immutable(self, *args, **kws): + raise TypeError('cannot change object - object is immutable') + + __setitem__ = _immutable + __delitem__ = _immutable + pop = _immutable + popitem = _immutable + clear = _immutable + update = _immutable + setdefault = _immutable + + +LOOKUP_TABLE = FrozenDict({"1": "test", "0": "validation", "True": "test", "False": "validation"}) + + +class LoggerStages(Enum): + TRAIN = "train" + VAL = "validation" + TEST = "test" + + +class ResultStoreType(Enum): + INSIDE_BATCH_TRAIN_LOOP = "inside_batch_train_loop" + OUTSIDE_BATCH_TRAIN_LOOP = "outside_batch_train_loop" + + +class HookResultStore: + """ + This class is defined for internal usage. + It holds all metrics logged using the self.log function + in the scope of ModelHooks or Callback functions. + + We need to differiante 3 different scenarios: + - (1): We are outside of a batch loop + * It means no dataloader_idx, no optimizer idx, etc.. + - (2): We are inside the training batch loop + * We have an optimizer idx and split idx to track + - (3): We are inside the evaluation loop + * We have a dataloader_idx to track + + The data store `Result` objects for those 3 scenarios in `self._internals`. 
+ + (1): self._internals = {"dataloader_idx": [Result(), ..., Result()]} + * dataloader_idx not being defined, it is set to 0 b default + (2): self._internals = {"dataloader_idx": + {"optimizer_idx": + {"batch_idx": + [Result(), Result()] + } + } + } + (3): Same as (1) for simplicity + + Those data structures enables us to reduce properly Result object when batch loop is finished. + """ + def __init__(self, fx_name): + self._fx_name = fx_name + self._internals = {} + self._internals_reduced = {} + self._internal_type = None + self.has_reduced = False + self._latest_ref = {} + + @property + def has_several_dataloaders(self) -> bool: + return self.num_dataloaders > 1 + + @property + def num_dataloaders(self) -> int: + _inter = self._internals_reduced if self.has_reduced else self._internals + return len(_inter) + + def check_dataloader_idx(self, result: Result) -> bool: + random_key = [*result.keys()][-1] + add_dataloader_idx = result["meta"][random_key]["dataloader_idx"] is not None + return add_dataloader_idx + + def get_lastest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict: + results = {} + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + return results + + def run_lastest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]: + """ + This function used cache_ref and cache_result to optimize loading metrics + + Context: As we update the logger_connector metrics on every `self.log` call, + and it can be pretty time consuming, especially when logging outside batch loop. 
+ + HookResultStore keeps track of its latest added result object, + and cache its pbar and log metrics if already called on, + """ + results = [] + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + latest_result = self._latest_ref[dl_idx] + result = self.get_lastest_from_func_name(latest_result, func_name, *args, **kwargs) + results.append(result) + return results + + def get_batch_pbar_metrics(self, *args, **kwargs): + return self.run_lastest_batch_metrics_with_func_name("get_batch_pbar_metrics", + *args, + **kwargs) + + def get_batch_log_metrics(self, *args, **kwargs): + return self.run_lastest_batch_metrics_with_func_name("get_batch_log_metrics", + *args, + **kwargs) + + def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> None: + if isinstance(opt_metric, Result): + func = getattr(opt_metric, func_name) + metrics_to_log = func( + *args, + add_dataloader_idx=self.has_several_dataloaders, + **kwargs) + results.append(metrics_to_log) + else: + raise Exception("The provided opt_metric should be a Result Object. 
Something is wrong") + + def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]: + results = [] + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + opt_metrics = self._internals_reduced[dl_idx] + if isinstance(opt_metrics, defaultdict): + for opt_metric in opt_metrics.values(): + self.run_epoch_func(results, opt_metric, func_name, *args, **kwargs) + else: + self.run_epoch_func(results, opt_metrics, func_name, *args, **kwargs) + return results + + def get_epoch_pbar_metrics(self, *args, **kwargs) -> List[Dict]: + return self.get_epoch_from_func_name("get_epoch_pbar_metrics") + + def get_epoch_log_metrics(self, *args, **kwargs) -> List[Dict]: + return self.get_epoch_from_func_name("get_epoch_log_metrics") + + def get_forked_metrics(self, *args, **kwargs) -> List[Dict]: + return self.get_epoch_from_func_name("get_forked_metrics") + + @staticmethod + def _append_to_structure(primary_dict, opt_idx, batch_idx, result) -> None: + if opt_idx not in primary_dict: + primary_dict[opt_idx] = {} + + if batch_idx not in primary_dict[opt_idx]: + primary_dict[opt_idx][batch_idx] = [] + + primary_dict[opt_idx][batch_idx].append(result) + + def append(self, result, dataloader_idx=None, extra_info: dict = {}) -> None: + + assert isinstance(result, Result) + + if dataloader_idx is None: + dataloader_idx = 0 + + primary_key = f"{dataloader_idx}" + + # [dataloader_idx][optimizer_idx][training_step_idx] is a list + if len(extra_info) > 0: + self._internal_type = ResultStoreType.INSIDE_BATCH_TRAIN_LOOP + # initialize dictionary + if primary_key not in self._internals: + self._internals[primary_key] = {} + self._internals_reduced[primary_key] = defaultdict(dict) + + # extract infos + opt_idx = str(extra_info["opt_idx"]) + batch_idx = str(extra_info["batch_idx"]) + + self._append_to_structure(self._internals[primary_key], opt_idx, batch_idx, result) + + self._latest_ref[primary_key] = result + + # [dataloader_idx] is a list + else: + 
self._internal_type = ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP + if primary_key not in self._internals: + self._internals[primary_key] = [] + self._internals[primary_key].append(result) + + self._latest_ref[primary_key] = result + + def auto_reduce_results_on_epoch_end(self) -> None: + """ + This function is called to reduce `self._internals` Result object. + The reduced Result object will be saved into `self._internals_reduced` + The `self._internals` stored Result objects will be deleted to save memory. + """ + if not self.has_reduced: + epoch_log_metrics = {} + epoch_progress_bar_metrics = {} + + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + epoch_metrics = self._internals[dl_idx] + + if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: + + num_opt_idx = len(self._internals[dl_idx]) - 1 + + # Make sure we didn't create key + assert num_opt_idx >= 0 + + for opt_idx in range(num_opt_idx + 1): + opt_idx = str(opt_idx) + # TODO: Figure out to reduce memory + # TODO: How to start training in middle of epoch + opt_outputs = epoch_metrics[opt_idx] + + num_batch_idx = len(self._internals[dl_idx][str(num_opt_idx)]) - 1 + assert num_batch_idx >= 0 + batch_indexes = self._internals[dl_idx][str(num_opt_idx)].keys() + + # reduce across time first + time_reduced_outputs = [] + for batch_idx in batch_indexes: + batch_idx = str(batch_idx) + tbptt_outs = opt_outputs[str(batch_idx)] + tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) + if len(tbptt_outs) > 1: + time_reduced_outputs.append(tbptt_outs) + + if len(time_reduced_outputs) == 0: + continue + + # reduce across training steps + opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) + + # with manual opt need 1 + metrics because meta is always there + if opt_outputs.minimize is not None: + opt_outputs.minimize = opt_outputs.minimize.mean() + + self._internals_reduced[dl_idx][str(opt_idx)] = opt_outputs + + # free memory + del 
self._internals[dl_idx][opt_idx] + else: + # no need to reduce as called only once + if len(epoch_metrics) == 1: + reduced_epoch_metrics = epoch_metrics[0] + else: + reduced_epoch_metrics = epoch_metrics[0].__class__.reduce_on_epoch_end(epoch_metrics) + + self._internals_reduced[dl_idx] = reduced_epoch_metrics + + # free memory + del self._internals[dl_idx] + + self.has_reduced = True + + def __getitem__(self, key: str) -> Any: + try: + if key in self._internals: + return self._internals[key] + return self[key] + except KeyError: + return None + + def __repr__(self): + return self._internals.__repr__() + + +class EpochResultStore: + """ + This class is defined for internal usage. + It holds all metrics logged using the self.log function using `HookResultStore` object. + The internal datastructure is as follow: + self._internals = {"fx_name_0": HookResultStore(), ..., "fx_name_n": HookResultStore()} + Pseudo Code Example: + ``` + model._current_fx_name = 'something' + model._results = Result() + model.log('a', ...) 
+ epoch_result_store.cache_result() + ``` + """ + def __init__(self, trainer, stage): + self.trainer = trainer + self._stage = stage + self.reset() + + def __getitem__(self, key: str) -> Any: + try: + if key in self._internals: + return self._internals[key] + return None + except KeyError: + return None + + @property + def has_split_and_opt_idx(self): + """ + This function informs if we are running within training batch loop + """ + if self._split_idx is not None and self._opt_idx is not None: + return True + return False + + @property + def extra_info(self): + """ + This function provides necessary parameters to properly configure HookResultStore obj + """ + return {"batch_idx": self.trainer.batch_idx, + "split_idx": self._split_idx, + "opt_idx": self._opt_idx} + + def reset_model(self): + """ + This function is used to reset model state at the end of the capture + """ + model_ref = self.trainer.get_model() + model_ref._results = Result() + model_ref._current_hook_fx_name = None + model_ref._current_fx_name = '' + + def current_model_info(self): + """ + This function is used to extract + information related to current function scoping `self.log` call. 
+ """ + model_ref = self.trainer.get_model() + # extract hook information + fx_name = model_ref._current_hook_fx_name + if fx_name is None: + fx_name = model_ref._current_fx_name + dataloader_idx = model_ref._current_dataloader_idx + return fx_name, dataloader_idx + + def cache_result(self) -> None: + """ + This function is called after every hook + and store the result object + """ + model_ref = self.trainer.get_model() + + # extract hook results + hook_result = model_ref._results + + # extract model information + fx_name, dataloader_idx = self.current_model_info() + + # add only if anything as been logged + # default len is 1 due to _internals + if len(hook_result) > 1: + + if fx_name not in self._internals: + self._internals[fx_name] = HookResultStore(fx_name) + + extra_info = {} + if self.has_split_and_opt_idx: + extra_info = self.extra_info + + # attach capture batch_size + Result.attach_batch_size(self._batch_size, hook_result) + + hook_result.detach() + if self.trainer.move_metrics_to_cpu: + hook_result.cpu() + + self._internals[fx_name].append( + hook_result, + dataloader_idx=dataloader_idx, + extra_info=extra_info) + + # update logged_metrics, progress_bar_metrics, callback_metrics + self.update_logger_connector(fx_name) + + # reset _results, fx_name + self.reset_model() + + def update_logger_connector(self, fx_name: str = None) -> None: + """ + This function is called every time we capture a hook + It automatically updates the logger_connector followings: + - progress_bar_metrics with pbar_metrics + - logged_metrics with log_metrics + - callback_metrics with progress_bar_metrics + logged_metrics + """ + + logger_connector = self.trainer.logger_connector + + callback_metrics = {} + + if not self._has_batch_loop_finished: + # get pbar + batch_pbar_metrics = self.get_latest_batch_pbar_metrics() + logger_connector.add_progress_bar_metrics(batch_pbar_metrics) + + if self._stage in LoggerStages.TRAIN.value: + # Only log and add to callback epoch step during 
evaluation, test. + batch_log_metrics = self.get_latest_batch_log_metrics() + logger_connector.logged_metrics.update(batch_log_metrics) + + callback_metrics.update(batch_pbar_metrics) + callback_metrics.update(batch_log_metrics) + else: + epoch_dict = {"epoch": self.trainer.current_epoch} + + # get pbar + epoch_pbar_metrics = self.get_epoch_pbar_metrics() + logger_connector.add_progress_bar_metrics(epoch_pbar_metrics) + + # get logged_metrics + epoch_log_metrics = self.get_epoch_log_metrics() + logger_connector.logged_metrics.update(epoch_log_metrics) + logger_connector.logged_metrics.update(epoch_dict) + + # get forked_metrics + forked_metrics = self.get_forked_metrics() + + callback_metrics.update(epoch_pbar_metrics) + callback_metrics.update(epoch_log_metrics) + callback_metrics.update(forked_metrics) + + # update callback_metrics + logger_connector.callback_metrics.update(callback_metrics) + logger_connector.callback_metrics.pop("epoch", None) + + def run_batch_from_func_name(self, func_name) -> Dict: + results = [] + for fx_name, hook_result in self._internals.items(): + func = getattr(hook_result, func_name) + results.append(func(include_forked_originals=False)) + return dict(ChainMap(*sum(results, []))) + + def get_latest_batch_log_metrics(self) -> Dict: + batch_log_metrics = self.run_batch_from_func_name("get_batch_log_metrics") + batch_log_metrics.update(self.legacy_batch_log_metrics) + return batch_log_metrics + + def get_latest_batch_pbar_metrics(self) -> Dict: + batch_pbar_metrics = self.run_batch_from_func_name("get_batch_pbar_metrics") + batch_pbar_metrics.update(self.legacy_batch_pbar_metrics) + return batch_pbar_metrics + + @property + def has_reduced(self) -> bool: + hook_results = self._internals.values() + return len(hook_results) == sum([h.has_reduced for h in hook_results]) + + def auto_reduce_results_on_epoch_end(self) -> None: + if not self.has_reduced: + for fx_name, hook_result in self._internals.items(): + 
hook_result.auto_reduce_results_on_epoch_end() + + @property + def has_batch_loop_finished(self) -> bool: + return self._has_batch_loop_finished + + @has_batch_loop_finished.setter + def has_batch_loop_finished(self, has_batch_loop_finished): + if has_batch_loop_finished: + # If batch loop has finished, reduce metrics + self.auto_reduce_results_on_epoch_end() + + # batch_size should be none as we finished batch loop + self._batch_size = None + + self._has_batch_loop_finished = has_batch_loop_finished + self.update_logger_connector() + + def run_epoch_by_func_name(self, func_name) -> Dict: + if not self.has_reduced: + self.auto_reduce_results_on_epoch_end() + results = [] + for fx_name, hook_result in self._internals.items(): + func = getattr(hook_result, func_name) + results.append(func()) + return dict(ChainMap(*sum(results, []))) + + def get_epoch_pbar_metrics(self) -> Dict: + return self.run_epoch_by_func_name("get_epoch_pbar_metrics") + + def get_epoch_log_metrics(self) -> Dict: + return self.run_epoch_by_func_name("get_epoch_log_metrics") + + def get_forked_metrics(self) -> Dict: + return self.run_epoch_by_func_name("get_forked_metrics") + + def reset(self): + self._internals = {} + self._dataloader_idx: Union[int, None] = None + self._split_idx: Union[int, None] = None + self._opt_idx: Union[int, None] = None + self._batch_size: Union[int, None] = None + self._has_batch_loop_finished = False + self.legacy_batch_log_metrics = {} + self.legacy_batch_pbar_metrics = {} + + def __call__( + self, + fx_name: Optional[Union[str, int]] = None, + dl_idx: Optional[Union[str, int]] = None, + opt_idx: Optional[Union[str, int]] = None, + batch_idx: Optional[Union[str, int]] = None, + split_idx: Optional[Union[str, int]] = None, + reduced: bool = False, + ): + """ + This function is an helper to access stored data + + It access data from the HookResultStore. 
Please, + check its data structure for better understanding + + Data can be accessed with the following chains: + + IF REDUCED: + * IF accessing a fx_name defined in batch training loop: + fx_name -> dl_idx -> opt_idx -> batch_idx -> split_idx + * ELSE fx_name -> dl_idx -> batch_idx + ELSE: + * IF accessing a fx_name defined in batch training loop: + fx_name -> dl_idx -> opt_idx + * ELSE fx_name -> dl_idx + + Note: + As soon as a param is None, it breaks the chain and returns associated stored data. + + Example:: + + result: Result = self(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True) + result['train_loss_epoch'] # aggregated train_loss over one epoch. + + Args: + + fx_name: Hook name from ModelHooks or Callback. Example: `training_step` + + dl_idx: Dataloader idx in short. It starts from 0 to num_dataloaders - 1 + + opt_idx: Optimizer idx in short. It starts from 0 to num_optimizers - 1 + + batch_idx: Index of batch idx seen during batch training or evaluation. + Works only with reduced=False + + split_idx: Index of split idx in training loop when ttbt is used. + + reduced: Data are being aggregated on on_epoch_end. + Indicates if we want to access aggregated Result or not. 
+ """ + + hook_result = self[str(fx_name)] + + dl_idx = str(dl_idx) if dl_idx is not None else None + opt_idx = str(opt_idx) if opt_idx is not None else None + batch_idx = str(batch_idx) if batch_idx is not None else None + split_idx = int(split_idx) if split_idx is not None else None + + internal_type = hook_result._internal_type + + if reduced: + result = hook_result._internals_reduced + else: + result = hook_result._internals + + if internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: + if not reduced: + if dl_idx is not None: + result = result[dl_idx] + if opt_idx is not None: + result = result[opt_idx] + if batch_idx is not None: + result = result[batch_idx] + if split_idx is not None: + result = result[split_idx] + else: + if dl_idx is not None: + result = result[dl_idx] + if opt_idx is not None: + result = result[opt_idx] + else: + if dl_idx is not None: + result = result[dl_idx] + if batch_idx and not reduced: + result = result[batch_idx] + + return result + + def __repr__(self): + return f"{self.__class__.__name__}(stage={self._stage}, internals={self._internals})" diff --git a/pytorch_lightning/trainer/connectors/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py similarity index 68% rename from pytorch_lightning/trainer/connectors/logger_connector.py rename to pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 893eab5a16a3d..6a6a3229b8061 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from pprint import pprint +from typing import Iterable, Union, cast +from copy import deepcopy +from collections import ChainMap import torch from pytorch_lightning.core import memory from pytorch_lightning.loggers import TensorBoardLogger, LoggerCollection @@ -19,10 +23,12 @@ from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pprint import pprint -from typing import Iterable -from copy import deepcopy -from collections import ChainMap +from pytorch_lightning.trainer.connectors.logger_connector.callback_hook_validator import CallbackHookNameValidator +from pytorch_lightning.trainer.connectors.logger_connector.epoch_result_store import ( + EpochResultStore, + LoggerStages, + LOOKUP_TABLE +) class LoggerConnector: @@ -33,8 +39,61 @@ def __init__(self, trainer): self.logged_metrics = {} self.progress_bar_metrics = {} self.eval_loop_results = [] + self._stages = sorted([s.value for s in LoggerStages]) + self._cached_results = {stage: EpochResultStore(trainer, stage) for stage in self._stages} + self._callback_hook_validator = CallbackHookNameValidator() + self._current_stage = None + + @property + def cached_results(self) -> Union[EpochResultStore, None]: + return self._cached_results[self._current_stage] + + def set_stage(self, stage_or_testing: str, reset:bool = False) -> None: + self._current_stage = self._determine_stage(stage_or_testing) + if reset: + self.cached_results.reset() + + def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoch: bool = None) -> None: + self._callback_hook_validator.check_logging_in_callbacks(current_hook_fx_name=hook_fx_name, + on_step=on_step, + on_epoch=on_epoch) + + def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders): + # reset the result of the PL module + model = self.trainer.get_model() + 
model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None + + # track batch_size + self.cached_results._batch_size = Result.extract_batch_size(batch) + + def on_train_split_start(self, split_idx: int, opt_idx: int, split_batch) -> None: + self.cached_results._split_idx = split_idx + self.cached_results._opt_idx = opt_idx + self.cached_results._batch_size = Result.extract_batch_size(split_batch) + + def on_train_batch_end(self) -> None: + self.cached_results._split_idx = None + self.cached_results._opt_idx = None + self.cached_results._batch_size = None + + def _determine_stage(self, stage_or_testing: Union[str, bool]) -> str: + stage_or_testing = str(stage_or_testing) + stages = self._stages + if stage_or_testing in stages: + return stage_or_testing + if stage_or_testing in LOOKUP_TABLE: + # Acces using trainer.testing + return LOOKUP_TABLE[stage_or_testing] + raise MisconfigurationException( + f"Provide stage_or_testing {stage_or_testing} doesn't belong either to {stages}" + f" or {LOOKUP_TABLE.keys()}" + ) - def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): + def cache_logged_metrics(self) -> Union[EpochResultStore, None]: + if self._current_stage is not None: + self._cached_results[self._current_stage].cache_result() + + def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): # logging self.configure_logger(logger) # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders @@ -42,6 +101,18 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps + self.trainer.move_metrics_to_cpu = move_metrics_to_cpu + + @property + def should_flush_logs(self): + should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 + return should_flush or 
self.trainer.should_stop + + @property + def should_update_logs(self): + should_log_every_n_steps = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 + return should_log_every_n_steps or self.trainer.should_stop + def configure_logger(self, logger): if logger is True: version = os.environ.get('PL_EXP_VERSION', self.trainer.slurm_job_id) @@ -60,6 +131,53 @@ def configure_logger(self, logger): else: self.trainer.logger = logger + def cache_training_step_metrics(self, opt_closure_result): + """ + This function is responsible to update + logger_connector internals metrics holder based for depreceated logging + """ + using_results_obj = isinstance(opt_closure_result.training_step_output, Result) + + # temporary dict to collect metrics + logged_metrics_tmp = {} + pbar_metrics_tmp = {} + callback_metrics_tmp = {} + + if using_results_obj: + batch_log_metrics = opt_closure_result.training_step_output.get_batch_log_metrics( + include_forked_originals=False + ) + logged_metrics_tmp.update(batch_log_metrics) + + batch_pbar_metrics = opt_closure_result.training_step_output.get_batch_pbar_metrics( + include_forked_originals=False + ) + pbar_metrics_tmp.update(batch_pbar_metrics) + + forked_metrics = opt_closure_result.training_step_output.get_forked_metrics() + callback_metrics_tmp.update(forked_metrics) + callback_metrics_tmp.update(logged_metrics_tmp) + + else: + batch_log_metrics = opt_closure_result.training_step_output.log_metrics + logged_metrics_tmp.update(batch_log_metrics) + + callback_metrics = opt_closure_result.training_step_output.callback_metrics + callback_metrics_tmp.update(callback_metrics) + + batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end + pbar_metrics_tmp.update(batch_pbar_metrics) + + # track progress bar metrics + if len(pbar_metrics_tmp) > 0: + self.add_progress_bar_metrics(pbar_metrics_tmp) + + self.callback_metrics.update(callback_metrics_tmp) + + # save legacy log metrics + 
self.logged_metrics.update(logged_metrics_tmp) + self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp) + def log_metrics(self, metrics, grad_norm_dic, step=None): """Logs the metric dict passed in. If `step` parameter is None and `step` key is presented is metrics, @@ -110,11 +228,12 @@ def add_progress_bar_metrics(self, metrics): def on_evaluation_epoch_end(self, deprecated_eval_results, epoch_logs, using_eval_result, test_mode): self._track_callback_metrics(deprecated_eval_results, using_eval_result) - self._log_on_evaluation_epoch_end_metrics(epoch_logs) # TODO: deprecate parts of this for 1.0 (when removing results) self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode) + self._log_on_evaluation_epoch_end_metrics(epoch_logs) + # get the final loop results eval_loop_results = self._get_evaluate_epoch_results(test_mode) return eval_loop_results @@ -179,12 +298,16 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): continue reduced_epoch_metrics = dl_metrics[0].__class__.reduce_on_epoch_end(dl_metrics) - # make the keys 'k/dl' - reduced_epoch_metrics = self.__rename_keys_by_dataloader_idx(reduced_epoch_metrics, dl_idx, num_loaders) - # track the metrics logger_metrics = reduced_epoch_metrics.get_epoch_log_metrics() pbar_metrics = reduced_epoch_metrics.get_epoch_pbar_metrics() + forked_metrics = reduced_epoch_metrics.get_forked_metrics() + + # make the keys 'k/dl' + logger_metrics = self.__rename_keys_by_dataloader_idx(logger_metrics, dl_idx, num_loaders) + pbar_metrics = self.__rename_keys_by_dataloader_idx(pbar_metrics, dl_idx, num_loaders) + forked_metrics = self.__rename_keys_by_dataloader_idx(forked_metrics, dl_idx, num_loaders) + self.logged_metrics.update(logger_metrics) self.add_progress_bar_metrics(pbar_metrics) @@ -193,11 +316,10 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): self.callback_metrics.update(pbar_metrics) # forked metrics were dropped, enable them for callbacks - 
forked_metrics = reduced_epoch_metrics.get_forked_metrics() self.callback_metrics.update(forked_metrics) # track the final results for the dataloader - self.eval_loop_results.append(deepcopy(self.callback_metrics)) + self.add_to_eval_loop_results(dl_idx, num_loaders) # actually log if len(logger_metrics) > 0: @@ -208,6 +330,22 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): if len(metrics_to_log) > 0: self.log_metrics(metrics_to_log, {}) + def add_to_eval_loop_results(self, dl_idx, num_loaders): + callback_metrics = deepcopy(self.callback_metrics) + if num_loaders == 1: + if len(self.eval_loop_results) > 0: + self.eval_loop_results[0].update(callback_metrics) + else: + self.eval_loop_results.append(callback_metrics) + return + + for key in list(callback_metrics.keys()): + if "dataloader_idx" in key: + if f"dataloader_idx_{dl_idx}" not in key: + # remove dl_idx from self.callback_metrics not belonging to this dataset. + del callback_metrics[key] + self.eval_loop_results.append(callback_metrics) + def __rename_keys_by_dataloader_idx(self, metrics, dataloader_idx, num_loaders): if num_loaders == 1: return metrics @@ -229,6 +367,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result): else: self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics) else: + flat = {} if isinstance(eval_results, list): for eval_result in eval_results: # with a scalar return, auto set it to "val_loss" for callbacks @@ -255,6 +394,25 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) + def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): + # eval loop returns all metrics + dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} + + # add metrics to prog bar + 
self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) + + # log metrics + if len(log_metrics) > 0: + self.trainer.logger_connector.log_metrics(log_metrics, {}) + + # track metrics for callbacks (all prog bar, logged and callback metrics) + self.trainer.logger_connector.callback_metrics.update(callback_metrics) + self.trainer.logger_connector.callback_metrics.update(log_metrics) + self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics) + + if len(dataloader_result_metrics) > 0: + self.eval_loop_results.append(dataloader_result_metrics) + def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mode): if self.trainer.running_sanity_check: return @@ -265,6 +423,9 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod if not isinstance(eval_results, list): eval_results = [eval_results] + num_loaders: int = self.trainer.evaluation_loop.num_dataloaders + prog_bar_metrics, log_metrics, callback_metrics = {}, {}, {} + for result_idx, result in enumerate(eval_results): if isinstance(result, EvalResult): prog_bar_metrics = result.epoch_pbar_metrics @@ -277,26 +438,15 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod else: _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) - # eval loop returns all metrics - dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} - - # add metrics to prog bar - self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) + if num_loaders > 1: + self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics, callback_metrics) - # log metrics - if len(log_metrics) > 0: - self.trainer.logger_connector.log_metrics(log_metrics, {}) + if num_loaders == 1: + self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics, callback_metrics) - # track metrics for callbacks (all prog bar, logged and 
callback metrics) - self.trainer.logger_connector.callback_metrics.update(callback_metrics) - self.trainer.logger_connector.callback_metrics.update(log_metrics) - self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics) - - if len(dataloader_result_metrics) > 0: - self.eval_loop_results.append(dataloader_result_metrics) - - def on_train_epoch_end(self, epoch_output): - pass + def on_train_epoch_end(self): + # inform cached logger connector epoch finished + self.cached_results.has_batch_loop_finished = True def log_train_epoch_end_metrics(self, epoch_output, @@ -340,12 +490,10 @@ def log_train_epoch_end_metrics(self, # ------------------ if is_1_0_result: # lightning module hook - epoch_end_log_result = self.training_epoch_end(model, epoch_output, num_optimizers) + self.training_epoch_end(model, epoch_output, num_optimizers) # log/aggregate metrics automatically epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output) - epoch_log_metrics.update(epoch_end_log_result.get_epoch_log_metrics()) - epoch_progress_bar_metrics.update(epoch_end_log_result.get_epoch_pbar_metrics()) # TODO: deprecate 1.0 else: @@ -358,6 +506,14 @@ def log_train_epoch_end_metrics(self, ) epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics = out + # it will perform reduction over epoch and return log metrics + cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics() + cached_epoch_pbar_metrics = self.cached_results.get_epoch_pbar_metrics() + + # update + epoch_log_metrics.update(cached_epoch_log_metrics) + epoch_progress_bar_metrics.update(cached_epoch_pbar_metrics) + # -------------------------- # track results # -------------------------- @@ -374,15 +530,16 @@ def log_train_epoch_end_metrics(self, self.add_progress_bar_metrics(epoch_progress_bar_metrics) self.callback_metrics.update(epoch_progress_bar_metrics) + # reset epoch loop result for next epoch + self.cached_results.reset() + def 
training_epoch_end(self, model, epoch_output, num_optimizers): if not is_overridden('training_epoch_end', model=model): - return Result() + return # run training_epoch_end # refresh the result for custom logging at the epoch level model._current_fx_name = 'training_epoch_end' - model._results = Result() - epoch_output = self.__prepare_epoch_end_inputs(epoch_output) if num_optimizers == 1 or not self.trainer.train_loop.automatic_optimization: @@ -391,15 +548,11 @@ def training_epoch_end(self, model, epoch_output, num_optimizers): # lightningmodule hook epoch_output = model.training_epoch_end(epoch_output) - model._current_fx_name = '' - if epoch_output is not None: raise MisconfigurationException('training_epoch_end expects a return of None. ' 'HINT: remove the return statement in training_epoch_end') - - # user can ALSO log at the end of an epoch - new_epoch_end_logs = model._results - return new_epoch_end_logs + # capture logging + self.trainer.logger_connector.cache_logged_metrics() def __run_legacy_training_epoch_end( self, @@ -426,8 +579,12 @@ def __run_legacy_training_epoch_end( # run training_epoch_end # a list with a result per optimizer index + model._current_fx_name = 'training_epoch_end' epoch_output = model.training_epoch_end(epoch_output) + # capture logging + self.trainer.logger_connector.cache_logged_metrics() + if isinstance(epoch_output, Result): epoch_log_metrics = epoch_output.epoch_log_metrics epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics @@ -451,8 +608,7 @@ def __auto_reduce_results_on_epoch_end(self, epoch_output): for opt_outputs in epoch_output: # reduce across time first time_reduced_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) if len(tbptt_outs) > 1: time_reduced_outputs.append(tbptt_outs) @@ -463,7 +619,7 @@ def __auto_reduce_results_on_epoch_end(self, epoch_output): 
# reduce across training steps opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) - # with manual opt need 1+ metrics because meta is always there + # with manual opt need 1 + metrics because meta is always there if opt_outputs.minimize is not None: opt_outputs.minimize = opt_outputs.minimize.mean() epoch_log_metrics.update(opt_outputs.epoch_log_metrics) @@ -482,8 +638,7 @@ def __prepare_epoch_end_inputs(self, epoch_output): for opt_outputs in epoch_output: # gather across time first time_gathered_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: result = [] for x in tbptt_outs: out = x.extra @@ -511,8 +666,7 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): for opt_outputs in epoch_output: # gather across time first time_gathered_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: tbptt_outs = tbptt_outs[0].__class__.gather(tbptt_outs) time_gathered_outputs.append(tbptt_outs) @@ -525,12 +679,9 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): def log_train_step_metrics(self, batch_output): # when metrics should be logged - should_log_metrics = ( - (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 or self.trainer.should_stop - ) - if should_log_metrics or self.trainer.fast_dev_run: + if self.should_update_logs or self.trainer.fast_dev_run: # logs user requested information to logger - metrics = batch_output.batch_log_metrics + metrics = self.cached_results.get_latest_batch_log_metrics() grad_norm_dic = batch_output.grad_norm_dic if len(metrics) > 0 or len(grad_norm_dic) > 0: self.log_metrics(metrics, grad_norm_dic) diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py index 17aed23ab5b32..0f6686f1f83c7 100644 --- 
a/pytorch_lightning/trainer/connectors/profiler_connector.py +++ b/pytorch_lightning/trainer/connectors/profiler_connector.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -from pytorch_lightning.profiler import PassThroughProfiler, SimpleProfiler + +from typing import Union + +from pytorch_lightning.profiler import BaseProfiler, PassThroughProfiler, SimpleProfiler, AdvancedProfiler +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException class ProfilerConnector: @@ -19,8 +24,27 @@ class ProfilerConnector: def __init__(self, trainer): self.trainer = trainer - def on_trainer_init(self, profiler): - # configure profiler - if profiler is True: - profiler = SimpleProfiler() + def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]): + + if profiler and not isinstance(profiler, (bool, str, BaseProfiler)): + # TODO: Update exception on removal of bool + raise MisconfigurationException("Only None, bool, str and subclasses of `BaseProfiler` " + "are valid values for `Trainer`'s `profiler` parameter. " + f"Received {profiler} which is of type {type(profiler)}.") + + if isinstance(profiler, bool): + rank_zero_warn("Passing a bool value as a `profiler` argument to `Trainer` is deprecated" + " and will be removed in v1.3. 
Use str ('simple' or 'advanced') instead.", + DeprecationWarning) + if profiler: + profiler = SimpleProfiler() + elif isinstance(profiler, str): + profiler = profiler.lower() + if profiler == "simple": + profiler = SimpleProfiler() + elif profiler == "advanced": + profiler = AdvancedProfiler() + else: + raise ValueError("When passing string value for the `profiler` parameter of" + " `Trainer`, it can only be 'simple' or 'advanced'") self.trainer.profiler = profiler or PassThroughProfiler() diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 9dab036583dd8..6ebab1ade0f1d 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -29,6 +29,7 @@ def __init__(self, trainer): self.predictions = None self.max_batches = None self.warning_cache = WarningCache() + self.num_dataloaders = None def on_trainer_init(self): self.trainer.num_val_batches = [] @@ -108,6 +109,9 @@ def on_evaluation_end(self, *args, **kwargs): else: self.trainer.call_hook('on_validation_end', *args, **kwargs) + # reset stage to train + self.trainer.logger_connector.set_stage("train") + def reload_evaluation_dataloaders(self): model = self.trainer.get_model() if self.testing: @@ -133,6 +137,7 @@ def setup(self, model, max_batches, dataloaders): max_batches = [max_batches] * len(dataloaders) self.max_batches = max_batches + self.num_dataloaders = self._get_num_dataloaders(dataloaders) def on_evaluation_epoch_start(self, *args, **kwargs): if self.testing: @@ -250,9 +255,10 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): # depre warning if eval_results is not None and user_reduced: step = 'testing_epoch_end' if self.testing else 'validation_epoch_end' - m = f'The {step} should not return anything as of 9.1.' \ - f'to log, use self.log(...) or self.write(...) 
directly in the LightningModule' - self.warning_cache.warn(m) + self.warning_cache.warn( + f'The {step} should not return anything as of 9.1.' + ' To log, use self.log(...) or self.write(...) directly in the LightningModule' + ) if using_eval_result and not user_reduced: eval_results = self.__auto_reduce_result_objs(outputs) @@ -292,16 +298,20 @@ def __auto_reduce_result_objs(self, outputs): return eval_results - def on_evaluation_batch_start(self, *args, **kwargs): + def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): # reset the result of the PL module model = self.trainer.get_model() model._results = Result() model._current_fx_name = 'evaluation_step' + # set dataloader_idx and track batch_size + self.trainer.logger_connector.on_evaluation_batch_start( + self.testing, batch, dataloader_idx, self.num_dataloaders) + if self.testing: - self.trainer.call_hook('on_test_batch_start', *args, **kwargs) + self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx) else: - self.trainer.call_hook('on_validation_batch_start', *args, **kwargs) + self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) def on_evaluation_batch_end(self, *args, **kwargs): if self.testing: @@ -348,6 +358,9 @@ def __log_result_step_metrics(self, output, batch_idx): step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False) step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False) + cached_batch_log_metrics = \ + self.trainer.logger_connector.cached_results.get_latest_batch_log_metrics() + if len(step_log_metrics) > 0: # make the metrics appear as a different line in the same graph metrics_by_epoch = {} diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index b585647fb5a0e..ae4d280d54649 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -14,7 +14,7 @@ from abc import ABC import inspect -from typing 
import Union, Iterable +from typing import Union, Iterable, Mapping import torch @@ -92,7 +92,7 @@ def process_dict_result(self, output, train=False): # --------------- # all keys not progress_bar or log are candidates for callbacks callback_metrics = {} - if output: + if isinstance(output, Mapping): for k, v in output.items(): if k not in ['progress_bar', 'log', 'hiddens']: callback_metrics[k] = v @@ -156,7 +156,7 @@ def process_dict_result(self, output, train=False): # --------------- # EXTRACT HIDDEN # --------------- - hiddens = output.get('hiddens') if output else None + hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None # use every metric passed in as a candidate for callback callback_metrics.update(progress_bar_metrics) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index afb2f4cb5eb91..af06b1bbc1352 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,9 +15,9 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import List, Optional, Union, Type, TypeVar +from typing import List, Optional, Union, Type, TypeVar, cast -from pytorch_lightning.callbacks import ProgressBarBase +from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector @@ -26,6 +26,9 @@ from pytorch_lightning.utilities import argparse_utils from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger class 
TrainerProperties(ABC): @@ -44,8 +47,29 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str + default_root_path: str + accelerator_backend: Accelerator + logger: LightningLoggerBase model_connector: ModelConnector checkpoint_connector: CheckpointConnector + callbacks: List[Callback] + + @property + def log_dir(self): + if self.checkpoint_callback is not None: + dir = self.checkpoint_callback.dirpath + dir = os.path.split(dir)[0] + elif self.logger is not None: + if isinstance(self.logger, TensorBoardLogger): + dir = self.logger.log_dir + else: + dir = self.logger.save_dir + else: + dir = self._default_root_dir + + if self.accelerator_backend is not None: + dir = self.accelerator_backend.broadcast(dir) + return dir @property def use_amp(self) -> bool: @@ -153,6 +177,7 @@ def progress_bar_callback(self): def progress_bar_dict(self) -> dict: """ Read-only for progress bar metrics. """ ref_model = self.model if not self.data_parallel else self.model.module + ref_model = cast(LightningModule, ref_model) return dict(**ref_model.get_progress_bar_dict(), **self.logger_connector.progress_bar_metrics) @property @@ -187,6 +212,20 @@ def weights_save_path(self) -> str: return os.path.normpath(self._weights_save_path) return self._weights_save_path + @property + def checkpoint_callback(self) -> Optional[ModelCheckpoint]: + """ + The first checkpoint callback in the Trainer.callbacks list, or ``None`` if + no checkpoint callbacks exist. + """ + callbacks = self.checkpoint_callbacks + return callbacks[0] if len(callbacks) > 0 else None + + @property + def checkpoint_callbacks(self) -> List[ModelCheckpoint]: + """ A list of all instances of ModelCheckpoint found in the Trainer.callbacks list. 
""" + return [c for c in self.callbacks if isinstance(c, ModelCheckpoint)] + def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index ee98e3614da67..84c7982700df0 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -43,7 +43,7 @@ class TensorRunningAccum(object): def __init__(self, window_length: int): self.window_length = window_length - self.memory = torch.Tensor(self.window_length) + self.memory = torch.zeros(self.window_length) self.current_idx: int = 0 self.last_idx: Optional[int] = None self.rotated: bool = False diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 44250ae905aba..4ef83dc7de544 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -22,7 +22,8 @@ from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import EvalResult +from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.core.step_result import Result, EvalResult from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin @@ -59,6 +60,7 @@ from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator +from pytorch_lightning.utilities.memory import recursive_detach # warnings to ignore in trainer warnings.filterwarnings( @@ -85,7 +87,7 @@ class Trainer( def __init__( self, logger: Union[LightningLoggerBase, 
Iterable[LightningLoggerBase], bool] = True, - checkpoint_callback: Union[ModelCheckpoint, bool] = True, + checkpoint_callback: bool = True, callbacks: Optional[List[Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, @@ -120,7 +122,7 @@ def __init__( num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[str] = None, - profiler: Optional[Union[BaseProfiler, bool]] = None, + profiler: Optional[Union[BaseProfiler, bool, str]] = None, benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_epoch: bool = False, @@ -134,6 +136,7 @@ def __init__( amp_level: str = 'O2', distributed_backend: Optional[str] = None, automatic_optimization: bool = True, + move_metrics_to_cpu: bool = False, ): r""" Customize every aspect of training via flags @@ -169,7 +172,12 @@ def __init__( callbacks: Add a list of callbacks. - checkpoint_callback: Callback for checkpointing. + checkpoint_callback: If ``True``, enable checkpointing. + It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in + :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks`. Default: ``True``. + + .. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since + v1.1.0 and will be unsupported from v1.3.0. check_val_every_n_epoch: Check val every n train epochs. @@ -212,7 +220,8 @@ def __init__( progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar. Ignored when a custom callback is passed to :paramref:`~Trainer.callbacks`. - profiler: To profile individual steps during training and assist in identifying bottlenecks. + profiler: To profile individual steps during training and assist in identifying bottlenecks. Passing bool + value is deprecated in v1.1 and will be removed in v1.3. overfit_batches: Overfit a percent of training data (float) or a set number of batches (int). 
Default: 0.0 @@ -265,6 +274,9 @@ def __init__( stored in a different place than the logs written in `default_root_dir`. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' Defaults to `default_root_dir`. + + move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu. + This can save some gpu memory, but can make training slower. Use with attention. """ super().__init__() @@ -296,7 +308,6 @@ def __init__( # init callbacks # Declare attributes to be set in callback_connector on_trainer_init - self.checkpoint_callback: Union[ModelCheckpoint, bool] = checkpoint_callback self.callback_connector.on_trainer_init( callbacks, checkpoint_callback, @@ -357,7 +368,12 @@ def __init__( self.profile_connector.on_trainer_init(profiler) # init logger flags - self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps) + self.logger_connector.on_trainer_init( + logger, + flush_logs_every_n_steps, + log_every_n_steps, + move_metrics_to_cpu + ) # init debugging flags self.debugging_connector.on_init_start( @@ -460,6 +476,9 @@ def fit( def train(self): self.run_sanity_check(self.get_model()) + # set stage for logging + self.logger_connector.set_stage("train") + self.checkpoint_connector.has_trained = False # enable train mode @@ -473,6 +492,10 @@ def train(self): # hook self.train_loop.on_train_start() + if self.train_loop.should_skip_training(): + self.train_loop.on_train_end() + return + try: # run all epochs for epoch in range(self.current_epoch, self.max_epochs): @@ -523,16 +546,25 @@ def train(self): self.train_loop.on_train_end() def run_evaluation(self, test_mode: bool = False, max_batches=None): + + # used to know if we are logging for val, test + reset cached results + self.logger_connector.set_stage(test_mode, reset=True) + # bookkeeping self.evaluation_loop.testing = test_mode + + # prepare dataloaders dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders(max_batches) + + # check if 
we want to skip this evaluation if self.evaluation_loop.should_skip_evaluation(dataloaders, max_batches): return [], [] - # enable eval mode + no grads + # ref model model = self.get_model() - self.evaluation_loop.on_evaluation_model_eval() + # enable eval mode + no grads + self.evaluation_loop.on_evaluation_model_eval() model.zero_grad() torch.set_grad_enabled(False) @@ -581,12 +613,11 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # log step metrics step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) - if step_metrics is not None: - dl_step_metrics.append(step_metrics) + # track epoch level outputs + dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics) # track epoch level outputs - if output is not None: - dl_outputs.append(output) + dl_outputs = self.track_output_for_epoch_end(dl_outputs, output) self.evaluation_loop.outputs.append(dl_outputs) self.evaluation_loop.step_metrics.append(dl_step_metrics) @@ -612,6 +643,19 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): return eval_loop_results, deprecated_eval_results + def track_output_for_epoch_end(self, outputs, output): + if output is not None: + if isinstance(output, Result): + output.detach() + if self.move_metrics_to_cpu: + output.cpu() + elif isinstance(output, dict): + output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu) + elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: + output = output.cpu() + outputs.append(output) + return outputs + def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) @@ -696,6 +740,8 @@ def test( # -------------------- self.verbose_test = verbose + self.logger_connector.set_stage("test") + # If you supply a datamodule you can't supply train_dataloader or val_dataloaders if test_dataloaders and datamodule: raise MisconfigurationException( @@ -814,7 +860,25 @@ def 
call_setup_hook(self, model): self.setup(stage_name) model.setup(stage_name) + def _reset_result_and_set_hook_fx_name(self, hook_name): + model_ref = self.get_model() + if model_ref is not None: + # used to track current hook name called + model_ref._results = Result() + model_ref._current_hook_fx_name = hook_name + + def _cache_logged_metrics(self): + model_ref = self.get_model() + if model_ref is not None: + # capture logging for this hook + self.logger_connector.cache_logged_metrics() + def call_hook(self, hook_name, *args, **kwargs): + # temporary. Don't modify evaluation behaviour + if self.logger_connector._current_stage == "train": + # set hook_name to model + reset Result obj + self._reset_result_and_set_hook_fx_name(hook_name) + # always profile hooks with self.profiler.profile(hook_name): @@ -836,4 +900,8 @@ def call_hook(self, hook_name, *args, **kwargs): accelerator_hook = getattr(self.accelerator_backend, hook_name) output = accelerator_hook(*args, **kwargs) - return output + # temporary. Don't modify evaluation behaviour + if self.logger_connector._current_stage == "train": + # capture logging + self._cache_logged_metrics() + return output diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d32f47dbbd485..f705d82868da7 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import subprocess +from contextlib import contextmanager from copy import copy, deepcopy import numpy as np @@ -77,6 +77,15 @@ def num_optimizers(self): num_optimizers = len(self.get_optimizers_iterable()) return num_optimizers + def should_skip_training(self): + if self.trainer.current_epoch >= self.trainer.max_epochs: + return True + + if self.trainer.limit_train_batches == 0: + return True + + return False + def on_train_start(self): # clear cache before training if self.trainer.on_gpu and self.trainer.root_gpu is not None: @@ -203,7 +212,7 @@ def on_train_end(self): def check_checkpoint_callback(self, should_save, is_last=False): # TODO bake this logic into the checkpoint callback - if should_save: + if should_save and self.trainer.checkpoint_connector.has_trained: checkpoint_callbacks = [c for c in self.trainer.callbacks if isinstance(c, ModelCheckpoint)] if is_last and any(c.save_last for c in checkpoint_callbacks): rank_zero_info("Saving latest checkpoint...") @@ -242,12 +251,15 @@ def on_train_epoch_start(self, epoch): self.trainer.call_hook("on_train_epoch_start") def on_train_batch_end(self, epoch_output, epoch_end_outputs, batch, batch_idx, dataloader_idx): + # hook + self.trainer.call_hook('on_batch_end') + self.trainer.call_hook('on_train_batch_end', epoch_end_outputs, batch, batch_idx, dataloader_idx) + # figure out what to track for epoch end self.track_epoch_end_reduce_metrics(epoch_output, epoch_end_outputs) - # hook - self.trainer.call_hook("on_batch_end") - self.trainer.call_hook("on_train_batch_end", epoch_end_outputs, batch, batch_idx, dataloader_idx) + # reset batch logger internals + self.trainer.logger_connector.on_train_batch_end() def reset_train_val_dataloaders(self, model): if not self.trainer.reload_dataloaders_every_epoch: @@ -294,15 +306,26 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, 
untouched_loss.detach()) + def _check_training_step_output(self, training_step_output): + if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: + if training_step_output.grad_fn is None: + # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... + raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging - model = self.trainer.get_model() - model._results = Result() - model._current_fx_name = "training_step" + model_ref = self.trainer.get_model() with self.trainer.profiler.profile("model_forward"): args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens) + + # manually capture logged metrics + model_ref._current_fx_name = 'training_step' training_step_output = self.trainer.accelerator_backend.training_step(args) + self.trainer.logger_connector.cache_logged_metrics() + + self._check_training_step_output(training_step_output) + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( @@ -411,6 +434,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch): # track metrics without grads for epoch reduction training_step_output_for_epoch_end = copy(result) training_step_output_for_epoch_end.detach() + if self.trainer.move_metrics_to_cpu: + training_step_output_for_epoch_end.cpu() # what flows back into the system training_step_output = result @@ -454,8 +479,7 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ ) def on_before_zero_grad(self, optimizer): - model = self.trainer.get_model() - model.on_before_zero_grad(optimizer) + self.trainer.call_hook('on_before_zero_grad', optimizer) def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): 
self.trainer.accelerator_backend.optimizer_zero_grad(batch_idx, optimizer, opt_idx) @@ -476,35 +500,6 @@ def _track_gradient_norm(self): grad_norm_dict = model.grad_norm(self.trainer.track_grad_norm) return grad_norm_dict - def log_training_step_metrics(self, opt_closure_result, batch_callback_metrics, batch_log_metrics): - # track callback metrics - callback_metrics = opt_closure_result.training_step_output.callback_metrics - - # decide which metrics to log (results vs dict return) - using_results_obj = isinstance(opt_closure_result.training_step_output, Result) - if using_results_obj: - metrics_to_log = opt_closure_result.training_step_output.get_batch_log_metrics( - include_forked_originals=False - ) - step_pbar_metrics = opt_closure_result.training_step_output.get_batch_pbar_metrics( - include_forked_originals=False - ) - forked_metrics = opt_closure_result.training_step_output.get_forked_metrics() - callback_metrics.update(forked_metrics) - else: - metrics_to_log = opt_closure_result.training_step_output.log_metrics - step_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end - - # track batch log metrics - batch_log_metrics.append(metrics_to_log) - - # track progress bar metrics - if len(step_pbar_metrics) > 0: - self.trainer.logger_connector.add_progress_bar_metrics(step_pbar_metrics) - self.trainer.logger_connector.callback_metrics.update(step_pbar_metrics) - - batch_callback_metrics.append(callback_metrics) - def process_hiddens(self, opt_closure_result): hiddens = opt_closure_result.hiddens if isinstance(opt_closure_result.training_step_output, Result): @@ -570,6 +565,8 @@ def run_training_epoch(self): should_check_val = self.should_check_val_fx(batch_idx, is_last_batch) if should_check_val: self.trainer.run_evaluation(test_mode=False) + # reset stage to train + self.trainer.logger_connector.set_stage("train") # ----------------------------------------- # SAVE LOGGERS (ie: Tensorboard, etc...) 
@@ -578,8 +575,8 @@ def run_training_epoch(self): # update LR schedulers monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics) - monitor_metrics.update(batch_output.batch_log_metrics) self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) + self.trainer.checkpoint_connector.has_trained = True # max steps reached, end training if self.trainer.max_steps is not None and self.trainer.max_steps == self.trainer.global_step + 1: @@ -597,27 +594,25 @@ def run_training_epoch(self): self.trainer.total_batch_idx += 1 # stop epoch if we limited the number of training batches - if batch_idx + 1 >= self.trainer.num_training_batches: + if (batch_idx + 1) >= self.trainer.num_training_batches: break # progress global step according to grads progress self.increment_accumulated_grad_global_step() - self.trainer.checkpoint_connector.has_trained = True + # epoch end hook + self.run_on_epoch_end_hook(epoch_output) # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics( - epoch_output, self.checkpoint_accumulator, self.early_stopping_accumulator, self.num_optimizers + epoch_output, + self.checkpoint_accumulator, + self.early_stopping_accumulator, + self.num_optimizers ) - # hook - self.trainer.logger_connector.on_train_epoch_end(epoch_output) - # when no val loop is present or fast-dev-run still need to call checkpoints - self.check_checkpoint_callback(not (should_check_val or is_overridden("validation_step", model))) - - # epoch end hook - self.run_on_epoch_end_hook(epoch_output) + self.check_checkpoint_callback(not (should_check_val or is_overridden('validation_step', model))) # increment the global step once # progress global step according to grads progress @@ -627,12 +622,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): # track grad norms grad_norm_dic = {} - # track all metrics for callbacks - batch_callback_metrics = [] - - # track metrics to log - batch_log_metrics = [] - # bookkeeping 
using_results_obj = False self.trainer.hiddens = None @@ -653,39 +642,29 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): if response == -1: return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) - # checks if backward or backward + optimizer step (via closure) - accumulation_done = self._accumulated_batches_reached() - is_final_batch = self._num_training_batches_reached() - # lightning module hook splits = self.tbptt_split_batch(batch) for split_idx, split_batch in enumerate(splits): - self.trainer.split_idx = split_idx - # in manual optimization we loop over all optimizers at once - optimizers = self.get_optimizers_iterable() - if not self.automatic_optimization: - optimizers = [optimizers[0]] - - # loop over optimizers - for opt_idx, optimizer in optimizers: - # make sure only the gradients of the current optimizer's parameters are calculated - # in the training step to prevent dangling gradients in multiple-optimizer setup. - if self.automatic_optimization and len(self.trainer.optimizers) > 1: - model = self.trainer.get_model() - model.toggle_optimizer(optimizer, opt_idx) - - if not (accumulation_done or is_final_batch): + # create an iterable for optimizers and loop over them + for opt_idx, optimizer in self.prepare_optimizers(): + + # toggle model params + set info to logger_connector + self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer) + + if self.should_accumulate(): # For gradient accumulation # ------------------- # calculate loss (train step + train step end) # ------------------- - self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens) + + # perform ddp sync only when performing optimizer_step + with self.block_ddp_sync_behaviour(): + self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens) + batch_outputs = self._process_closure_result( - batch_callback_metrics=batch_callback_metrics, - batch_log_metrics=batch_log_metrics,
batch_outputs=batch_outputs, opt_idx=opt_idx, ) @@ -696,7 +675,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): # gradient update with accumulated gradients else: - if self.automatic_optimization: def train_step_and_backward_closure(): @@ -713,63 +691,57 @@ def train_step_and_backward_closure(): self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure) else: - self._curr_step_result = self.training_step(split_batch, batch_idx, opt_idx, self.trainer.hiddens) + self._curr_step_result = self.training_step( + split_batch, + batch_idx, + opt_idx, + self.trainer.hiddens + ) if self._curr_step_result is None: # user decided to skip optimization + # make sure to zero grad. + self.zero_grad_handler(batch_idx, optimizer, opt_idx) continue batch_outputs = self._process_closure_result( - batch_callback_metrics=batch_callback_metrics, - batch_log_metrics=batch_log_metrics, batch_outputs=batch_outputs, opt_idx=opt_idx, ) + # todo: Properly aggregate grad_norm across opt_idx and split_idx grad_norm_dic = self._cur_grad_norm_dict self._cur_grad_norm_dict = None - # hook - self.on_before_zero_grad(optimizer) - - # clear gradients - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) - - accumulated_loss = self.accumulated_loss.mean() - - if accumulated_loss is not None: - # calculate running loss for display - self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches) + # hook + clear gradients + self.zero_grad_handler(batch_idx, optimizer, opt_idx) - # reset for next set of accumulated grads - self.accumulated_loss.reset() - - # collapse all metrics into one dict - batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()} - - # track all metrics for callbacks - self.trainer.logger_connector.callback_metrics.update(batch_log_metrics) - self.trainer.logger_connector.callback_metrics.update( - {k: v for d in batch_callback_metrics for k, v in d.items() if v is not None} - ) + #
update running loss + reset accumulated loss + self.update_running_loss() result = AttributeDict( signal=0, grad_norm_dic=grad_norm_dic, - batch_log_metrics=batch_log_metrics, training_step_output_for_epoch_end=batch_outputs, ) return result + @contextmanager + def block_ddp_sync_behaviour(self): + if isinstance(self.trainer.model, torch.nn.parallel.DistributedDataParallel): + yield self.trainer.model.no_sync() + else: + yield + def _process_closure_result( - self, batch_callback_metrics: list, batch_log_metrics: list, batch_outputs: list, opt_idx: int + self, batch_outputs: list, opt_idx: int ) -> list: opt_closure_result = self._curr_step_result if opt_closure_result is not None: - # log metrics - self.log_training_step_metrics(opt_closure_result, batch_callback_metrics, batch_log_metrics) + # cache metrics + self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result) # track hiddens self.trainer.hiddens = self.process_hiddens(opt_closure_result) @@ -807,8 +779,10 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, with self.trainer.profiler.profile("model_backward"): self.backward(result, optimizer, opt_idx) - # hook - self.on_after_backward(result.training_step_output, batch_idx, result.loss) + # hook - call this hook only + # when gradients have finished accumulating + if not self.should_accumulate(): + self.on_after_backward(result.training_step_output, batch_idx, result.loss) # check if loss or model weights are nan if self.trainer.terminate_on_nan: @@ -827,6 +801,10 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): result.closure_loss, optimizer, opt_idx, *args, **kwargs ) + if not self.should_accumulate(): + # track gradients + self.track_and_norm_grad(optimizer=optimizer) + def update_train_loop_lr_schedulers(self, monitor_metrics=None): num_accumulated_batches_reached = self._accumulated_batches_reached() num_training_batches_reached = self._num_training_batches_reached() @@ -836,8
+814,10 @@ def update_train_loop_lr_schedulers(self, monitor_metrics=None): self.trainer.optimizer_connector.update_learning_rates(interval="step", monitor_metrics=monitor_metrics) def run_on_epoch_end_hook(self, epoch_output): - self.trainer.call_hook("on_epoch_end") - self.trainer.call_hook("on_train_epoch_end", epoch_output) + self.trainer.call_hook('on_epoch_end') + self.trainer.call_hook('on_train_epoch_end', epoch_output) + + self.trainer.logger_connector.on_train_epoch_end() def increment_accumulated_grad_global_step(self): num_accumulated_batches_reached = self._accumulated_batches_reached() @@ -853,6 +833,12 @@ def _accumulated_batches_reached(self): def _num_training_batches_reached(self): return (self.trainer.batch_idx + 1) == self.trainer.num_training_batches + def should_accumulate(self): + # checks if backward or backward + optimizer step (via closure) + accumulation_done = self._accumulated_batches_reached() + is_final_batch = self._num_training_batches_reached() + return not (accumulation_done or is_final_batch) + def should_check_val_fx(self, batch_idx, is_last_batch): # decide if we should run validation is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 @@ -886,10 +872,8 @@ def build_train_args(self, batch, batch_idx, opt_idx, hiddens): def save_loggers_on_train_batch_end(self): # when loggers should save to disk - should_save_log = ( - self.trainer.global_step + 1 - ) % self.trainer.flush_logs_every_n_steps == 0 or self.trainer.should_stop - if should_save_log or self.trainer.fast_dev_run: + should_flush_logs = self.trainer.logger_connector.should_flush_logs + if should_flush_logs or self.trainer.fast_dev_run: if self.trainer.is_global_zero and self.trainer.logger is not None: self.trainer.logger.save() @@ -924,3 +908,44 @@ def process_train_step_outputs(self, all_train_step_outputs, early_stopping_accu epoch_end_outputs.append(optimizer_idx_outputs) return epoch_end_outputs + + def prepare_optimizers(self): + # in manual 
optimization we loop over all optimizers at once + optimizers = self.get_optimizers_iterable() + if not self.automatic_optimization: + optimizers = [optimizers[0]] + return optimizers + + def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer): + # set split_idx to trainer for tracking + self.trainer.split_idx = split_idx + + # make sure only the gradients of the current optimizer's parameters are calculated + # in the training step to prevent dangling gradients in multiple-optimizer setup. + if self.automatic_optimization and len(self.trainer.optimizers) > 1: + model = self.trainer.get_model() + model.toggle_optimizer(optimizer, opt_idx) + + # use to track metrics internally + self.trainer.logger_connector.on_train_split_start(split_idx, opt_idx, split_batch) + + def update_running_loss(self): + accumulated_loss = self.accumulated_loss.mean() + + if accumulated_loss is not None: + # calculate running loss for display + self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches) + + # reset for next set of accumulated grads + self.accumulated_loss.reset() + + def zero_grad_handler(self, batch_idx, optimizer, opt_idx): + if self.automatic_optimization: + # hook + self.on_before_zero_grad(optimizer) + optimizers = enumerate([optimizer]) + else: + optimizers = self.get_optimizers_iterable() + + for idx, optimizer in optimizers: + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/pytorch_lightning/tuner/auto_gpu_select.py b/pytorch_lightning/tuner/auto_gpu_select.py index f1b13a69745bc..fd2ba4a1f3627 100644 --- a/pytorch_lightning/tuner/auto_gpu_select.py +++ b/pytorch_lightning/tuner/auto_gpu_select.py @@ -13,8 +13,18 @@ # limitations under the License. 
import torch +from pytorch_lightning.utilities.exceptions import MisconfigurationException + def pick_multiple_gpus(nb): + if nb == 0: + raise MisconfigurationException( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." + ) + + nb = torch.cuda.device_count() if nb == -1 else nb + picked = [] for _ in range(nb): picked.append(pick_single_gpu(exclude_gpus=picked)) diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index 87783fbde5d1f..67a4704b628fc 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -22,6 +22,7 @@ from pytorch_lightning.utilities.memory import is_oom_error, garbage_collection_cuda from pytorch_lightning.loggers.base import DummyLogger from pytorch_lightning import _logger as log +from pytorch_lightning.utilities.cloud_io import get_filesystem def scale_batch_size(trainer, @@ -68,6 +69,10 @@ def scale_batch_size(trainer, **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader or datamodule. 
""" + if trainer.fast_dev_run: + rank_zero_warn('Skipping batch size scaler since `fast_dev_run=True`', UserWarning) + return + if not lightning_hasattr(model, batch_arg_name): raise MisconfigurationException( f'Field {batch_arg_name} not found in both `model` and `model.hparams`') @@ -90,7 +95,7 @@ def scale_batch_size(trainer, __scale_batch_reset_params(trainer, model, steps_per_trial) # Save initial model, that is loaded after batch size is found - save_path = os.path.join(trainer.default_root_dir, 'temp_model.ckpt') + save_path = os.path.join(trainer.default_root_dir, 'scale_batch_size_temp_model.ckpt') trainer.save_checkpoint(str(save_path)) if trainer.progress_bar_callback: @@ -109,8 +114,11 @@ def scale_batch_size(trainer, log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}') # Restore initial state of model - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) - os.remove(save_path) + if trainer.is_global_zero: + trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) + fs = get_filesystem(str(save_path)) + if fs.exists(save_path): + fs.rm(save_path) # Finish by resetting variables so trainer is ready to fit model __scale_batch_restore_params(trainer) @@ -144,7 +152,6 @@ def __scale_batch_reset_params(trainer, model, steps_per_trial): trainer.weights_summary = None # not needed before full run trainer.logger = DummyLogger() trainer.callbacks = [] # not needed before full run - trainer.checkpoint_callback = False # required for saving trainer.limit_train_batches = 1.0 trainer.optimizers, trainer.schedulers = [], [] # required for saving trainer.model = model # required for saving @@ -157,7 +164,6 @@ def __scale_batch_restore_params(trainer): trainer.weights_summary = trainer.__dumped_params['weights_summary'] trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] - trainer.checkpoint_callback = 
trainer.__dumped_params['checkpoint_callback'] trainer.auto_scale_batch_size = trainer.__dumped_params['auto_scale_batch_size'] trainer.limit_train_batches = trainer.__dumped_params['limit_train_batches'] trainer.model = trainer.__dumped_params['model'] diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index d0ab33df8e1b8..b6d8c8178093b 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -29,6 +29,8 @@ from pytorch_lightning.loggers.base import DummyLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import lightning_hasattr, lightning_setattr +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.cloud_io import get_filesystem # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed @@ -41,6 +43,10 @@ def _run_lr_finder_internally(trainer, model: LightningModule): """ Call lr finder internally during Trainer.fit() """ lr_finder = lr_find(trainer, model) + + if lr_finder is None: + return + lr = lr_finder.suggestion() # TODO: log lr.results to self.logger @@ -130,7 +136,11 @@ def lr_find( trainer.fit(model) """ - save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp.ckpt') + if trainer.fast_dev_run: + rank_zero_warn('Skipping learning rate finder since `fast_dev_run=True`', UserWarning) + return + + save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp_model.ckpt') __lr_finder_dump_params(trainer, model) @@ -155,9 +165,6 @@ def lr_find( if trainer.progress_bar_callback: trainer.progress_bar_callback.disable() - # Disable standard checkpoint & early stopping - trainer.checkpoint_callback = False - # Required for saving the model trainer.optimizers, trainer.schedulers = [], [], trainer.model = model @@ -184,8 +191,11 @@ def lr_find( lr_finder._total_batch_idx = trainer.total_batch_idx # for debug 
purpose # Reset model state - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) - os.remove(save_path) + if trainer.is_global_zero: + trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) + fs = get_filesystem(str(save_path)) + if fs.exists(save_path): + fs.rm(save_path) # Finish by resetting variables so trainer is ready to fit model __lr_finder_restore_params(trainer, model) @@ -212,7 +222,6 @@ def __lr_finder_restore_params(trainer, model): trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] trainer.max_steps = trainer.__dumped_params['max_steps'] - trainer.checkpoint_callback = trainer.__dumped_params['checkpoint_callback'] model.configure_optimizers = trainer.__dumped_params['configure_optimizers'] del trainer.__dumped_params diff --git a/pytorch_lightning/tuner/tuning.py b/pytorch_lightning/tuner/tuning.py index 9929249804309..88009427bc3c6 100644 --- a/pytorch_lightning/tuner/tuning.py +++ b/pytorch_lightning/tuner/tuning.py @@ -54,17 +54,19 @@ def tune(self, model, train_dataloader, val_dataloaders, datamodule): # Run learning rate finder: if self.trainer.auto_lr_find: - self.internal_find_lr(self.trainer, model) + self.internal_find_lr(model) model.logger = self.trainer.logger # reset logger binding - def scale_batch_size(self, - model, - mode: str = 'power', - steps_per_trial: int = 3, - init_val: int = 2, - max_trials: int = 25, - batch_arg_name: str = 'batch_size', - **fit_kwargs): + def scale_batch_size( + self, + model, + mode: str = 'power', + steps_per_trial: int = 3, + init_val: int = 2, + max_trials: int = 25, + batch_arg_name: str = 'batch_size', + **fit_kwargs + ): r""" Will iteratively try to find the largest batch size for a given model that does not give an out of memory (OOM) error. 
@@ -102,7 +104,14 @@ def scale_batch_size(self, """ return scale_batch_size( - self.trainer, model, mode, steps_per_trial, init_val, max_trials, batch_arg_name, **fit_kwargs + self.trainer, + model, + mode, + steps_per_trial, + init_val, + max_trials, + batch_arg_name, + **fit_kwargs, ) def lr_find( @@ -130,8 +139,8 @@ def lr_find( datamodule, ) - def internal_find_lr(self, trainer, model: LightningModule): - return _run_lr_finder_internally(trainer, model) + def internal_find_lr(self, model: LightningModule): + return _run_lr_finder_internally(self.trainer, model) def pick_multiple_gpus(self, num_gpus: int): return pick_multiple_gpus(num_gpus) diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index 57c9e23d80dc9..bbb89ad09aa48 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -14,7 +14,7 @@ import inspect import os from argparse import ArgumentParser, Namespace -from typing import Union, List, Tuple, Any +from typing import Dict, Union, List, Tuple, Any from pytorch_lightning.utilities import parsing @@ -160,7 +160,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: allowed_types = (str, int, float, bool) - # TODO: get "help" from docstring :) + args_help = parse_args_from_docstring(cls.__init__.__doc__ or cls.__doc__) for arg, arg_types, arg_default in ( at for at in get_init_arguments_and_types(cls) if at[0] not in depr_arg_names ): @@ -174,8 +174,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: # if the only arg type is bool if len(arg_types) == 1: use_type = parsing.str_to_bool - # if only two args (str, bool) - elif len(arg_types) == 2 and set(arg_types) == {str, bool}: + elif str in arg_types: use_type = parsing.str_to_bool_or_str else: # filter out the bool as we need to use more general @@ -200,13 +199,36 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> 
ArgumentParser: dest=arg, default=arg_default, type=use_type, - help='autogenerated by pl.Trainer', + help=args_help.get(arg), **arg_kwargs, ) return parser +def parse_args_from_docstring(docstring: str) -> Dict[str, str]: + arg_block_indent = None + current_arg = None + parsed = {} + for line in docstring.split("\n"): + stripped = line.lstrip() + if not stripped: + continue + line_indent = len(line) - len(stripped) + if stripped.startswith(('Args:', 'Arguments:', 'Parameters:')): + arg_block_indent = line_indent + 4 + elif arg_block_indent is None: + continue + elif line_indent < arg_block_indent: + break + elif line_indent == arg_block_indent: + current_arg, arg_description = stripped.split(':', maxsplit=1) + parsed[current_arg] = arg_description.lstrip() + elif line_indent > arg_block_indent: + parsed[current_arg] += f' {stripped}' + return parsed + + def _gpus_allowed_type(x) -> Union[int, str]: if ',' in x: return str(x) diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 242f3105d780c..f7b9e79b7f932 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -37,7 +37,6 @@ def wrapped_fn(self, *args, **kwargs): class InternalDebugger(object): def __init__(self, trainer): - self.enabled = os.environ.get('PL_DEV_DEBUG', '0') == '1' self.trainer = trainer self.logged_metrics = [] diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a29fd3e5a1059..98d322ce0a3a2 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -73,7 +73,7 @@ def find_free_network_port() -> int: return port -def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional[Any] = None): +def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None): """ Function to gather all tensors from several ddp processes onto a list that is broadcasted to all processes @@ 
-85,26 +85,41 @@ def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional Return: gathered_result: list with size equal to the process group where gathered_result[i] corresponds to result tensor from process i - """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if group is None: - group = torch.distributed.group.WORLD + if group is None: + group = torch.distributed.group.WORLD - world_size = torch.distributed.get_world_size(group) + world_size = torch.distributed.get_world_size(group) - gathered_result = [torch.zeros_like(result) for _ in range(world_size)] + gathered_result = [torch.zeros_like(result) for _ in range(world_size)] - # sync and broadcast all - torch.distributed.barrier(group=group) - torch.distributed.all_gather(gathered_result, result, group) + # sync and broadcast all + torch.distributed.barrier(group=group) + torch.distributed.all_gather(gathered_result, result, group) - result = gathered_result - return result + return gathered_result def sync_ddp_if_available( result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None +) -> torch.Tensor: + """ + Function to reduce a tensor across worker processes during distributed training + Args: + result: the value to sync and reduce (typically tensor or number) + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. 
+ Return: + reduced value + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return sync_ddp(result, group=group, reduce_op=reduce_op) + return result + + +def sync_ddp( + result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None ) -> torch.Tensor: """ Function to reduce the tensors from several ddp processes to one master process @@ -118,24 +133,22 @@ def sync_ddp_if_available( Return: reduced value """ + divide_by_world_size = False - if torch.distributed.is_available() and torch.distributed.is_initialized(): - divide_by_world_size = False - - if group is None: - group = torch.distributed.group.WORLD + if group is None: + group = torch.distributed.group.WORLD - if reduce_op is None: - reduce_op = torch.distributed.ReduceOp.SUM - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = torch.distributed.ReduceOp.SUM - divide_by_world_size = True + if reduce_op is None: + reduce_op = torch.distributed.ReduceOp.SUM + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = torch.distributed.ReduceOp.SUM + divide_by_world_size = True - # sync all processes before reduction - torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) + # sync all processes before reduction + torch.distributed.barrier(group=group) + torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) - if divide_by_world_size: - result = result / torch.distributed.get_world_size(group) + if divide_by_world_size: + result = result / torch.distributed.get_world_size(group) return result diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 1d3b8d27807f0..16c0ede1e5413 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -17,7 +17,7 @@ import torch -def recursive_detach(in_dict: dict) -> dict: +def 
recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict: """Detach all tensors in `in_dict`. May operate recursively if some of the values in `in_dict` are dictionaries @@ -26,6 +26,7 @@ def recursive_detach(in_dict: dict) -> dict: Args: in_dict: + to_cpu: Wheter to move tensor to cpu Return: out_dict: @@ -35,7 +36,11 @@ def recursive_detach(in_dict: dict) -> dict: if isinstance(v, dict): out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): - out_dict.update({k: v.detach()}) + # detach + v = v.detach() + if to_cpu: + v = v.cpu() + out_dict.update({k: v}) else: out_dict.update({k: v}) return out_dict diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index c562f780e88ee..348eec110c3a1 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -61,7 +61,7 @@ def is_picklable(obj: object) -> bool: try: pickle.dumps(obj) return True - except pickle.PicklingError: + except (pickle.PicklingError, AttributeError): return False @@ -177,8 +177,9 @@ def __repr__(self): def lightning_hasattr(model, attribute): """ Special hasattr for lightning. Checks for attribute in model namespace, the old hparams namespace/dict, and the datamodule. 
""" - trainer = model.trainer + trainer = getattr(model, 'trainer', None) + attr = False # Check if attribute in model if hasattr(model, attribute): attr = True @@ -189,10 +190,8 @@ def lightning_hasattr(model, attribute): else: attr = hasattr(model.hparams, attribute) # Check if the attribute in datamodule (datamodule gets registered in Trainer) - elif trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): - attr = getattr(trainer.datamodule, attribute) - else: - attr = False + if not attr and trainer is not None: + attr = hasattr(trainer.datamodule, attribute) return attr @@ -200,18 +199,16 @@ def lightning_hasattr(model, attribute): def lightning_getattr(model, attribute): """ Special getattr for lightning. Checks for attribute in model namespace, the old hparams namespace/dict, and the datamodule. """ - trainer = model.trainer + trainer = getattr(model, 'trainer', None) # Check if attribute in model if hasattr(model, attribute): attr = getattr(model, attribute) # Check if attribute in model.hparams, either namespace or dict - elif hasattr(model, 'hparams'): - if isinstance(model.hparams, dict): - attr = model.hparams[attribute] - else: - attr = getattr(model.hparams, attribute) - + elif hasattr(model, 'hparams') and isinstance(model.hparams, dict) and attribute in model.hparams: + attr = model.hparams[attribute] + elif hasattr(model, 'hparams') and hasattr(model.hparams, attribute): + attr = getattr(model.hparams, attribute) # Check if the attribute in datamodule (datamodule gets registered in Trainer) elif trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): attr = getattr(trainer.datamodule, attribute) @@ -230,7 +227,7 @@ def lightning_setattr(model, attribute, value): raise ValueError(f'{attribute} is neither stored in the model namespace' ' nor the `hparams` namespace/dict, nor the datamodule.') - trainer = model.trainer + trainer = getattr(model, 'trainer', None) # 
Check if attribute in model if hasattr(model, attribute): diff --git a/pytorch_lightning/utilities/xla_device_utils.py b/pytorch_lightning/utilities/xla_device_utils.py index 5687992981ae6..14a59fd105c5a 100644 --- a/pytorch_lightning/utilities/xla_device_utils.py +++ b/pytorch_lightning/utilities/xla_device_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import functools import importlib +import queue as q from multiprocessing import Process, Queue import torch @@ -24,10 +25,10 @@ xm = None -def inner_f(queue, func, **kwargs): # pragma: no cover +def inner_f(queue, func, *args, **kwargs): # pragma: no cover try: - queue.put(func(**kwargs)) - except Exception as _e: + queue.put(func(*args, **kwargs)) + except Exception: import traceback traceback.print_exc() @@ -38,10 +39,13 @@ def pl_multi_process(func): @functools.wraps(func) def wrapper(*args, **kwargs): queue = Queue() - proc = Process(target=inner_f, args=(queue, func,), kwargs=kwargs) + proc = Process(target=inner_f, args=(queue, func, *args), kwargs=kwargs) proc.start() - proc.join() - return queue.get() + proc.join(10) + try: + return queue.get_nowait() + except q.Empty: + return False return wrapper diff --git a/requirements.txt b/requirements.txt index 0f8423e0860f0..d270e2bc5d854 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.16.4 -torch>=1.3 +torch>=1.3,<1.8 future>=0.17.1 # required for builtins in setup.py # pyyaml>=3.13 PyYAML>=5.1 # OmegaConf requirement >=5.1 diff --git a/requirements/examples.txt b/requirements/examples.txt index c87d10a39346f..0afa62f9ffa95 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,2 +1,2 @@ -torchvision>=0.4.1 -gym>=0.17.0 \ No newline at end of file +torchvision>=0.4.1,<0.9.0 +gym>=0.17.0 diff --git a/requirements/extra.txt b/requirements/extra.txt index dbd5f7515109e..be21317a1d826 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,8 +1,7 @@ # 
extended list of package dependencies to reach full functionality matplotlib>=3.1.1 -# no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples -horovod>=0.20.1 # v0.20.0 has problem with building the wheel/installation +horovod>=0.20.2 # no need to install with [pytorch] as pytorch is already installed omegaconf>=2.0.0 # scipy>=0.13.3 scikit-learn>=0.22.2 diff --git a/requirements/test.txt b/requirements/test.txt index d98048568fa75..0ceb532ac2266 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,7 +7,7 @@ flake8>=3.6 flake8-black check-manifest twine==1.13.0 -scikit-image +scikit-image>=0.17.2 black>=20.8b1 pre-commit>=1.0 diff --git a/tests/README.md b/tests/README.md index 7fd3c90c0241e..8ef006c4d879a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -30,7 +30,7 @@ To test models that require GPU make sure to run the above command on a GPU mach The GPU machine must have: 1. At least 2 GPUs. 2. [NVIDIA-apex](https://github.com/NVIDIA/apex#linux) installed. -3. [Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install horovod` +3. 
[Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_OPERATIONS=NCCL pip install horovod` ## Running Coverage diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index cbc96b0793062..7eeada3d5ddd1 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -104,15 +104,17 @@ def on_fit_start(self, trainer, pl_module): "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", - "SLURM_LOCALID": "0" + "SLURM_LOCALID": "10" }) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -133,7 +135,7 @@ def on_fit_start(self, trainer, pl_module): "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" + "SLURM_LOCALID": "10" }) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp2_slurm(tmpdir): @@ -142,6 +144,9 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + raise SystemExit() model = BoringModel() @@ -159,7 +164,7 
@@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) @@ -167,8 +172,10 @@ def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPTorchElasticAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -186,7 +193,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) @@ -196,6 +203,8 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -212,7 +221,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "WORLD_SIZE": "1", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=0) @@ -220,8 +229,11 @@ def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert 
isinstance(trainer.accelerator_backend, accelerators.DDPCPUTorchElasticAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + raise SystemExit() model = BoringModel() @@ -248,7 +260,7 @@ def test_accelerator_choice_ddp_cpu_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) raise SystemExit() @@ -283,7 +295,7 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, CustomCluster) raise SystemExit() @@ -341,7 +353,7 @@ def on_fit_start(self, trainer, pl_module): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) raise SystemExit() model = BoringModel() diff --git a/tests/base/__init__.py b/tests/base/__init__.py index a337d443b4384..faefa623dfee7 100644 --- a/tests/base/__init__.py +++ b/tests/base/__init__.py @@ -3,4 +3,4 @@ from tests.base.datasets import TrialMNIST from tests.base.model_template import EvalModelTemplate, GenericEvalModelTemplate from 
tests.base.simple_model import SimpleModule -from tests.base.boring_model import BoringModel +from tests.base.boring_model import BoringModel, RandomDataset diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index ba0d20c2c8389..9c88ba1b7e4d3 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -32,7 +32,7 @@ def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): f"lightning {diffs} was slower than PT (threshold {max_diff})" -def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.6): +def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.55): # assert speeds diffs = np.asarray(pl_times) - np.asarray(pt_times) # norm by vanila time diff --git a/tests/base/model_optimizers.py b/tests/base/model_optimizers.py index e4b8d489f872d..46574adfd0b4d 100644 --- a/tests/base/model_optimizers.py +++ b/tests/base/model_optimizers.py @@ -64,6 +64,13 @@ def configure_optimizers__single_scheduler(self): lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1) return [optimizer], [lr_scheduler] + def configure_optimizers__onecycle_scheduler(self): + optimizer = optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9) + lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, + max_lr=self.learning_rate, + total_steps=10_000) + return [optimizer], [lr_scheduler] + def configure_optimizers__multiple_schedulers(self): optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.learning_rate) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index bb7ec8430a7df..cf88f52436576 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -55,6 +55,8 @@ def __init__(self): self.on_validation_end_called = False self.on_test_start_called = False self.on_test_end_called = False + self.on_after_backward_called = False + 
self.on_before_zero_grad_called = False def setup(self, trainer, pl_module, stage: str): assert isinstance(trainer, Trainer) @@ -160,6 +162,14 @@ def on_test_end(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_test_end_called = True + def on_after_backward(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_after_backward_called = True + + def on_before_zero_grad(self, trainer, pl_module, optimizer): + _check_args(trainer, pl_module) + self.on_before_zero_grad_called = True + test_callback = TestCallback() trainer_options = dict( @@ -197,6 +207,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called # fit model trainer = Trainer(**trainer_options) @@ -228,6 +240,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called trainer.fit(model) @@ -257,6 +271,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_test_batch_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert test_callback.on_after_backward_called + assert test_callback.on_before_zero_grad_called # reset setup teardown callback test_callback.teardown_called = False @@ -277,3 +293,5 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_validation_batch_start_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called diff --git 
a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index 973dd64c08b14..a6783435ed3e2 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -39,12 +39,40 @@ def test_lr_monitor_single_lr(tmpdir): assert result assert lr_monitor.lrs, 'No learning rates logged' + assert all(v is None for v in lr_monitor.last_momentum_values.values()), \ + 'Momentum should not be logged by default' assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' assert all([k in ['lr-Adam'] for k in lr_monitor.lrs.keys()]), \ 'Names of learning rates not set correctly' +def test_lr_monitor_single_lr_with_momentum(tmpdir): + """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """ + tutils.reset_seed() + + model = EvalModelTemplate() + model.configure_optimizers = model.configure_optimizers__onecycle_scheduler + + lr_monitor = LearningRateMonitor(log_momentum=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_val_batches=0.1, + limit_train_batches=0.5, + callbacks=[lr_monitor], + ) + result = trainer.fit(model) + assert result + + assert all(v is not None for v in lr_monitor.last_momentum_values.values()), \ + 'Expected momentum to be logged' + assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ + 'Number of momentum values logged does not match number of lr schedulers' + assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \ + 'Names of momentum values not set correctly' + + def test_lr_monitor_no_lr_scheduler(tmpdir): tutils.reset_seed() diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index d354b59682240..221844244ad75 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -231,7 +231,7 @@ def on_validation_epoch_end(self, trainer, pl_module): 
default_root_dir=tmpdir, max_epochs=1, num_sanity_val_steps=2, - limit_train_batches=0, + limit_train_batches=1, limit_val_batches=limit_val_batches, callbacks=[progress_bar], logger=False, diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 976a91f551e0a..d9e99d463b57d 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -100,7 +100,7 @@ def test_model_checkpoint_to_yaml(tmpdir, save_top_k): path_yaml = os.path.join(tmpdir, 'best_k_models.yaml') checkpoint.to_yaml(path_yaml) d = yaml.full_load(open(path_yaml, 'r')) - best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + best_k = {k: v for k, v in checkpoint.best_k_models.items()} assert d == best_k @@ -185,67 +185,72 @@ def test_model_checkpoint_no_extraneous_invocations(tmpdir): def test_model_checkpoint_format_checkpoint_name(tmpdir): # empty filename: - ckpt_name = ModelCheckpoint._format_checkpoint_name('', 3, {}) - assert ckpt_name == 'epoch=3' + ckpt_name = ModelCheckpoint._format_checkpoint_name('', 3, 2, {}) + assert ckpt_name == 'epoch=3-step=2' - ckpt_name = ModelCheckpoint._format_checkpoint_name(None, 3, {}, prefix='test') - assert ckpt_name == 'test-epoch=3' + ckpt_name = ModelCheckpoint._format_checkpoint_name(None, 3, 2, {}, prefix='test') + assert ckpt_name == 'test-epoch=3-step=2' # no groups case: - ckpt_name = ModelCheckpoint._format_checkpoint_name('ckpt', 3, {}, prefix='test') + ckpt_name = ModelCheckpoint._format_checkpoint_name('ckpt', 3, 2, {}, prefix='test') assert ckpt_name == 'test-ckpt' # no prefix - ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch:03d}-{acc}', 3, {'acc': 0.03}) + ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch:03d}-{acc}', 3, 2, {'acc': 0.03}) assert ckpt_name == 'epoch=003-acc=0.03' # prefix char_org = ModelCheckpoint.CHECKPOINT_JOIN_CHAR ModelCheckpoint.CHECKPOINT_JOIN_CHAR = '@' - ckpt_name = 
ModelCheckpoint._format_checkpoint_name('{epoch},{acc:.5f}', 3, {'acc': 0.03}, prefix='test') + ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch},{acc:.5f}', 3, 2, {'acc': 0.03}, prefix='test') assert ckpt_name == 'test@epoch=3,acc=0.03000' ModelCheckpoint.CHECKPOINT_JOIN_CHAR = char_org # no dirpath set - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath=None).format_checkpoint_name(3, {}) - assert ckpt_name == 'epoch=3.ckpt' - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='').format_checkpoint_name(5, {}) - assert ckpt_name == 'epoch=5.ckpt' + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath=None).format_checkpoint_name(3, 2, {}) + assert ckpt_name == 'epoch=3-step=2.ckpt' + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='').format_checkpoint_name(5, 4, {}) + assert ckpt_name == 'epoch=5-step=4.ckpt' # CWD - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='.').format_checkpoint_name(3, {}) - assert ckpt_name == str(Path('.').resolve() / 'epoch=3.ckpt') + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='.').format_checkpoint_name(3, 4, {}) + assert ckpt_name == str(Path('.').resolve() / 'epoch=3-step=4.ckpt') # with ver ckpt_name = ModelCheckpoint( monitor='early_stop_on', dirpath=tmpdir, filename='name', prefix='test' - ).format_checkpoint_name(3, {}, ver=3) + ).format_checkpoint_name(3, 2, {}, ver=3) assert ckpt_name == tmpdir / 'test-name-v3.ckpt' # using slashes ckpt_name = ModelCheckpoint( monitor='early_stop_on', dirpath=None, filename='{epoch}_{val/loss:.5f}' - ).format_checkpoint_name(4, {'val/loss': 0.03}) + ).format_checkpoint_name(4, 3, {'val/loss': 0.03}) assert ckpt_name == 'epoch=4_val/loss=0.03000.ckpt' # TODO: Checks with filepath. 
To be removed in v1.2 # CWD - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath='.').format_checkpoint_name(3, {}) - assert ckpt_name == str(Path('.').resolve() / 'epoch=3.ckpt') + ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath='.').format_checkpoint_name(3, 2, {}) + assert ckpt_name == str(Path('.').resolve() / 'epoch=3-step=2.ckpt') # dir does not exist so it is used as filename filepath = tmpdir / 'dir' - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath=filepath, prefix='test').format_checkpoint_name(3, {}) + ckpt_name = ModelCheckpoint( + monitor='early_stop_on', filepath=filepath, prefix='test' + ).format_checkpoint_name(3, 2, {}) assert ckpt_name == tmpdir / 'test-dir.ckpt' # now, dir exists os.mkdir(filepath) - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath=filepath, prefix='test').format_checkpoint_name(3, {}) - assert ckpt_name == filepath / 'test-epoch=3.ckpt' + ckpt_name = ModelCheckpoint( + monitor='early_stop_on', filepath=filepath, prefix='test' + ).format_checkpoint_name(3, 2, {}) + assert ckpt_name == filepath / 'test-epoch=3-step=2.ckpt' def test_model_checkpoint_save_last(tmpdir): """Tests that save_last produces only one last checkpoint.""" + seed_everything() model = EvalModelTemplate() epochs = 3 ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last-{epoch}' @@ -257,10 +262,15 @@ def test_model_checkpoint_save_last(tmpdir): logger=False, ) trainer.fit(model) - last_filename = model_checkpoint._format_checkpoint_name(ModelCheckpoint.CHECKPOINT_NAME_LAST, epochs - 1, {}) + last_filename = model_checkpoint._format_checkpoint_name( + ModelCheckpoint.CHECKPOINT_NAME_LAST, trainer.current_epoch, trainer.global_step, {} + ) last_filename = last_filename + '.ckpt' assert str(tmpdir / last_filename) == model_checkpoint.last_model_path - assert set(os.listdir(tmpdir)) == set([f'epoch={i}.ckpt' for i in range(epochs)] + [last_filename]) + assert set(os.listdir(tmpdir)) == set( + [f"epoch={i}-step={j}.ckpt" for 
i, j in zip(range(epochs), [9, 19, 29])] + [last_filename] + ) + ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last' @@ -284,8 +294,8 @@ def test_none_monitor_top_k(tmpdir): def test_none_monitor_save_last(tmpdir): """ Test that a warning appears for save_last=True with monitor=None. """ - with pytest.raises( - MisconfigurationException, match=r'ModelCheckpoint\(save_last=True, monitor=None\) is not a valid.*' + with pytest.warns( + UserWarning, match=r'ModelCheckpoint\(save_last=True, monitor=None\) is a redundant.*' ): ModelCheckpoint(dirpath=tmpdir, save_last=True) # These should not fail @@ -295,6 +305,7 @@ def test_none_monitor_save_last(tmpdir): def test_model_checkpoint_none_monitor(tmpdir): """ Test that it is possible to save all checkpoints when monitor=None. """ + seed_everything() model = EvalModelTemplate() model.validation_step = model.validation_step_no_monitor model.validation_epoch_end = model.validation_epoch_end_no_monitor @@ -311,13 +322,13 @@ def test_model_checkpoint_none_monitor(tmpdir): # these should not be set if monitor is None assert checkpoint_callback.monitor is None - assert checkpoint_callback.best_model_path == checkpoint_callback.last_model_path == tmpdir / 'epoch=1.ckpt' + assert checkpoint_callback.best_model_path == checkpoint_callback.last_model_path == tmpdir / 'epoch=1-step=19.ckpt' assert checkpoint_callback.best_model_score == 0 assert checkpoint_callback.best_k_models == {} assert checkpoint_callback.kth_best_model_path == '' # check that the correct ckpts were created - expected = [f'epoch={e}.ckpt' for e in range(epochs)] + expected = [f'epoch={i}-step={j}.ckpt' for i, j in zip(range(epochs), [9, 19])] assert set(os.listdir(tmpdir)) == set(expected) @@ -325,13 +336,14 @@ def test_model_checkpoint_none_monitor(tmpdir): def test_model_checkpoint_period(tmpdir, period): model = EvalModelTemplate() epochs = 5 - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, save_top_k=-1, period=period) + checkpoint_callback = 
ModelCheckpoint(dirpath=tmpdir, filename='{epoch}', save_top_k=-1, period=period) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, max_epochs=epochs, limit_train_batches=0.1, limit_val_batches=0.1, + val_check_interval=1.0, logger=False, ) trainer.fit(model) @@ -365,20 +377,35 @@ def test_model_checkpoint_topk_zero(tmpdir): def test_model_checkpoint_topk_all(tmpdir): """ Test that save_top_k=-1 tracks the best models when monitor key is provided. """ seed_everything(1000) - epochs = 2 - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=-1) + epochs = 3 + + class CustomModel(EvalModelTemplate): + def validation_epoch_end(self, outputs): + return {'epoch': self.current_epoch} + + model = CustomModel() + checkpoint_callback = ModelCheckpoint( + dirpath=tmpdir, + filename="{epoch}", + monitor="epoch", + mode='max', + save_top_k=-1, + ) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, max_epochs=epochs, logger=False, + val_check_interval=1.0, ) trainer.fit(model) - assert checkpoint_callback.best_model_path == tmpdir / "epoch=1.ckpt" - assert checkpoint_callback.best_model_score > 0 + + assert checkpoint_callback.monitor == 'epoch' + assert checkpoint_callback.best_model_path == tmpdir / "epoch=2.ckpt" + assert checkpoint_callback.best_model_score == epochs - 1 + assert len(os.listdir(tmpdir)) == len(checkpoint_callback.best_k_models) == epochs assert set(checkpoint_callback.best_k_models.keys()) == set(str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)) - assert checkpoint_callback.kth_best_model_path == tmpdir / "epoch=0.ckpt" + assert checkpoint_callback.kth_best_model_path == tmpdir / 'epoch=0.ckpt' def test_ckpt_metric_names(tmpdir): @@ -431,7 +458,7 @@ def test_default_checkpoint_behavior(tmpdir): # make sure the checkpoint we saved has the metric in the name ckpts = os.listdir(os.path.join(tmpdir, 'lightning_logs', 
'version_0', 'checkpoints')) assert len(ckpts) == 1 - assert ckpts[0] == 'epoch=2.ckpt' + assert ckpts[0] == 'epoch=2-step=14.ckpt' def test_ckpt_metric_names_results(tmpdir): @@ -489,7 +516,7 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): model = EvalModelTemplate() num_epochs = 3 model_checkpoint = ModelCheckpoint( - monitor='early_stop_on', dirpath=tmpdir, save_top_k=num_epochs, save_last=True + monitor='early_stop_on', dirpath=tmpdir, filename="{epoch}", save_top_k=num_epochs, save_last=True ) trainer = Trainer( default_root_dir=tmpdir, @@ -501,16 +528,14 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): path_last_epoch = str(tmpdir / f"epoch={num_epochs - 1}.ckpt") path_last = str(tmpdir / "last.ckpt") assert path_last == model_checkpoint.last_model_path + assert os.path.isfile(path_last_epoch) ckpt_last_epoch = torch.load(path_last_epoch) ckpt_last = torch.load(path_last) assert all(ckpt_last_epoch[k] == ckpt_last[k] for k in ("epoch", "global_step")) ch_type = type(model_checkpoint) - assert all(list( - ckpt_last["callbacks"][ch_type][k] == ckpt_last_epoch["callbacks"][ch_type][k] - for k in ("best_model_score", "best_model_path") - )) + assert ckpt_last["callbacks"][ch_type] == ckpt_last_epoch["callbacks"][ch_type] # it is easier to load the model objects than to iterate over the raw dict of tensors model_last_epoch = EvalModelTemplate.load_from_checkpoint(path_last_epoch) @@ -668,66 +693,89 @@ def validation_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"val_loss": loss} - model = ExtendedBoringModel() - model.validation_step_end = None - model.validation_epoch_end = None - trainer = pl.Trainer(default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - ) - - assert trainer.checkpoint_connector.has_trained is not True - assert trainer.current_epoch == 0 - trainer.fit(model) - assert trainer.checkpoint_connector.has_trained is True - assert 
trainer.global_step == 2 - assert trainer.current_epoch == 0 - trainer.test(model) - assert trainer.current_epoch == 0 - assert str(os.listdir(osp.join(tmpdir, 'lightning_logs'))) == "['version_0']" - - def get_last_checkpoint(): - logs_dir = osp.join(tmpdir, 'lightning_logs') - versions = os.listdir(logs_dir) - versions.sort() - - last_version = versions[-1] - ckpt_dir = osp.join(logs_dir, last_version, "checkpoints") + def assert_trainer_init(trainer): + assert not trainer.checkpoint_connector.has_trained + assert trainer.global_step == 0 + assert trainer.current_epoch == 0 + def get_last_checkpoint(ckpt_dir): ckpts = os.listdir(ckpt_dir) ckpts.sort() - return osp.join(ckpt_dir, ckpts[-1]) - def assert_checkpoint_content(): - chk = pl_load(get_last_checkpoint()) - assert chk["epoch"] == 1 - assert chk["global_step"] == 2 + def assert_checkpoint_content(ckpt_dir): + chk = pl_load(get_last_checkpoint(ckpt_dir)) + assert chk["epoch"] == epochs + assert chk["global_step"] == 4 + + def assert_checkpoint_log_dir(idx): + lightning_logs_path = osp.join(tmpdir, 'lightning_logs') + assert sorted(os.listdir(lightning_logs_path)) == [f'version_{i}' for i in range(idx + 1)] + assert len(os.listdir(ckpt_dir)) == epochs + + def get_model(): + model = ExtendedBoringModel() + model.validation_step_end = None + model.validation_epoch_end = None + return model + + ckpt_dir = osp.join(tmpdir, 'checkpoints') + checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) + epochs = 2 + limit_train_batches = 2 + + model = get_model() + + trainer_config = dict( + default_root_dir=tmpdir, + max_epochs=epochs, + limit_train_batches=limit_train_batches, + limit_val_batches=3, + limit_test_batches=4, + ) + + trainer = pl.Trainer( + **trainer_config, + checkpoint_callback=checkpoint_cb, + ) + assert_trainer_init(trainer) + + trainer.fit(model) + assert trainer.checkpoint_connector.has_trained + assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == 
epochs - 1 + assert_checkpoint_log_dir(0) - assert_checkpoint_content() + trainer.test(model) + assert trainer.current_epoch == epochs - 1 + + assert_checkpoint_content(ckpt_dir) for idx in range(1, 5): + chk = get_last_checkpoint(ckpt_dir) + assert_checkpoint_content(ckpt_dir) + + checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) + model = get_model() + # load from checkpoint - chk = get_last_checkpoint() - assert_checkpoint_content() - model = BoringModel.load_from_checkpoint(chk) - trainer = pl.Trainer(default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - resume_from_checkpoint=chk) - assert trainer.checkpoint_connector.has_trained is not True - assert trainer.global_step == 0 + trainer = pl.Trainer( + **trainer_config, + resume_from_checkpoint=chk, + checkpoint_callback=checkpoint_cb, + ) + assert_trainer_init(trainer) + trainer.test(model) - assert trainer.global_step == 2 + assert not trainer.checkpoint_connector.has_trained + assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == epochs + trainer.fit(model) - assert trainer.global_step == 2 - assert trainer.checkpoint_connector.has_trained is not True - lightning_logs_path = osp.join(tmpdir, 'lightning_logs') - assert sorted(os.listdir(lightning_logs_path)) == [f"version_{i}" for i in range(idx + 1)] + assert not trainer.checkpoint_connector.has_trained + assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == epochs + assert_checkpoint_log_dir(idx) @pytest.mark.parametrize( @@ -746,3 +794,65 @@ def test_filepath_decomposition_dirpath_filename(tmpdir, filepath, dirpath, file assert mc_cb.dirpath == dirpath assert mc_cb.filename == filename + + +def test_configure_model_checkpoint(tmpdir): + """ Test all valid and invalid ways a checkpoint callback can be passed to the Trainer. 
""" + kwargs = dict(default_root_dir=tmpdir) + callback1 = ModelCheckpoint() + callback2 = ModelCheckpoint() + + # no callbacks + trainer = Trainer(checkpoint_callback=False, callbacks=[], **kwargs) + assert not any(isinstance(c, ModelCheckpoint) for c in trainer.callbacks) + assert trainer.checkpoint_callback is None + + # default configuration + trainer = Trainer(checkpoint_callback=True, callbacks=[], **kwargs) + assert len([c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)]) == 1 + assert isinstance(trainer.checkpoint_callback, ModelCheckpoint) + + # custom callback passed to callbacks list, checkpoint_callback=True is ignored + trainer = Trainer(checkpoint_callback=True, callbacks=[callback1], **kwargs) + assert [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)] == [callback1] + assert trainer.checkpoint_callback == callback1 + + # multiple checkpoint callbacks + trainer = Trainer(callbacks=[callback1, callback2], **kwargs) + assert trainer.checkpoint_callback == callback1 + assert trainer.checkpoint_callbacks == [callback1, callback2] + + with pytest.warns(DeprecationWarning, match='will no longer be supported in v1.3'): + trainer = Trainer(checkpoint_callback=callback1, callbacks=[], **kwargs) + assert [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)] == [callback1] + assert trainer.checkpoint_callback == callback1 + + with pytest.warns(DeprecationWarning, match="will no longer be supported in v1.3"): + trainer = Trainer(checkpoint_callback=callback1, callbacks=[callback2], **kwargs) + assert trainer.checkpoint_callback == callback2 + assert trainer.checkpoint_callbacks == [callback2, callback1] + + with pytest.raises(MisconfigurationException, match="checkpoint_callback=False but found ModelCheckpoint"): + Trainer(checkpoint_callback=False, callbacks=[callback1], **kwargs) + + +def test_val_check_interval_checkpoint_files(tmpdir): + """ Test correct checkpoint naming when validating/checkpointing multiple times per 
epoch. """ + model = EvalModelTemplate() + model_checkpoint = ModelCheckpoint( + dirpath=tmpdir, + save_top_k=-1, + monitor="val_acc", + mode="max", + verbose=True + ) + trainer = Trainer( + default_root_dir=tmpdir, + val_check_interval=0.2, + max_epochs=1, + limit_train_batches=10, + callbacks=[model_checkpoint] + ) + trainer.fit(model) + files = sorted([p.name for p in Path(tmpdir).glob("*.ckpt")]) + assert files == [f"epoch=0-step={s}.ckpt" for s in [1, 3, 5, 7, 9]] diff --git a/tests/loggers/test_comet.py b/tests/loggers/test_comet.py index dc66f1842c62c..fc61829645b6e 100644 --- a/tests/loggers/test_comet.py +++ b/tests/loggers/test_comet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from unittest.mock import patch +from unittest.mock import patch, DEFAULT import pytest @@ -99,6 +99,37 @@ def test_comet_logger_experiment_name(comet): comet_experiment().set_name.assert_called_once_with(experiment_name) +@patch('pytorch_lightning.loggers.comet.comet_ml') +def test_comet_logger_manual_experiment_key(comet): + """Test that Comet Logger respects manually set COMET_EXPERIMENT_KEY.""" + + api_key = "key" + experiment_key = "96346da91469407a85641afe5766b554" + + instantation_environ = {} + + def save_os_environ(*args, **kwargs): + nonlocal instantation_environ + instantation_environ = os.environ.copy() + + return DEFAULT + + # Test api_key given + with patch.dict(os.environ, {"COMET_EXPERIMENT_KEY": experiment_key}): + with patch('pytorch_lightning.loggers.comet.CometExperiment', side_effect=save_os_environ) as comet_experiment: + logger = CometLogger(api_key=api_key) + + assert logger.version == experiment_key + + assert logger._experiment is None + + _ = logger.experiment + + comet_experiment.assert_called_once_with(api_key=api_key, project_name=None) + + assert instantation_environ["COMET_EXPERIMENT_KEY"] == experiment_key + + 
@patch('pytorch_lightning.loggers.comet.CometOfflineExperiment') @patch('pytorch_lightning.loggers.comet.comet_ml') def test_comet_logger_dirs_creation(comet, comet_experiment, tmpdir, monkeypatch): @@ -128,7 +159,7 @@ def test_comet_logger_dirs_creation(comet, comet_experiment, tmpdir, monkeypatch trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / 'test' / "1" / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} @patch('pytorch_lightning.loggers.comet.comet_ml') diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py index db2c353dc4e2c..b220074d41816 100644 --- a/tests/loggers/test_mlflow.py +++ b/tests/loggers/test_mlflow.py @@ -115,7 +115,7 @@ def test_mlflow_log_dir(client, mlflow, tmpdir): ) trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / "exp-id" / "run-id" / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=0.ckpt'} def test_mlflow_logger_dirs_creation(tmpdir): @@ -137,13 +137,14 @@ def test_mlflow_logger_dirs_creation(tmpdir): assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} model = EvalModelTemplate() - trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3) + trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3, + log_gpu_memory=True) trainer.fit(model) assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics') assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys() assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + 
assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} @mock.patch('pytorch_lightning.loggers.mlflow.mlflow') diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index bc2e198601ee9..b7688b781539c 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -21,9 +21,9 @@ from omegaconf import OmegaConf from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.loggers import TensorBoardLogger -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @pytest.mark.skipif( diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 6682cfdc8830a..468ca819f91b1 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -19,7 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @mock.patch('pytorch_lightning.loggers.wandb.wandb') @@ -29,11 +29,17 @@ def test_wandb_logger(wandb): logger = WandbLogger(anonymous=True, offline=True) logger.log_metrics({'acc': 1.0}) - wandb.init().log.assert_called_once_with({'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) wandb.init().log.reset_mock() logger.log_metrics({'acc': 1.0}, step=3) - wandb.init().log.assert_called_once_with({'global_step': 3, 'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=3) + + # continue training on same W&B run + wandb.init().step = 3 + logger.finalize('success') + logger.log_metrics({'acc': 1.0}, step=3) + wandb.init().log.assert_called_with({'acc': 1.0}, step=6) logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]}) wandb.init().config.update.assert_called_once_with( @@ -110,7 +116,7 @@ 
def test_wandb_logger_dirs_creation(wandb, tmpdir): trainer.fit(model) assert trainer.checkpoint_callback.dirpath == str(tmpdir / 'project' / version / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} def test_wandb_sanitize_callable_params(tmpdir): @@ -129,6 +135,8 @@ def return_something(): def wrapper_something(): return return_something + + params.wrapper_something_wo_name = lambda: lambda: '1' params.wrapper_something = wrapper_something assert isinstance(params.gpus, types.FunctionType) @@ -138,3 +146,4 @@ def wrapper_something(): assert params["gpus"] == '_gpus_arg_default' assert params["something"] == "something" assert params["wrapper_something"] == "wrapper_something" + assert params["wrapper_something_wo_name"] == "" diff --git a/tests/metrics/classification/test_confusion_matrix.py b/tests/metrics/classification/test_confusion_matrix.py new file mode 100644 index 0000000000000..c1ddff77e0818 --- /dev/null +++ b/tests/metrics/classification/test_confusion_matrix.py @@ -0,0 +1,133 @@ +from functools import partial + +import numpy as np +import pytest +import torch +from sklearn.metrics import confusion_matrix as sk_confusion_matrix + +from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix +from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix +from tests.metrics.classification.inputs import ( + _binary_inputs, + _binary_prob_inputs, + _multiclass_inputs, + _multiclass_prob_inputs, + _multidim_multiclass_inputs, + _multidim_multiclass_prob_inputs, + _multilabel_inputs, + _multilabel_prob_inputs +) +from tests.metrics.utils import NUM_CLASSES, THRESHOLD, MetricTester + +torch.manual_seed(42) + + +def _binary_prob_sk_metric(preds, target, normalize=None): + sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) + sk_target = target.view(-1).numpy() + + 
return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _binary_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multilabel_prob_sk_metric(preds, target, normalize=None): + sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multilabel_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multiclass_prob_sk_metric(preds, target, normalize=None): + sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multiclass_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multidim_multiclass_prob_sk_metric(preds, target, normalize=None): + sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multidim_multiclass_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None]) +@pytest.mark.parametrize("preds, target, sk_metric, num_classes", [ + (_binary_prob_inputs.preds, _binary_prob_inputs.target, 
_binary_prob_sk_metric, 2), + (_binary_inputs.preds, _binary_inputs.target, _binary_sk_metric, 2), + (_multilabel_prob_inputs.preds, _multilabel_prob_inputs.target, _multilabel_prob_sk_metric, 2), + (_multilabel_inputs.preds, _multilabel_inputs.target, _multilabel_sk_metric, 2), + (_multiclass_prob_inputs.preds, _multiclass_prob_inputs.target, _multiclass_prob_sk_metric, NUM_CLASSES), + (_multiclass_inputs.preds, _multiclass_inputs.target, _multiclass_sk_metric, NUM_CLASSES), + ( + _multidim_multiclass_prob_inputs.preds, + _multidim_multiclass_prob_inputs.target, + _multidim_multiclass_prob_sk_metric, + NUM_CLASSES + ), + ( + _multidim_multiclass_inputs.preds, + _multidim_multiclass_inputs.target, + _multidim_multiclass_sk_metric, + NUM_CLASSES + ) +]) +class TestConfusionMatrix(MetricTester): + @pytest.mark.parametrize("ddp", [True, False]) + @pytest.mark.parametrize("dist_sync_on_step", [True, False]) + def test_confusion_matrix(self, normalize, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): + self.run_class_metric_test(ddp=ddp, + preds=preds, + target=target, + metric_class=ConfusionMatrix, + sk_metric=partial(sk_metric, normalize=normalize), + dist_sync_on_step=dist_sync_on_step, + metric_args={"num_classes": num_classes, + "threshold": THRESHOLD, + "normalize": normalize} + ) + + def test_confusion_matrix_functional(self, normalize, preds, target, sk_metric, num_classes): + self.run_functional_metric_test(preds, + target, + metric_functional=confusion_matrix, + sk_metric=partial(sk_metric, normalize=normalize), + metric_args={"num_classes": num_classes, + "threshold": THRESHOLD, + "normalize": normalize} + ) + + +def test_warning_on_nan(tmpdir): + preds = torch.randint(3, size=(20,)) + target = torch.randint(3, size=(20,)) + + with pytest.warns(UserWarning, match='.* nan values found in confusion matrix have been replaced with zeros.'): + confmat = confusion_matrix(preds, target, num_classes=5, normalize='true') diff --git 
a/tests/metrics/functional/test_classification.py b/tests/metrics/functional/test_classification.py index 139aeea8cc9ad..12eb8555b10aa 100644 --- a/tests/metrics/functional/test_classification.py +++ b/tests/metrics/functional/test_classification.py @@ -9,7 +9,6 @@ recall_score as sk_recall, f1_score as sk_f1_score, fbeta_score as sk_fbeta_score, - confusion_matrix as sk_confusion_matrix, roc_curve as sk_roc_curve, roc_auc_score as sk_roc_auc_score, precision_recall_curve as sk_precision_recall_curve @@ -23,7 +22,6 @@ stat_scores, stat_scores_multiple_classes, accuracy, - confusion_matrix, precision, recall, fbeta_score, @@ -32,6 +30,7 @@ dice_score, average_precision, auroc, + multiclass_auroc, precision_recall_curve, roc, auc, @@ -47,7 +46,6 @@ pytest.param(partial(sk_f1_score, average='micro'), f1_score, False, id='f1_score'), pytest.param(partial(sk_fbeta_score, average='micro', beta=2), partial(fbeta_score, beta=2), False, id='fbeta_score'), - pytest.param(sk_confusion_matrix, confusion_matrix, False, id='confusion_matrix'), pytest.param(sk_roc_curve, roc, True, id='roc'), pytest.param(sk_precision_recall_curve, precision_recall_curve, True, id='precision_recall_curve'), pytest.param(sk_roc_auc_score, auroc, True, id='auroc') @@ -216,33 +214,6 @@ def test_accuracy(): assert acc.item() == 0.50 -def test_confusion_matrix(): - target = (torch.arange(120) % 3).view(-1, 1) - pred = target.clone() - cm = confusion_matrix(pred, target, normalize=True) - - assert torch.allclose(cm, torch.tensor([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])) - - pred = torch.zeros_like(pred) - cm = confusion_matrix(pred, target, normalize=True) - assert torch.allclose(cm, torch.tensor([[1., 0., 0.], [1., 0., 0.], [1., 0., 0.]])) - - target = torch.LongTensor([0, 0, 0, 0, 0]) - pred = target.clone() - cm = confusion_matrix(pred, target, normalize=False, num_classes=3) - assert torch.allclose(cm, torch.tensor([[5., 0., 0.], [0., 0., 0.], [0., 0., 0.]])) - - # Example taken from 
https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html - target = torch.LongTensor([0] * 13 + [1] * 16 + [2] * 9) - pred = torch.LongTensor([0] * 13 + [1] * 10 + [2] * 15) - cm = confusion_matrix(pred, target, normalize=False, num_classes=3) - assert torch.allclose(cm, torch.tensor([[13., 0., 0.], [0., 10., 6.], [0., 0., 9.]])) - to_compare = cm / torch.tensor([[13.], [16.], [9.]]) - - cm = confusion_matrix(pred, target, normalize=True, num_classes=3) - assert torch.allclose(cm, to_compare) - - @pytest.mark.parametrize(['pred', 'target', 'expected_prec', 'expected_rec'], [ pytest.param(torch.tensor([1., 0., 1., 0.]), torch.tensor([0., 1., 1., 0.]), [0.5, 0.5], [0.5, 0.5]), pytest.param(to_onehot(torch.tensor([1., 0., 1., 0.])), torch.tensor([0., 1., 1., 0.]), [0.5, 0.5], [0.5, 0.5]) @@ -346,6 +317,47 @@ def test_auroc(pred, target, expected): assert score == expected +def test_multiclass_auroc(): + with pytest.raises(ValueError, + match=r".*probabilities, i.e. 
they should sum up to 1.0 over classes"): + _ = multiclass_auroc(pred=torch.tensor([[0.9, 0.9], + [1.0, 0]]), + target=torch.tensor([0, 1])) + + with pytest.raises(ValueError, + match=r".*not defined when all of the classes do not occur in the target.*"): + _ = multiclass_auroc(pred=torch.rand((4, 3)).softmax(dim=1), + target=torch.tensor([1, 0, 1, 0])) + + with pytest.raises(ValueError, + match=r".*does not equal the number of classes passed in 'num_classes'.*"): + _ = multiclass_auroc(pred=torch.rand((5, 4)).softmax(dim=1), + target=torch.tensor([0, 1, 2, 2, 3]), + num_classes=6) + + +@pytest.mark.parametrize('n_cls', [2, 5, 10, 50]) +def test_multiclass_auroc_against_sklearn(n_cls): + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + n_samples = 300 + pred = torch.rand(n_samples, n_cls, device=device).softmax(dim=1) + target = torch.randint(n_cls, (n_samples,), device=device) + # Make sure target includes all class labels so that multiclass AUROC is defined + target[10:10 + n_cls] = torch.arange(n_cls) + + pl_score = multiclass_auroc(pred, target) + # For the binary case, sklearn expects an (n_samples,) array of probabilities of + # the positive class + pred = pred[:, 1] if n_cls == 2 else pred + sk_score = sk_roc_auc_score(target.cpu().detach().numpy(), + pred.cpu().detach().numpy(), + multi_class="ovr") + + sk_score = torch.tensor(sk_score, dtype=torch.float, device=device) + assert torch.allclose(sk_score, pl_score) + + @pytest.mark.parametrize(['x', 'y', 'expected'], [ pytest.param([0, 1], [0, 1], 0.5), pytest.param([1, 0], [0, 1], 0.5), diff --git a/tests/metrics/test_metric_lightning.py b/tests/metrics/test_metric_lightning.py index 7a860ea6c16fd..a35562327d717 100644 --- a/tests/metrics/test_metric_lightning.py +++ b/tests/metrics/test_metric_lightning.py @@ -1,8 +1,11 @@ +import pytest + import torch from pytorch_lightning import Trainer from pytorch_lightning.metrics import Metric from tests.base.boring_model import BoringModel +import 
tests.base.develop_utils as tutils class SumMetric(Metric): @@ -53,6 +56,44 @@ def test_metric_lightning_log(tmpdir): class TestModel(BoringModel): def __init__(self): super().__init__() + self.metric_step = SumMetric() + self.metric_epoch = SumMetric() + self.sum = 0.0 + + def training_step(self, batch, batch_idx): + x = batch + self.metric_step(x.sum()) + self.sum += x.sum() + self.log("sum_step", self.metric_step, on_epoch=True, on_step=False) + return {'loss': self.step(x), 'data': x} + + def training_epoch_end(self, outs): + self.log("sum_epoch", self.metric_epoch(torch.stack([o['data'] for o in outs]).sum())) + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + logged = trainer.logged_metrics + assert torch.allclose(torch.tensor(logged["sum_step"]), model.sum) + assert torch.allclose(torch.tensor(logged["sum_epoch"]), model.sum) + + +def test_scriptable(tmpdir): + class TestModel(BoringModel): + def __init__(self): + super().__init__() + # the metric is not used in the module's `forward` + # so the module should be exportable to TorchScript self.metric = SumMetric() self.sum = 0.0 @@ -64,8 +105,6 @@ def training_step(self, batch, batch_idx): return self.step(x) model = TestModel() - model.val_dataloader = None - trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=2, @@ -73,8 +112,15 @@ def training_step(self, batch, batch_idx): max_epochs=1, log_every_n_steps=1, weights_summary=None, + logger=False, + checkpoint_callback=False, ) trainer.fit(model) + rand_input = torch.randn(10, 32) - logged = trainer.logged_metrics - assert torch.allclose(torch.tensor(logged["sum"]), model.sum) + script_model = model.to_torchscript() + + # test that we can still do inference + output = model(rand_input) + script_output = script_model(rand_input) + assert 
torch.allclose(output, script_output) diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index b946d23c79813..e7cc7c6123b48 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -24,7 +24,7 @@ def setup_ddp(rank, world_size): os.environ["MASTER_ADDR"] = 'localhost' os.environ['MASTER_PORT'] = '8088' - if torch.distributed.is_available(): + if torch.distributed.is_available() and sys.platform not in ['win32', 'cygwin']: torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 89d8ff89999c1..51ba7f34048e0 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -49,11 +49,11 @@ def on_after_backward(self): norm = np.linalg.norm(flat, self.norm_type) norms.append(norm) - out[prefix + name] = round(norm, 3) + out[prefix + name] = round(norm, 4) # handle total norm norm = np.linalg.norm(norms, self.norm_type) - out[prefix + 'total'] = round(norm, 3) + out[prefix + 'total'] = round(norm, 4) self.stored_grad_norms.append(out) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 886e0db4e7854..bccc5262a5bda 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -307,7 +307,7 @@ def on_test_model_train(self): trainer.fit(model) - assert model.called == [ + expected = [ 'on_fit_start', 'on_pretrain_routine_start', 'on_pretrain_routine_end', @@ -341,10 +341,12 @@ def on_test_model_train(self): 'on_fit_end', ] + assert model.called == expected + model2 = HookedModel() trainer.test(model2) - assert model2.called == [ + expected = [ 'on_fit_start', 'on_pretrain_routine_start', 'on_pretrain_routine_end', @@ -356,3 +358,5 @@ def on_test_model_train(self): 'on_test_model_train', 'on_fit_end', ] + + assert model2.called == expected diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 449bc8c712ddb..d0ae17d8fee5d 100644 --- a/tests/models/test_horovod.py +++ 
b/tests/models/test_horovod.py @@ -17,18 +17,25 @@ import shlex import subprocess import sys -from unittest.mock import patch +import numpy as np import pytest import torch +from sklearn.metrics import accuracy_score + import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +from pytorch_lightning.metrics.classification.accuracy import Accuracy +from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE from tests.base import EvalModelTemplate from tests.base.models import BasicGAN try: + import horovod from horovod.common.util import nccl_built except ImportError: HOROVOD_AVAILABLE = False @@ -126,8 +133,33 @@ def test_horovod_multi_gpu(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") +def test_horovod_apex(tmpdir): + """Test Horovod with multi-GPU support using apex amp.""" + trainer_options = dict( + default_root_dir=str(tmpdir), + weights_save_path=str(tmpdir), + gradient_clip_val=1.0, + progress_bar_refresh_rate=0, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + gpus=2, + deterministic=True, + distributed_backend='horovod', + amp_backend='apex', + precision=16, + ) + _run_horovod(trainer_options, on_gpu=True) + + +@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL 
support") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not NATIVE_AMP_AVALAIBLE, reason="test requires torch.cuda.amp") def test_horovod_amp(tmpdir): - """Test Horovod with multi-GPU support.""" + """Test Horovod with multi-GPU support using native amp.""" trainer_options = dict( default_root_dir=str(tmpdir), weights_save_path=str(tmpdir), @@ -139,6 +171,7 @@ def test_horovod_amp(tmpdir): gpus=2, deterministic=True, distributed_backend='horovod', + amp_backend='native', precision=16, ) _run_horovod(trainer_options, on_gpu=True) @@ -208,6 +241,111 @@ def get_optimizer_params(optimizer): assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0]) assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) + +@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +def test_result_reduce_horovod(tmpdir): + """Make sure result logging works with Horovod. 
+ + This test mirrors tests/core/test_results.py::_ddp_test_fn + """ + tutils.reset_seed() + tutils.set_random_master_port() + + def hvd_test_fn(): + path_here = os.path.abspath(os.path.dirname(__file__)) + path_root = os.path.abspath(os.path.join(path_here, '..', '..')) + sys.path.insert(0, os.path.abspath(path_root)) + + from tests.base.boring_model import BoringModel + + import horovod.torch as hvd + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + self.training_step_called = True + + tensor = torch.tensor([1.0]) + self.log("test_tensor", tensor, sync_dist=True, sync_dist_op='sum', + on_step=True, on_epoch=True) + + res = self._results + + # Check that `tensor` is summed across all ranks automatically + assert res["test_tensor"].item() == hvd.size(), \ + "Result-Log does not work properly with Horovod and Tensors" + + def training_epoch_end(self, outputs) -> None: + assert len(outputs) == 0 + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + + trainer.fit(model) + + horovod.run(hvd_test_fn, np=2) + + +@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +def test_accuracy_metric_horovod(): + num_batches = 10 + batch_size = 16 + threshold = 0.5 + + def sk_metric(preds, target): + sk_preds = (preds.view(-1).numpy() >= threshold).astype(np.uint8) + sk_target = target.view(-1).numpy() + return accuracy_score(y_true=sk_target, y_pred=sk_preds) + + preds = torch.rand(num_batches, batch_size) + target = torch.randint(high=2, size=(num_batches, batch_size)) + + def _compute_batch(): + import horovod.torch as hvd + + trainer = Trainer( + fast_dev_run=True, + distributed_backend='horovod', + ) + + accelerator_backend = 
trainer.accelerator_connector.select_accelerator() + assert isinstance(accelerator_backend, HorovodAccelerator) + + metric = Accuracy(compute_on_step=True, + dist_sync_on_step=True, + dist_sync_fn=accelerator_backend.gather_all_tensors, + threshold=threshold) + + for i in range(hvd.rank(), num_batches, hvd.size()): + batch_result = metric(preds[i], target[i]) + if hvd.rank() == 0: + dist_preds = torch.stack([preds[i + r] for r in range(hvd.size())]) + dist_target = torch.stack([target[i + r] for r in range(hvd.size())]) + sk_batch_result = sk_metric(dist_preds, dist_target) + assert np.allclose(batch_result.numpy(), sk_batch_result) + + # check on all batches on all ranks + result = metric.compute() + assert isinstance(result, torch.Tensor) + + total_preds = torch.stack([preds[i] for i in range(num_batches)]) + total_target = torch.stack([target[i] for i in range(num_batches)]) + sk_result = sk_metric(total_preds, total_target) + + assert np.allclose(result.numpy(), sk_result) + + horovod.run(_compute_batch, np=2) + # @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") # def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): # hparams = EvalModelTemplate.get_default_hparams() diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 862294e64765f..024a0ee531580 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -15,6 +15,7 @@ import logging as log import os import pickle +from copy import deepcopy import cloudpickle import pytest @@ -24,7 +25,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, LightningModule, Callback +from pytorch_lightning import Trainer, LightningModule, Callback, seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from tests.base import EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST @@ -51,24 +52,90 @@ def on_train_end(self, 
trainer, pl_module): self._check_properties(trainer, pl_module) -def test_resume_from_checkpoint(tmpdir): +def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. """ model = EvalModelTemplate() checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) trainer_args = dict( default_root_dir=tmpdir, - max_epochs=2, + max_epochs=1, logger=False, - checkpoint_callback=checkpoint_callback, - callbacks=[ModelTrainerPropertyParity()] # this performs the assertions + callbacks=[checkpoint_callback, ModelTrainerPropertyParity()] # this performs the assertions ) trainer = Trainer(**trainer_args) trainer.fit(model) + + trainer_args.update(max_epochs=2) trainer = Trainer(**trainer_args, resume_from_checkpoint=str(tmpdir / "last.ckpt")) trainer.fit(model) +class CaptureCallbacksBeforeTraining(Callback): + callbacks = [] + + def on_train_start(self, trainer, pl_module): + self.callbacks = deepcopy(trainer.callbacks) + + +def test_callbacks_state_resume_from_checkpoint(tmpdir): + """ Test that resuming from a checkpoint restores callbacks that persist state. 
""" + model = EvalModelTemplate() + callback_capture = CaptureCallbacksBeforeTraining() + + def get_trainer_args(): + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer_args = dict( + default_root_dir=tmpdir, + max_steps=1, + logger=False, + callbacks=[ + checkpoint, + callback_capture, + ] + ) + assert checkpoint.best_model_path == "" + assert checkpoint.best_model_score == 0 + return trainer_args + + # initial training + trainer = Trainer(**get_trainer_args()) + trainer.fit(model) + callbacks_before_resume = deepcopy(trainer.callbacks) + + # resumed training + trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt")) + trainer.fit(model) + + assert len(callbacks_before_resume) == len(callback_capture.callbacks) + + for before, after in zip(callbacks_before_resume, callback_capture.callbacks): + if isinstance(before, ModelCheckpoint): + assert before.best_model_path == after.best_model_path + assert before.best_model_score == after.best_model_score + + +def test_callbacks_references_resume_from_checkpoint(tmpdir): + """ Test that resuming from a checkpoint sets references as expected. 
""" + model = EvalModelTemplate() + args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} + + # initial training + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer = Trainer(**args, callbacks=[checkpoint]) + assert checkpoint is trainer.callbacks[0] is trainer.checkpoint_callback + trainer.fit(model) + + # resumed training + new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + # pass in a new checkpoint object, which should take + # precedence over the one in the last.ckpt file + trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt")) + assert checkpoint is not new_checkpoint + assert new_checkpoint is trainer.callbacks[0] is trainer.checkpoint_callback + trainer.fit(model) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" @@ -233,7 +300,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template): # Since `EvalModelTemplate` has `_save_hparams = True` by default, check that ckpt has hparams ckpt = torch.load(last_checkpoint) - assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'module_arguments missing from checkpoints' + assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints' # Ensure that model can be correctly restored from checkpoint pretrained_model = model_template.load_from_checkpoint(last_checkpoint) diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index c0d5747b5fc7e..6d7f0252d94c3 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -84,3 +84,64 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + + +class GradientUnscaleBoringModel(BoringModel): + def on_after_backward(self): + 
norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + +@pytest.mark.skipif( + LooseVersion(torch.__version__) < LooseVersion("1.6.0"), + reason="Minimal PT version is set to 1.6") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_amp_gradient_unscale(tmpdir): + model = GradientUnscaleBoringModel() + + trainer = Trainer( + max_epochs=2, + default_root_dir=os.getcwd(), + limit_train_batches=2, + limit_test_batches=2, + limit_val_batches=2, + amp_backend='native', + distributed_backend='ddp_spawn', + gpus=2, + precision=16, + track_grad_norm=2, + log_every_n_steps=1 + ) + trainer.fit(model) + + +class UnscaleAccumulateGradBatchesBoringModel(BoringModel): + + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + +@pytest.mark.skipif( + LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir): + model = UnscaleAccumulateGradBatchesBoringModel() + + trainer = Trainer( + max_epochs=2, + default_root_dir=os.getcwd(), + limit_train_batches=2, + limit_test_batches=2, + limit_val_batches=2, + amp_backend='native', + distributed_backend='ddp_spawn', + gpus=2, + precision=16, + track_grad_norm=2, + log_every_n_steps=1, + accumulate_grad_batches=2, + ) + trainer.fit(model) diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index b190f34395522..69cd0e3beb7b4 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -1,25 +1,30 @@ -from pytorch_lightning.callbacks import Callback -from tests.base.boring_model import BoringModel -from pytorch_lightning import 
accelerators, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -import pytest import os from unittest import mock +import pytest +from pytorch_lightning import Trainer, accelerators +from pytorch_lightning.callbacks import Callback +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from tests.base.boring_model import BoringModel -@mock.patch.dict(os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" -}) -@mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) -def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) +def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPPlugin) @@ -31,24 +36,29 @@ def on_fit_start(self, trainer, pl_module): gpus=gpus, num_processes=num_processes, distributed_backend=ddp_backend, - callbacks=[CB()] + callbacks=[CB()], ) with pytest.raises(SystemExit): trainer.fit(model) -@mock.patch.dict(os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" -}) -@mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 
'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) def test_ddp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyDDP(DDPPlugin): pass @@ -65,7 +75,48 @@ def on_fit_start(self, trainer, pl_module): num_processes=num_processes, distributed_backend=ddp_backend, plugins=[MyDDP()], - callbacks=[CB()] + callbacks=[CB()], + ) + + with pytest.raises(SystemExit): + trainer.fit(model) + + +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) +def test_ddp_choice_custom_ddp_cpu_custom_args( + tmpdir, ddp_backend, gpus, num_processes +): + class MyDDP(DDPPlugin): + pass + + class CB(Callback): + def on_fit_start(self, trainer, pl_module): + assert isinstance(trainer.accelerator_backend.ddp_plugin, MyDDP) + raise SystemExit() + + model = BoringModel() + trainer = Trainer( + fast_dev_run=True, + gpus=gpus, + num_processes=num_processes, + distributed_backend=ddp_backend, + plugins=[MyDDP(broadcast_buffers=False, find_unused_parameters=True)], + callbacks=[CB()], ) with pytest.raises(SystemExit): diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index c8a7b1d270e35..de3fb63fe9664 100644 --- a/tests/test_deprecated.py +++ 
b/tests/test_deprecated.py @@ -1,16 +1,26 @@ """Test deprecated functionality which will be removed in vX.Y.Z""" +from argparse import ArgumentParser import pytest import sys +from unittest import mock import torch from tests.base import EvalModelTemplate from pytorch_lightning.metrics.functional.classification import auc +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.profiler.profilers import PassThroughProfiler, SimpleProfiler from pytorch_lightning.utilities.exceptions import MisconfigurationException +def test_tbd_remove_in_v1_3_0(tmpdir): + with pytest.deprecated_call(match='will no longer be supported in v1.3'): + callback = ModelCheckpoint() + Trainer(checkpoint_callback=callback, callbacks=[], default_root_dir=tmpdir) + + def test_tbd_remove_in_v1_2_0(): with pytest.deprecated_call(match='will be removed in v1.2'): checkpoint_cb = ModelCheckpoint(filepath='.') @@ -22,6 +32,37 @@ def test_tbd_remove_in_v1_2_0(): checkpoint_cb = ModelCheckpoint(filepath='.', dirpath='.') +# TODO: remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py +@pytest.mark.parametrize(['profiler', 'expected'], [ + (True, SimpleProfiler), + (False, PassThroughProfiler), +]) +def test_trainer_profiler_remove_in_v1_3_0(profiler, expected): + with pytest.deprecated_call(match='will be removed in v1.3'): + trainer = Trainer(profiler=profiler) + assert isinstance(trainer.profiler, expected) + + +@pytest.mark.parametrize( + ['cli_args', 'expected_parsed_arg', 'expected_profiler'], + [ + ('--profiler', True, SimpleProfiler), + ('--profiler True', True, SimpleProfiler), + ('--profiler False', False, PassThroughProfiler), + ], +) +def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, expected_profiler): + cli_args = cli_args.split(' ') + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + parser = ArgumentParser(add_help=False) + parser = 
Trainer.add_argparse_args(parent_parser=parser) + args = Trainer.parse_argparser(parser) + + assert getattr(args, "profiler") == expected_parsed_arg + trainer = Trainer.from_argparse_args(args) + assert isinstance(trainer.profiler, expected_profiler) + + def _soft_unimport_module(str_module): # once the module is imported e.g with parsing with pytest it lives in memory if str_module in sys.modules: diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py new file mode 100644 index 0000000000000..cbe4d4012227a --- /dev/null +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -0,0 +1,21 @@ +import pytest +from pytorch_lightning import Trainer +from tests.base import EvalModelTemplate + + +@pytest.mark.parametrize('tuner_alg', ['batch size scaler', 'learning rate finder']) +def test_skip_on_fast_dev_run_batch_scaler(tmpdir, tuner_alg): + """ Test that tuner algorithms are skipped if fast dev run is enabled """ + + hparams = EvalModelTemplate.get_default_hparams() + model = EvalModelTemplate(**hparams) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + auto_scale_batch_size=True if tuner_alg == 'batch size scaler' else False, + auto_lr_find=True if tuner_alg == 'learning rate finder' else False, + fast_dev_run=True + ) + expected_message = f'Skipping {tuner_alg} since `fast_dev_run=True`' + with pytest.warns(UserWarning, match=expected_message): + trainer.tune(model) diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 47356e4bd684c..6329480e10a11 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -14,6 +14,7 @@ """ Tests to ensure that the training loop works with a dict """ +import os from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning import 
Trainer from tests.base.deterministic_model import DeterministicModel @@ -106,7 +107,7 @@ def test_validation_step_arbitrary_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(eval_results) == 2 assert eval_results[0]['some'] == 171 assert eval_results[1]['some'] == 171 @@ -125,6 +126,9 @@ def test_validation_step_dict_return(tmpdir): Test that val step can return a dict with all the expected keys and they end up in the correct place """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -143,7 +147,7 @@ def test_validation_step_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 5 assert len(eval_results) == 2 assert eval_results[0]['log']['log_acc1'] == 12 @@ -166,6 +170,8 @@ def test_val_step_step_end_no_return(tmpdir): """ Test that val step + val step end work (with no return in val step end) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -197,6 +203,9 @@ def test_val_step_step_end(tmpdir): """ Test that val step + val step end work """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -215,7 +224,7 @@ def test_val_step_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, 
eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 callback_metrics = callback_metrics[0] @@ -241,6 +250,9 @@ def test_no_val_step_end(tmpdir): """ Test that val step + val epoch end """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -284,6 +296,9 @@ def test_full_val_loop(tmpdir): """ Test that val step + val step end + val epoch end """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py index 7e8588ce9f6b2..8d1aaf1b3c548 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py @@ -44,9 +44,10 @@ def test_training_step_dict(tmpdir): break out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 12.0 - assert out.batch_log_metrics['log_acc2'] == 7.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 12.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 7.0 train_step_out = out.training_step_output_for_epoch_end assert len(train_step_out) == 1 @@ -92,8 +93,8 @@ def training_step_with_step_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 14.0 - assert out.batch_log_metrics['log_acc2'] == 9.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 14.0 + assert 
trainer.logger_connector.logged_metrics['log_acc2'] == 9.0 train_step_end_out = out.training_step_output_for_epoch_end pbar_metrics = train_step_end_out['progress_bar'] @@ -133,8 +134,8 @@ def test_full_training_loop_dict(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 14.0 - assert out.batch_log_metrics['log_acc2'] == 9.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 14.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 9.0 # get the output of the first optimizer train_step_end_out = out.training_step_output_for_epoch_end @@ -220,8 +221,8 @@ def test_train_step_epoch_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 12.0 - assert out.batch_log_metrics['log_acc2'] == 7.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 12.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 7.0 # outputs are for 1 optimizer and no tbptt train_step_end_out = out.training_step_output_for_epoch_end diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py index b5eae913ca428..2a66f743a49ef 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py @@ -15,6 +15,7 @@ Tests to ensure that the training loop works with a scalar """ import torch +import os from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel @@ -46,7 +47,6 @@ def test_training_step_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert 
len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -84,7 +84,6 @@ def training_step_scalar_with_step_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -104,6 +103,8 @@ def test_full_training_loop_scalar(tmpdir): Checks train_step + training_step_end + training_epoch_end (all with scalar return from train_step) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_scalar_return model.training_step_end = model.training_step_end_scalar @@ -132,7 +133,6 @@ def test_full_training_loop_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -152,6 +152,8 @@ def test_train_step_epoch_end_scalar(tmpdir): Checks train_step + training_epoch_end (NO training_step_end) (with scalar return) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_scalar_return model.training_step_end = None @@ -176,7 +178,6 @@ def test_train_step_epoch_end_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end diff --git a/tests/trainer/logging/test_logger_connector.py b/tests/trainer/logging/test_logger_connector.py new 
file mode 100644 index 0000000000000..08936f89eb9f8 --- /dev/null +++ b/tests/trainer/logging/test_logger_connector.py @@ -0,0 +1,385 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests to ensure that the training loop works with a dict (1.0) +""" +import os +import torch +import pytest +from copy import deepcopy +from pytorch_lightning.trainer import Trainer +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from pytorch_lightning.trainer.connectors.logger_connector.epoch_result_store import EpochResultStore +from pytorch_lightning.trainer.connectors.logger_connector.callback_hook_validator import CallbackHookNameValidator +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base.boring_model import BoringModel, RandomDataset + + +class Helper: + def decorator_with_arguments(fx_name='', hook_fx_name=None): + def decorator(func): + def wrapper(self, *args, **kwargs): + # Set information + self._current_fx_name = fx_name + self._current_hook_fx_name = hook_fx_name + self._results = Result() + + result = func(self, *args, **kwargs) + + # cache metrics + self.trainer.logger_connector.cache_logged_metrics() + return result + return wrapper + + return decorator + + +def test__logger_connector__epoch_result_store__train(tmpdir): + """ + Tests that 
LoggerConnector will properly capture logged information + and reduce them + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestModel(BoringModel): + + train_losses = [] + + @Helper.decorator_with_arguments(fx_name="training_step") + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + + self.train_losses.append(loss) + + self.log("train_loss", loss, on_step=True, on_epoch=True) + return {"loss": loss} + + def on_train_epoch_end(self, outputs): + # save objects as it will be reset at the end of epoch. + self.train_results = deepcopy(self.trainer.logger_connector.cached_results) + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=4, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + train_results = model.train_results + + assert len(train_results(fx_name="training_step", dl_idx="0", opt_idx="0")) == 2 + generated = train_results(fx_name="training_step", + dl_idx="0", + opt_idx="0", + batch_idx="0", + split_idx="0")["train_loss"] + assert generated == model.train_losses[0] + generated = train_results(fx_name="training_step", + dl_idx="0", + opt_idx="0", + batch_idx="1", + split_idx="0")["train_loss"] + assert generated == model.train_losses[1] + + assert train_results.has_reduced is not True + + train_results.has_batch_loop_finished = True + + assert train_results.has_reduced is True + + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True)['train_loss_epoch'].item() + excepted = torch.stack(model.train_losses).mean().item() + assert generated == excepted + + +def test__logger_connector__epoch_result_store__train__ttbt(tmpdir): + """ + Tests that LoggerConnector will properly capture logged information with ttbt + and reduce them + """ + truncated_bptt_steps = 2 + sequence_size = 30 + batch_size = 30 + + x_seq = torch.rand(batch_size, 
sequence_size, 1) + y_seq_list = torch.rand(batch_size, sequence_size, 1).tolist() + + class MockSeq2SeqDataset(torch.utils.data.Dataset): + def __getitem__(self, i): + return x_seq, y_seq_list + + def __len__(self): + return 1 + + class TestModel(BoringModel): + + train_losses = [] + + def __init__(self): + super().__init__() + self.test_hidden = None + self.layer = torch.nn.Linear(2, 2) + + @Helper.decorator_with_arguments(fx_name="training_step") + def training_step(self, batch, batch_idx, hiddens): + try: + assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" + except Exception as e: + print(e) + + self.test_hidden = torch.rand(1) + + x_tensor, y_list = batch + assert x_tensor.shape[1] == truncated_bptt_steps, "tbptt split Tensor failed" + + y_tensor = torch.tensor(y_list, dtype=x_tensor.dtype) + assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" + + pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) + loss = torch.nn.functional.mse_loss( + pred, y_tensor.view(batch_size, truncated_bptt_steps)) + + self.train_losses.append(loss) + + self.log('a', loss, on_epoch=True) + + return {'loss': loss, 'hiddens': self.test_hidden} + + def on_train_epoch_start(self) -> None: + self.test_hidden = None + + def train_dataloader(self): + return torch.utils.data.DataLoader( + dataset=MockSeq2SeqDataset(), + batch_size=batch_size, + shuffle=False, + sampler=None, + ) + + def on_train_epoch_end(self, outputs): + # save objects as it will be reset at the end of epoch. 
+ self.train_results = deepcopy(self.trainer.logger_connector.cached_results) + + model = TestModel() + model.training_epoch_end = None + model.example_input_array = torch.randn(5, truncated_bptt_steps) + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=10, + limit_val_batches=0, + truncated_bptt_steps=truncated_bptt_steps, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + train_results = model.train_results + + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", batch_idx="0") + assert len(generated) == len(model.train_losses) + + # assert reduction didn't happen yet + assert train_results.has_reduced is False + + # Launch reduction + train_results.has_batch_loop_finished = True + + # assert reduction did happen + assert train_results.has_reduced is True + + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True)['a_epoch'].item() + assert generated == torch.stack(model.train_losses).mean().item() + + +@pytest.mark.parametrize('num_dataloaders', [1, 2]) +def test__logger_connector__epoch_result_store__test_multi_dataloaders(tmpdir, num_dataloaders): + """ + Tests that LoggerConnector will properly capture logged information in multi_dataloaders scenario + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestModel(BoringModel): + + test_losses = {} + + @Helper.decorator_with_arguments(fx_name="test_step") + def test_step(self, batch, batch_idx, dl_idx=0): + output = self.layer(batch) + loss = self.loss(batch, output) + + primary_key = str(dl_idx) + if primary_key not in self.test_losses: + self.test_losses[primary_key] = [] + + self.test_losses[primary_key].append(loss) + + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"test_loss": loss} + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + + model = TestModel() + model.val_dataloader = None + 
model.test_epoch_end = None + + limit_test_batches = 4 + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=limit_test_batches, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.test(model) + + test_results = trainer.logger_connector._cached_results["test"] + + generated = test_results(fx_name="test_step") + assert len(generated) == num_dataloaders + + for dl_idx in range(num_dataloaders): + generated = len(test_results(fx_name="test_step", dl_idx=str(dl_idx))) + assert generated == limit_test_batches + + test_results.has_batch_loop_finished = True + + for dl_idx in range(num_dataloaders): + expected = torch.stack(model.test_losses[str(dl_idx)]).mean() + generated = test_results(fx_name="test_step", dl_idx=str(dl_idx), reduced=True)["test_loss_epoch"] + assert abs(expected.item() - generated.item()) < 1e-6 + + +def test_call_back_validator(tmpdir): + + funcs_name = sorted([f for f in dir(Callback) if not f.startswith('_')]) + + callbacks_func = [ + 'on_after_backward', + 'on_batch_end', + 'on_batch_start', + 'on_before_zero_grad', + 'on_epoch_end', + 'on_epoch_start', + 'on_fit_end', + 'on_fit_start', + 'on_init_end', 'on_init_start', + 'on_keyboard_interrupt', + 'on_load_checkpoint', + 'on_pretrain_routine_end', + 'on_pretrain_routine_start', + 'on_sanity_check_end', + 'on_sanity_check_start', + 'on_save_checkpoint', + 'on_test_batch_end', + 'on_test_batch_start', + 'on_test_end', + 'on_test_epoch_end', + 'on_test_epoch_start', + 'on_test_start', + 'on_train_batch_end', + 'on_train_batch_start', + 'on_train_end', + 'on_train_epoch_end', + 'on_train_epoch_start', + 'on_train_start', + 'on_validation_batch_end', + 'on_validation_batch_start', + 'on_validation_end', + 'on_validation_epoch_end', + 'on_validation_epoch_start', + 'on_validation_start', + 'setup', + 'teardown', + ] + + not_supported = [ + "on_fit_end", + "on_fit_start", + "on_init_end", + "on_init_start", + 
"on_keyboard_interrupt", + "on_load_checkpoint", + "on_pretrain_routine_end", + "on_pretrain_routine_start", + "on_sanity_check_end", + "on_sanity_check_start", + "on_save_checkpoint", + "on_test_end", + "on_train_end", + "on_validation_end", + "setup", + "teardown", + ] + + assert funcs_name == callbacks_func, """Detected new callback function. + Need to add its logging permission to CallbackHookNameValidator and update this test""" + + validator = CallbackHookNameValidator() + + for func_name in funcs_name: + # This summurize where and what is currently possible to log using `self.log` function. + is_stage = "train" in func_name or "test" in func_name or "validation" in func_name + is_start = "start" in func_name or "batch" in func_name + on_step = is_stage and is_start + on_epoch = True + # creating allowed condition + allowed = ( + is_stage + or "batch" in func_name + or "epoch" in func_name + or "grad" in func_name + or "backward" in func_name + ) + allowed = ( + allowed + and "pretrain" not in func_name + and func_name not in ["on_train_end", "on_test_end", "on_validation_end"] + ) + if allowed: + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=on_step, + on_epoch=on_epoch) + if not is_start and is_stage: + with pytest.raises(MisconfigurationException, match="function supports only"): + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=True, + on_epoch=on_epoch) + else: + assert func_name in not_supported + with pytest.raises(MisconfigurationException, match="function doesn't support"): + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=on_step, + on_epoch=on_epoch) + + result = validator.check_logging_in_callbacks(current_hook_fx_name=None, + on_step=None, + on_epoch=None) + assert result is None diff --git a/tests/trainer/logging/__init__.py b/tests/trainer/logging_tests/__init__.py similarity index 100% rename from tests/trainer/logging/__init__.py rename to 
tests/trainer/logging_tests/__init__.py diff --git a/tests/trainer/logging/test_distributed_logging.py b/tests/trainer/logging_tests/test_distributed_logging.py similarity index 91% rename from tests/trainer/logging/test_distributed_logging.py rename to tests/trainer/logging_tests/test_distributed_logging.py index 5fdd021dcc0ae..a600317a024c9 100644 --- a/tests/trainer/logging/test_distributed_logging.py +++ b/tests/trainer/logging_tests/test_distributed_logging.py @@ -26,8 +26,9 @@ def on_pretrain_routine_end(self) -> None: with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m: self.trainer.logger_connector.log_metrics({'a': 2}, {}) logged_times = m.call_count - expected = 1 if self.global_rank == 0 else 0 - assert logged_times == expected, 'actual logger called from non-global zero' + expected = int(self.trainer.is_global_zero) + msg = f'actual logger called from non-global zero, logged_times: {logged_times}, expected: {expected}' + assert logged_times == expected, msg @pytest.mark.skipif(platform.system() == "Windows", diff --git a/tests/trainer/logging/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py similarity index 85% rename from tests/trainer/logging/test_eval_loop_logging_1_0.py rename to tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index bce4a23dda157..0f3217b3f004c 100644 --- a/tests/trainer/logging/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -18,7 +18,7 @@ from pytorch_lightning import Trainer from pytorch_lightning import callbacks, seed_everything from tests.base.deterministic_model import DeterministicModel -from tests.base import SimpleModule, BoringModel +from tests.base import SimpleModule, BoringModel, RandomDataset import os import torch import pytest @@ -358,3 +358,64 @@ def test_monitor_val_epoch_end(tmpdir): checkpoint_callback=checkpoint_callback, ) trainer.fit(model) + + +def 
test_multi_dataloaders_add_suffix_properly(tmpdir): + class TestModel(BoringModel): + + def test_step(self, batch, batch_idx, dataloader_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"y": loss} + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)), + torch.utils.data.DataLoader(RandomDataset(32, 64))] + + model = TestModel() + model.test_epoch_end = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + results = trainer.test(model) + assert len(results[0]) == len(results[1]) + assert "test_loss_epoch/dataloader_idx_0" in results[0] + assert "test_loss_epoch/dataloader_idx_1" in results[1] + + +def test_single_dataloader_no_suffix_added(tmpdir): + class TestModel(BoringModel): + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"y": loss} + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + model = TestModel() + model.test_epoch_end = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=5, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + results = trainer.test(model) + assert len(results) == 1 + # error : It is wrong there. 
`y` should equal test_loss_epoch + assert results[0]['test_loss'] == results[0]['y'] diff --git a/tests/trainer/logging/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py similarity index 65% rename from tests/trainer/logging/test_train_loop_logging_1_0.py rename to tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 414264894e639..60ff33b402e4b 100644 --- a/tests/trainer/logging/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -14,15 +14,22 @@ """ Tests to ensure that the training loop works with a dict (1.0) """ -from pytorch_lightning.core.lightning import LightningModule -from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset + import os -import torch +import collections import pytest +import itertools +import numpy as np +import torch +from torch.utils.data import Dataset + +import pytorch_lightning as pl +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning import Trainer, callbacks + +from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel -from torch.utils.data import Dataset def test__training_step__log(tmpdir): @@ -324,12 +331,12 @@ def training_step(self, batch, batch_idx, hiddens): assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss_val = torch.nn.functional.mse_loss( + loss = torch.nn.functional.mse_loss( pred, y_tensor.view(batch_size, truncated_bptt_steps)) - self.log('a', loss_val, on_epoch=True) + self.log('a', loss, on_epoch=True) - return {'loss': loss_val, 'hiddens': self.test_hidden} + return {'loss': loss, 'hiddens': self.test_hidden} def on_train_epoch_start(self) -> None: self.test_hidden = None @@ -398,8 +405,10 @@ def val_dataloader(self): generated = 
set(trainer.logger_connector.logged_metrics) expected = { + 'a_step', 'a_epoch', - 'n_step/epoch_0', 'n_epoch', + 'n_step/epoch_0', + 'n_epoch', 'epoch' } @@ -489,3 +498,187 @@ def validation_step(self, batch, batch_idx): weights_summary=None, ) trainer.fit(model, train_data, val_data) + + +def test_log_works_in_train_callback(tmpdir): + """ + Tests that log can be called within callback + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestCallback(callbacks.Callback): + + # helpers + count = 1 + choices = [False, True] + # used to compute expected values + callback_funcs_called = collections.defaultdict(list) + funcs_called_count = collections.defaultdict(int) + funcs_attr = {} + + def make_logging(self, pl_module: pl.LightningModule, func_name, func_idx, + on_steps=[], on_epochs=[], prob_bars=[]): + self.funcs_called_count[func_name] += 1 + for idx, (on_step, on_epoch, prog_bar) in enumerate(list(itertools.product(*[on_steps, on_epochs, prob_bars]))): + # run logging + custom_func_name = f"{func_idx}_{idx}_{func_name}" + pl_module.log(custom_func_name, self.count * func_idx, on_step=on_step, + on_epoch=on_epoch, prog_bar=prog_bar) + + # catch information for verification + + # on on_train_start is outside the main loop. Won't be called + if func_name == "on_train_start": + self.callback_funcs_called[func_name].append([self.count * func_idx]) + + # Saved only values from second epoch, so we can compute its mean or latest. 
+ if pl_module.trainer.current_epoch == 1: + self.callback_funcs_called[func_name].append([self.count * func_idx]) + + forked = on_step and on_epoch + + self.funcs_attr[custom_func_name] = { + "on_step": on_step, + "on_epoch": on_epoch, + "prog_bar": prog_bar, + "forked": forked, + "func_name": func_name} + + if on_step and on_epoch: + self.funcs_attr[f"{custom_func_name}_step"] = { + "on_step": True, + "on_epoch": False, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + self.funcs_attr[f"{custom_func_name}_epoch"] = { + "on_step": False, + "on_epoch": True, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + def on_train_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_train_start', 1, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_train_epoch_start', 3, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_start', 4, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_train_batch_start', 5, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_end', 6, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_train_batch_end', 7, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + # used to 
make sure aggregation works fine. + # we should obtain func[value * c for c in range(1, max_epochs * limit_train_batches)]) + # with func = np.mean if on_epoch else func = np.max + self.count += 1 + + def on_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_end', 8, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_epoch_end(self, trainer, pl_module, outputs): + self.make_logging(pl_module, 'on_train_epoch_end', 9, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + class TestModel(BoringModel): + + manual_loss = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.manual_loss.append(loss) + self.log('train_loss', loss) + return {"loss": loss} + + max_epochs = 2 + limit_train_batches = 2 + model = TestModel() + test_callback = TestCallback() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=0, + limit_test_batches=0, + val_check_interval=0., + num_sanity_val_steps=0, + max_epochs=max_epochs, + callbacks=[test_callback] + ) + trainer.fit(model) + + assert test_callback.funcs_called_count["on_train_start"] == 1 + assert test_callback.funcs_called_count["on_epoch_start"] == 2 + assert test_callback.funcs_called_count["on_train_epoch_start"] == 2 + assert test_callback.funcs_called_count["on_batch_start"] == 4 + assert test_callback.funcs_called_count["on_train_batch_start"] == 4 + assert test_callback.funcs_called_count["on_batch_end"] == 4 + assert test_callback.funcs_called_count["on_train_batch_end"] == 4 + assert test_callback.funcs_called_count["on_epoch_end"] == 2 + assert test_callback.funcs_called_count["on_train_epoch_end"] == 2 + + # Make sure the func_name exists within callback_metrics. 
If not, we missed some + callback_metrics_keys = [*trainer.callback_metrics.keys()] + for func_name in test_callback.callback_funcs_called.keys(): + is_in = False + for callback_metrics_key in callback_metrics_keys: + if func_name in callback_metrics_key: + is_in = True + assert is_in, (func_name, callback_metrics_keys) + + # function used to describe expected return logic + def get_expected_output(func_attr, original_values): + if func_attr["on_epoch"] and not func_attr["on_step"]: + # Apply mean on values + expected_output = np.mean(original_values) + else: + # Keep the latest value + expected_output = np.max(original_values) + return expected_output + + # Make sure the func_name output equals the average from all logged values when on_epoch true + # pop extra keys + trainer.callback_metrics.pop("debug_epoch") + assert trainer.logged_metrics["train_loss"] == model.manual_loss[-1] + assert trainer.callback_metrics["train_loss"] == model.manual_loss[-1] + trainer.callback_metrics.pop("train_loss") + + for func_name, output_value in trainer.callback_metrics.items(): + if torch.is_tensor(output_value): + output_value = output_value.item() + # get creation attr + func_attr = test_callback.funcs_attr[func_name] + + # retrived orginal logged values + original_values = test_callback.callback_funcs_called[func_attr["func_name"]] + + # compute expected output and compare to actual one + expected_output = get_expected_output(func_attr, original_values) + assert float(output_value) == float(expected_output) + + for func_name, func_attr in test_callback.funcs_attr.items(): + if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]: + assert func_name in trainer.logger_connector.progress_bar_metrics + else: + assert func_name not in trainer.logger_connector.progress_bar_metrics diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 
5f279c0b0a4db..d816c1e9bc5b1 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -11,13 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import collections import os -import torch + import pytest -from tests.base.boring_model import BoringModel, RandomDataset -from pytorch_lightning import Trainer +import torch + +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.utilities import APEX_AVAILABLE -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base.boring_model import BoringModel def test_multiple_optimizers_manual(tmpdir): @@ -355,3 +357,267 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls + + +class ManualOptimizationExtendedModel(BoringModel): + + count = 0 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update: + try: + assert not torch.equal(self.weight_before, after_before), 
self.count + except Exception: + # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" + pass + else: + try: + assert torch.equal(self.weight_before, after_before) + except Exception: + # almost no diff between before and after + assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 + assert torch.all(self.layer.weight.grad == 0) + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 10 + assert self.called["on_train_batch_start"] == 10 + assert self.called["on_train_batch_end"] == 10 + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + """ + + model = ManualOptimizationExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_detached_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + When the tensor is detached, return MisConfiguration Error. 
+ """ + + model = ManualOptimizationExtendedModel() + model.detach = True + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + expected_message = "In manual optimization, `training_step` should not return a Tensor" + with pytest.raises(Exception, match=expected_message): + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_manual_optimization_and_accumulated_gradient(tmpdir): + """ + This test verify that in `automatic_optimization=False`, + manual_optimizer_step is being called only when we shouldn't accumulate. + """ + seed_everything(234) + + class ExtendedModel(BoringModel): + + count = 1 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + @property + def should_have_updated(self): + return self.count % 4 == 0 + + @property + def has_gradient(self): + return self.layer.weight.grad is not None + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update and self.should_have_updated: + assert not 
torch.equal(self.weight_before, after_before), self.count + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.equal(self.weight_before, after_before) + if self.count > 1: + if self.count % 4 == 1: + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.sum(self.layer.weight.grad) != 0 + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 20 + assert self.called["on_train_batch_start"] == 20 + assert self.called["on_train_batch_end"] == 20 + + model = ExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=20, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accumulate_grad_batches=4, + gpus=1, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_multiple_optimizers_manual_optimizer_step(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + """ + Tests that `manual_optimizer_step` works with several optimizers + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx, optimizer_idx): + # manual + (opt_a, opt_b) = self.optimizers() + x = batch[0] + + loss_1 = self(x) + loss_1 = self.loss(loss_1, loss_1) + + # make sure there are no grads + if self.layer.weight.grad is not None: + assert torch.all(self.layer.weight.grad == 0) + + self.manual_backward(loss_1, opt_a) + self.manual_optimizer_step(opt_a) + + # fake discriminator + loss_2 = self(x) + loss_2 = self.loss(loss_2, loss_2) + + # ensure we forward the correct params to the optimizer + # without retain_graph we can't do multiple backward passes + self.manual_backward(loss_2, opt_b, retain_graph=True) + self.manual_backward(loss_2, opt_a, retain_graph=True) + + assert self.layer.weight.grad is not None + self.manual_optimizer_step(opt_b) + + def training_epoch_end(self, 
outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 + + model = TestModel() + model.val_dataloader = None + + limit_train_batches = 2 + trainer = Trainer( + automatic_optimization=False, + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + precision=16, + amp_backend='native', + gpus=1 + ) + + trainer.fit(model) + + num_manual_backward_calls = 3 + assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls diff --git a/tests/trainer/warnings/__init__.py b/tests/trainer/properties/__init__.py similarity index 100% rename from tests/trainer/warnings/__init__.py rename to tests/trainer/properties/__init__.py diff --git a/tests/trainer/properties/log_dir.py b/tests/trainer/properties/log_dir.py new file mode 100644 index 0000000000000..021bb04a7c917 --- /dev/null +++ b/tests/trainer/properties/log_dir.py @@ -0,0 +1,125 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import torch +import pytest +from tests.base.boring_model import BoringModel, RandomDataset +from pytorch_lightning import Trainer +from pytorch_lightning.utilities import APEX_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +def test_logdir(tmpdir): + """ + Tests that the path is correct when checkpoint and loggers are used + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + + expected = os.path.join(self.trainer.default_root_dir, 'lightning_logs', 'version_0') + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + ) + + trainer.fit(model) + + +def test_logdir_no_checkpoint_cb(tmpdir): + """ + Tests that the path is correct with no checkpoint + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir, 'lightning_logs', 'version_0') + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + checkpoint_callback=False + ) + + trainer.fit(model) + + +def test_logdir_no_logger(tmpdir): + """ + Tests that the path is correct even when there is no logger + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir) + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + 
default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + logger=False, + ) + + trainer.fit(model) + + +def test_logdir_no_logger_no_checkpoint(tmpdir): + """ + Tests that the path is correct even when there is no logger + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir) + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + logger=False, + checkpoint_callback=False + ) + + trainer.fit(model) diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index 7c128dc42c673..401584d920c9b 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os from copy import deepcopy import pytest import torch @@ -58,6 +59,8 @@ def test_model_reset_correctly(tmpdir): assert torch.all(torch.eq(before_state_dict[key], after_state_dict[key])), \ 'Model was not reset correctly after learning rate finder' + assert not os.path.exists(tmpdir / 'lr_find_temp_model.ckpt') + def test_trainer_reset_correctly(tmpdir): """ Check that all trainer parameters are reset correctly after lr_find() """ diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 4841d1461fec6..801e3df73068c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -32,11 +32,12 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @pytest.mark.parametrize("url_ckpt", [True, False]) @@ -67,7 +68,7 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): # assert ckpt has hparams ckpt = torch.load(new_weights_path) - assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), "module_arguments missing from checkpoints" + assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), "hyper_parameters missing from checkpoints" # load new model hparams_path = tutils.get_data_path(logger, path_dir=tmpdir) @@ -304,28 +305,27 @@ def _optimizer_step( def test_gradient_accumulation_scheduling_last_batch(tmpdir, 
accumulate_grad_batches, limit_train_batches): """ Verify optimizer.step() applied to last batch while grad accumulation """ - class CurrentModel(EvalModelTemplate): - def on_after_backward(self): - self.loss_backward = deepcopy(self.state_dict()) - - def on_before_zero_grad(self, optimizer): - self.opt_step = self.state_dict() + class CurrentModel(BoringModel): - def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - _exclude_keys = ["num_batches_tracked", "running_mean", "running_var"] + def on_batch_start(self, batch, batch_idx, dataloader_idx): + self.on_train_batch_start_state_dict = self.state_dict() - if (batch_idx + 1) == self.trainer.num_training_batches: - for key in self.loss_backward.keys(): - # exclude the check for batch_norm parameters - if not any([k in key for k in _exclude_keys]): - assert not torch.equal(self.loss_backward[key], self.opt_step[key]) + def on_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.on_train_batch_start_end_dict = self.state_dict() + for key in self.on_train_batch_start_end_dict.keys(): + if (batch_idx + 1) == self.trainer.num_training_batches: + assert torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]) + else: + assert not torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]) model = CurrentModel() trainer = Trainer( accumulate_grad_batches=accumulate_grad_batches, - max_epochs=4, + max_epochs=2, limit_train_batches=limit_train_batches, + limit_val_batches=0, + limit_test_batches=0, default_root_dir=tmpdir, ) @@ -429,7 +429,7 @@ def mock_save_function(filepath, *args): losses = [10, 9, 2.8, 5, 2.5] checkpoint_callback = ModelCheckpoint( - dirpath=tmpdir, monitor='checkpoint_on', save_top_k=save_top_k, + dirpath=tmpdir, filename='{epoch}', monitor='checkpoint_on', save_top_k=save_top_k, save_last=save_last, prefix=file_prefix, verbose=1 ) checkpoint_callback.save_function = mock_save_function @@ 
-747,6 +747,68 @@ def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k): assert trainer.tested_ckpt_path == ckpt_path +def test_disabled_training(tmpdir): + """Verify that `limit_train_batches=0` disables the training loop unless `fast_dev_run=True`.""" + + class CurrentModel(BoringModel): + + training_step_invoked = False + training_epoch_end_invoked = False + + def training_step(self, *args, **kwargs): + self.training_step_invoked = True + return super().training_step(*args, **kwargs) + + def training_epoch_end(self, *args, **kwargs): + self.training_epoch_end_invoked = True + return super().training_epoch_end(*args, **kwargs) + + model = CurrentModel() + + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=2, + limit_train_batches=0.0, + limit_val_batches=0.2, + fast_dev_run=False, + ) + + before_state_dict = deepcopy(model.state_dict()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + after_state_dict = model.state_dict() + + for key in before_state_dict.keys(): + assert torch.all(torch.eq(before_state_dict[key], after_state_dict[key])) + + # check that limit_train_batches=0 turns off training + assert result == 1, "training failed to complete" + assert trainer.current_epoch == 0 + assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`" + assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`" + + # check that limit_train_batches has no influence when fast_dev_run is turned on + model = CurrentModel() + trainer_options.update(fast_dev_run=True) + before_state_dict = deepcopy(model.state_dict()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + after_state_dict = model.state_dict() + + for key in before_state_dict.keys(): + assert not torch.all(torch.eq(before_state_dict[key], after_state_dict[key])) + + assert result == 1, "training failed to complete" + assert 
trainer.current_epoch == 0 + assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`" + assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`" + + def test_disabled_validation(tmpdir): """Verify that `limit_val_batches=0` disables the validation loop unless `fast_dev_run=True`.""" @@ -1408,3 +1470,32 @@ def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, l trainer.fit(model) expected_calls = [call(metrics=ANY, step=s) for s in range(log_interval - 1, max_steps, log_interval)] log_metrics_mock.assert_has_calls(expected_calls) + + +@pytest.mark.parametrize(['profiler', 'expected'], [ + (None, PassThroughProfiler), + (SimpleProfiler(), SimpleProfiler), + (AdvancedProfiler(), AdvancedProfiler), + ('simple', SimpleProfiler), + ('Simple', SimpleProfiler), + ('advanced', AdvancedProfiler), +]) +def test_trainer_profiler_correct_args(profiler, expected): + kwargs = {'profiler': profiler} if profiler is not None else {} + trainer = Trainer(**kwargs) + assert isinstance(trainer.profiler, expected) + + +def test_trainer_profiler_incorrect_str_arg(): + with pytest.raises(ValueError, match=r".*can only be 'simple' or 'advanced'"): + Trainer(profiler="unknown_profiler") + + +@pytest.mark.parametrize('profiler', ( + 42, [42], {"a": 42}, torch.tensor(42), Trainer(), +)) +def test_trainer_profiler_incorrect_arg_type(profiler): + with pytest.raises(MisconfigurationException, + match=r"Only None, bool, str and subclasses of `BaseProfiler` " + r"are valid values for `Trainer`'s `profiler` parameter. *"): + Trainer(profiler=profiler) diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py index 8743ba4e5ba7b..dd8b2689d8ed5 100755 --- a/tests/trainer/test_trainer_tricks.py +++ b/tests/trainer/test_trainer_tricks.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import os from copy import deepcopy import pytest import torch @@ -229,6 +230,8 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg): assert before_batch_size != after_batch_size, \ 'Batch size was not altered after running auto scaling of batch size' + assert not os.path.exists(tmpdir / 'scale_batch_size_temp_model.ckpt') + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.parametrize('use_hparams', [True, False]) diff --git a/tests/trainer/warnings_tests/__init__.py b/tests/trainer/warnings_tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/trainer/warnings/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py similarity index 89% rename from tests/trainer/warnings/test_flow_warnings.py rename to tests/trainer/warnings_tests/test_flow_warnings.py index 298237ad930dc..9893a76522851 100644 --- a/tests/trainer/warnings/test_flow_warnings.py +++ b/tests/trainer/warnings_tests/test_flow_warnings.py @@ -17,17 +17,18 @@ import warnings +class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + return acc + + def test_no_depre_without_epoch_end(tmpdir): """ Tests that only training_step can be used """ os.environ['PL_DEV_DEBUG'] = '1' - class TestModel(BoringModel): - def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - return acc - model = TestModel() model.validation_epoch_end = None diff --git a/tests/tuner/__init__.py b/tests/tuner/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tuner/test_auto_gpu_select.py b/tests/tuner/test_auto_gpu_select.py new file mode 100644 index 0000000000000..36b33a707b99f --- /dev/null +++ b/tests/tuner/test_auto_gpu_select.py @@ -0,0 +1,74 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +import pytest +import torch + +from pytorch_lightning import Trainer +from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires a number of GPU machine greater than 1" +) +@pytest.mark.parametrize( + ["auto_select_gpus", "gpus", "expected_error"], + [ + (True, 0, MisconfigurationException), + (True, -1, None), + (False, 0, None), + (False, -1, None), + ], +) +def test_trainer_with_gpus_options_combination_at_available_gpus_env( + auto_select_gpus, gpus, expected_error +): + if expected_error: + with pytest.raises( + expected_error, + match=re.escape( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." 
+ ), + ): + trainer = Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus) + else: + trainer = Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus) + + +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires a number of GPU machine greater than 1" +) +@pytest.mark.parametrize( + ["nb", "expected_gpu_idxs", "expected_error"], + [ + (0, [], MisconfigurationException), + (-1, [i for i in range(torch.cuda.device_count())], None), + (1, [0], None), + ], +) +def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error): + if expected_error: + with pytest.raises( + expected_error, + match=re.escape( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." + ), + ): + pick_multiple_gpus(nb) + else: + assert expected_gpu_idxs == pick_multiple_gpus(nb) diff --git a/tests/utilities/parsing.py b/tests/utilities/parsing.py index 13cfeaa64b01a..056590f1a6d35 100644 --- a/tests/utilities/parsing.py +++ b/tests/utilities/parsing.py @@ -24,6 +24,7 @@ class TestHparamsNamespace: class TestModel1: # test for namespace learning_rate = 0 + model1 = TestModel1() class TestModel2: # test for hparams namespace @@ -41,12 +42,23 @@ class TestModel4: # fail case model4 = TestModel4() - return model1, model2, model3, model4 + class DataModule: + batch_size = 8 + + class Trainer: + datamodule = DataModule + + class TestModel5: # test for datamodule + trainer = Trainer + + model5 = TestModel5() + + return model1, model2, model3, model4, model5 def test_lightning_hasattr(tmpdir): """ Test that the lightning_hasattr works in all cases""" - model1, model2, model3, model4 = _get_test_cases() + model1, model2, model3, model4, model5 = _get_test_cases() assert lightning_hasattr(model1, 'learning_rate'), \ 'lightning_hasattr failed to find namespace variable' assert lightning_hasattr(model2, 'learning_rate'), \ @@ -55,6 +67,8 @@ def test_lightning_hasattr(tmpdir): 'lightning_hasattr failed 
to find hparams dict variable' assert not lightning_hasattr(model4, 'learning_rate'), \ 'lightning_hasattr found variable when it should not' + assert lightning_hasattr(model5, 'batch_size'), \ + 'lightning_hasattr failed to find batch_size in datamodule' def test_lightning_getattr(tmpdir): @@ -64,6 +78,10 @@ def test_lightning_getattr(tmpdir): value = lightning_getattr(m, 'learning_rate') assert value == i, 'attribute not correctly extracted' + model5 = models[4] + assert lightning_getattr(model5, 'batch_size') == 8, \ + 'batch_size not correctly extracted' + def test_lightning_setattr(tmpdir): """ Test that the lightning_setattr works in all cases""" @@ -72,3 +90,8 @@ def test_lightning_setattr(tmpdir): lightning_setattr(m, 'learning_rate', 10) assert lightning_getattr(m, 'learning_rate') == 10, \ 'attribute not correctly set' + + model5 = models[4] + lightning_setattr(model5, 'batch_size', 128) + assert lightning_getattr(model5, 'batch_size') == 128, \ + 'batch_size not correctly set' diff --git a/tests/utilities/test_argparse_utils.py b/tests/utilities/test_argparse_utils.py new file mode 100644 index 0000000000000..978ad820482b2 --- /dev/null +++ b/tests/utilities/test_argparse_utils.py @@ -0,0 +1,50 @@ +from pytorch_lightning.utilities.argparse_utils import parse_args_from_docstring + + +def test_parse_args_from_docstring_normal(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + root: Root directory of dataset where ``MNIST/processed/training.pt`` + and ``MNIST/processed/test.pt`` exist. + train: If ``True``, creates dataset from ``training.pt``, + otherwise from ``test.pt``. + normalize: mean and std deviation of the MNIST dataset. + download: If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. 
+ num_samples: number of examples per selected class/digit + digits: list selected MNIST digits/classes + + Examples: + >>> dataset = TrialMNIST(download=True) + >>> len(dataset) + 300 + >>> sorted(set([d.item() for d in dataset.targets])) + [0, 1, 2] + >>> torch.bincount(dataset.targets) + tensor([100, 100, 100]) + """ + ) + + expected_args = ['root', 'train', 'normalize', 'download', 'num_samples', 'digits'] + assert len(args_help.keys()) == len(expected_args) + assert all([x == y for x, y in zip(args_help.keys(), expected_args)]) + assert args_help['root'] == 'Root directory of dataset where ``MNIST/processed/training.pt``' \ + ' and ``MNIST/processed/test.pt`` exist.' + assert args_help['normalize'] == 'mean and std deviation of the MNIST dataset.' + + +def test_parse_args_from_docstring_empty(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + + Returns: + + Examples: + """ + ) + assert len(args_help.keys()) == 0 diff --git a/tests/utilities/test_xla_device_utils.py b/tests/utilities/test_xla_device_utils.py index b0a3497a0f3be..10de63db049e7 100644 --- a/tests/utilities/test_xla_device_utils.py +++ b/tests/utilities/test_xla_device_utils.py @@ -11,13 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import time + import pytest -from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils as xdu +import pytorch_lightning.utilities.xla_device_utils as xla_utils from tests.base.develop_utils import pl_multi_process_test try: import torch_xla.core.xla_model as xm + XLA_AVAILABLE = True except ImportError as e: XLA_AVAILABLE = False @@ -26,13 +29,13 @@ @pytest.mark.skipif(XLA_AVAILABLE, reason="test requires torch_xla to be absent") def test_tpu_device_absence(): """Check tpu_device_exists returns None when torch_xla is not available""" - assert xdu.tpu_device_exists() is None + assert xla_utils.XLADeviceUtils.tpu_device_exists() is None @pytest.mark.skipif(not XLA_AVAILABLE, reason="test requires torch_xla to be installed") def test_tpu_device_presence(): """Check tpu_device_exists returns True when TPU is available""" - assert xdu.tpu_device_exists() is True + assert xla_utils.XLADeviceUtils.tpu_device_exists() is True @pytest.mark.skipif(not XLA_AVAILABLE, reason="test requires torch_xla to be installed") @@ -42,3 +45,14 @@ def test_xla_device_is_a_tpu(): device = xm.xla_device() device_type = xm.xla_device_hw(device) return device_type == "TPU" + + +def test_result_returns_within_10_seconds(): + """Check that pl_multi_process returns within 10 seconds""" + + start = time.time() + result = xla_utils.pl_multi_process(time.sleep)(25) + end = time.time() + elapsed_time = int(end - start) + assert elapsed_time <= 10 + assert result is False