Lightning-AI
diff --git a/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/source/tpu.rst‎
Lines changed: 25 additions & 4 deletions b/‎docs/source/tpu.rst‎
Lines changed: 25 additions & 4 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 0 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎pytorch_lightning/accelerators/accelerator_connector.py‎
Lines changed: 24 additions & 13 deletions b/‎pytorch_lightning/accelerators/accelerator_connector.py‎
Lines changed: 24 additions & 13 deletions
diff --git a/‎pytorch_lightning/accelerators/horovod_accelerator.py‎
Lines changed: 4 additions & 4 deletions b/‎pytorch_lightning/accelerators/horovod_accelerator.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎pytorch_lightning/callbacks/early_stopping.py‎
Lines changed: 5 additions & 7 deletions b/‎pytorch_lightning/callbacks/early_stopping.py‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎pytorch_lightning/callbacks/gpu_stats_monitor.py‎
Lines changed: 2 additions & 2 deletions b/‎pytorch_lightning/callbacks/gpu_stats_monitor.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎pytorch_lightning/callbacks/model_checkpoint.py‎
Lines changed: 3 additions & 5 deletions b/‎pytorch_lightning/callbacks/model_checkpoint.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎pytorch_lightning/core/lightning.py‎
Lines changed: 2 additions & 11 deletions b/‎pytorch_lightning/core/lightning.py‎
Lines changed: 2 additions & 11 deletions
diff --git a/‎pytorch_lightning/core/memory.py‎
Lines changed: 2 additions & 2 deletions b/‎pytorch_lightning/core/memory.py‎
Lines changed: 2 additions & 2 deletions
@@ -65,6 +65,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed `iou` [func] to allow float input ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704))
 
 
+- Changed `callbacks` argument in `Trainer` to allow `Callback` input ([#5446](https://github.com/PyTorchLightning/pytorch-lightning/pull/5446))
+
+
 ### Deprecated
 
 - `stat_scores_multiple_classes` is deprecated in favor of `stat_scores` ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839))
 
@@ -40,7 +40,7 @@ To access TPUs, there are three main ways.
 ----------------
 
 Colab TPUs
------------
+----------
 Colab is like a jupyter notebook with a free GPU or TPU
 hosted on GCP.
 
@@ -129,8 +129,7 @@ That's it! Your model will train on all 8 TPU cores.
 ----------------
 
 TPU core training
-
-------------------------
+-----------------
 
 Lightning supports training on a single TPU core or 8 TPU cores.
 
@@ -177,7 +176,7 @@ on how to set up the instance groups and VMs needed to run TPU Pods.
 ----------------
 
 16 bit precision
------------------
+----------------
 Lightning also supports training in 16-bit precision with TPUs.
 By default, TPU training will use 32-bit precision. To enable 16-bit,
 set the 16-bit flag.
@@ -194,6 +193,28 @@ Under the hood the xla library will use the `bfloat16 type <https://en.wikipedia
 
 ----------------
 
+Performance considerations
+--------------------------
+
+The TPU was designed for specific workloads and operations to carry out large volumes of matrix multiplication,
+convolution operations and other commonly used ops in applied deep learning.
+The specialization makes it a strong choice for NLP tasks, sequential convolutional networks, and low-precision operation.
+There are cases in which training on TPUs is slower than on GPUs, for the possible reasons listed below:
+
+- The batch size is too small.
+- Explicit evaluation of tensors during training, e.g. ``tensor.item()``.
+- Tensor shapes (e.g. model inputs) change often during training.
+- Limited resources when using TPUs with PyTorch `Link <https://github.com/pytorch/xla/issues/2054#issuecomment-627367729>`_
+- XLA Graph compilation during the initial steps `Reference <https://github.com/pytorch/xla/issues/2383#issuecomment-666519998>`_
+- Some tensor ops are not fully supported on TPU, or not supported at all. These operations will be performed on CPU (context switch).
+- PyTorch integration is still experimental. Some performance bottlenecks may simply be the result of unfinished implementation.
+
+The official PyTorch XLA `performance guide <https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md#known-performance-caveats>`_
+has more detailed information on how PyTorch code can be optimized for TPU. In particular, the
+`metrics report <https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md#get-a-metrics-report>`_ allows
+one to identify operations that lead to context switching.
+
+
 About XLA
 ----------
 XLA is the library that interfaces PyTorch with the TPUs.
 
@@ -39,15 +39,13 @@ skip_glob = [
     "tests/backends/*",
     "tests/base/*",
     "tests/callbacks/*",
-    "tests/checkpointing/*",
     "tests/core/*",
     "tests/loggers/*",
     "tests/metrics/*",
     "tests/models/*",
     "tests/plugins/*",
     "tests/trainer/*",
     "tests/tuner/*",
-    "tests/utilities/*",
 ]
 profile = "black"
 line_length = 120
 
@@ -185,14 +185,21 @@ def select_accelerator(self):
         # ----------------------------------
         # choose an accelerator for the user
         # ----------------------------------
-        use_slurm_ddp = self.trainer.use_ddp and self.trainer.is_slurm_managing_tasks
+        use_slurm_ddp = (
+            self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
+            and self.trainer.is_slurm_managing_tasks
+        )
 
         # torchelastic or general non_slurm ddp
         te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ)
-        use_torchelastic_ddp = self.trainer.use_ddp and te_flags_passed
+        use_torchelastic_ddp = (
+            self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) and te_flags_passed
+        )
 
-        use_ddp_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_spawn"
-        use_ddp_cpu_spawn = self.trainer.use_ddp and self.trainer.distributed_backend == "ddp_cpu"
+        use_ddp_cpu_spawn = (
+            self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
+            and self.trainer._device_type == DeviceType.CPU
+        )
 
         use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self._is_using_torchelastic()
         use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.trainer.is_slurm_managing_tasks
@@ -204,8 +211,9 @@ def select_accelerator(self):
 
         cluster_env = self._select_environment()
 
+        # TODO: clean up this branching, as most branches just select a class and use the very same arguments
         # choose the appropriate accelerator backend
-        if self.trainer.use_ddp2:
+        if self.trainer._distrib_type == DistributedType.DDP2:
             accelerator_backend = accelerators.DDP2Accelerator(
                 self.trainer,
                 cluster_env,
@@ -240,7 +248,7 @@ def select_accelerator(self):
                 self.trainer.plugin_connector.ddp_plugin
             )
 
-        elif use_ddp_spawn:
+        elif self.trainer._distrib_type == DistributedType.DDP_SPAWN:
             accelerator_backend = accelerators.DDPSpawnAccelerator(
                 self.trainer,
                 nprocs=self.trainer.num_processes,
@@ -263,16 +271,16 @@ def select_accelerator(self):
                 ddp_plugin=self.trainer.plugin_connector.ddp_plugin
             )
 
-        elif self.trainer.use_dp:
+        elif self.trainer._distrib_type == DistributedType.DP:
             accelerator_backend = accelerators.DataParallelAccelerator(self.trainer, cluster_env)
 
-        elif self.trainer.use_horovod:
+        elif self.trainer._distrib_type == DistributedType.HOROVOD:
             accelerator_backend = accelerators.HorovodAccelerator(self.trainer, cluster_env)
 
-        elif self.trainer.use_single_gpu:
+        elif self.trainer._device_type == DeviceType.GPU and self.trainer.num_gpus == 1:
             accelerator_backend = accelerators.GPUAccelerator(self.trainer, cluster_env)
 
-        elif self.trainer.use_tpu:
+        elif self.trainer._device_type == DeviceType.TPU:
             accelerator_backend = accelerators.TPUAccelerator(self.trainer, cluster_env)
 
         elif self.trainer.distributed_backend is None:
@@ -347,13 +355,16 @@ def set_distributed_mode(self):
             self._set_horovod_backend()
 
         # throw error to force user ddp or ddp2 choice
-        if self.trainer.num_nodes > 1 and self.trainer._distrib_type not in (DistributedType.DDP2, DistributedType.DDP):
+        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
+        if (self.trainer.num_nodes > 1 and self.trainer._distrib_type not in _ddp):
             raise MisconfigurationException(
                 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
                 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
             )
 
-        rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}')
+        rank_zero_info(
+            f'GPU available: {torch.cuda.is_available()}, used: {self.trainer._device_type == DeviceType.GPU}'
+        )
         num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0
         rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores')
 
@@ -366,7 +377,7 @@ def _set_horovod_backend(self):
 
         # Initialize Horovod to get rank / size info
         hvd.init()
-        if self.trainer.on_gpu:
+        if self.trainer._device_type == DeviceType.GPU:
             # Horovod assigns one local GPU per process
             self.trainer.root_gpu = hvd.local_rank()
 
 
@@ -19,7 +19,7 @@
 
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.cluster_environments import ClusterEnvironment
-from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType
+from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType, DeviceType
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
 if _HOROVOD_AVAILABLE:
@@ -46,7 +46,7 @@ def setup(self, model):
         # call setup after the ddp process has connected
         self.trainer.call_setup_hook(model)
 
-        if torch.cuda.is_available() and self.trainer.on_gpu:
+        if torch.cuda.is_available() and self.trainer._device_type == DeviceType.GPU:
             # Horovod: pin GPU to local rank
             assert self.trainer.root_gpu == hvd.local_rank()
             torch.cuda.set_device(self.trainer.root_gpu)
@@ -116,7 +116,7 @@ def train(self):
         return results
 
     def _step(self, model_step: Callable, args):
-        if self.trainer.on_gpu:
+        if self.trainer._device_type == DeviceType.GPU:
             args[0] = self.batch_to_device(args[0], hvd.local_rank())
 
         if self.trainer.amp_backend == AMPType.NATIVE:
@@ -141,7 +141,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
         optimizer.synchronize()
 
     def on_train_epoch_end(self, outputs):
-        hvd.join(hvd.local_rank() if self.trainer.on_gpu else -1)
+        hvd.join(hvd.local_rank() if self.trainer._device_type == DeviceType.GPU else -1)
 
     def barrier(self, name: Optional[str] = None):
         hvd.join()
 
@@ -25,6 +25,7 @@
 
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class EarlyStopping(Callback):
@@ -96,15 +97,12 @@ def __init__(
         self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf
 
     def __init_monitor_mode(self):
-        # TODO: Update with MisconfigurationException when auto mode is removed in v1.3
         if self.mode not in self.mode_dict and self.mode != 'auto':
-            if self.verbose > 0:
-                rank_zero_warn(
-                    f'EarlyStopping mode={self.mode} is unknown, fallback to auto mode.',
-                    RuntimeWarning,
-                )
-            self.mode = 'auto'
+            raise MisconfigurationException(
+                f"`mode` can be auto, {', '.join(self.mode_dict.keys())}, got {self.mode}"
+            )
 
+        # TODO: Update with MisconfigurationException when auto mode is removed in v1.3
         if self.mode == 'auto':
             rank_zero_warn(
                 "mode='auto' is deprecated in v1.1 and will be removed in v1.3."
 
@@ -27,7 +27,7 @@
 from typing import Dict, List, Tuple
 
 from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.utilities import rank_zero_only
+from pytorch_lightning.utilities import rank_zero_only, DeviceType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.parsing import AttributeDict
 
@@ -104,7 +104,7 @@ def on_train_start(self, trainer, *args, **kwargs):
                 'Cannot use GPUStatsMonitor callback with Trainer that has no logger.'
             )
 
-        if not trainer.on_gpu:
+        if trainer._device_type != DeviceType.GPU:
             raise MisconfigurationException(
                 'You are using GPUStatsMonitor but are not running on GPU'
                 f' since gpus attribute in Trainer is set to {trainer.gpus}.'
 
@@ -287,14 +287,12 @@ def __init_monitor_mode(self, monitor, mode):
             "max": (-torch_inf, "max"),
         }
 
-        # TODO: Update with MisconfigurationException when auto mode is removed in v1.3
         if mode not in mode_dict and mode != 'auto':
-            rank_zero_warn(
-                f"ModelCheckpoint mode {mode} is unknown, fallback to auto mode",
-                RuntimeWarning,
+            raise MisconfigurationException(
+                f"`mode` can be auto, {', '.join(mode_dict.keys())}, got {mode}"
             )
-            mode = "auto"
 
+        # TODO: Update with MisconfigurationException when auto mode is removed in v1.3
         if mode == 'auto':
             rank_zero_warn(
                 "mode='auto' is deprecated in v1.1 and will be removed in v1.3."
 
@@ -85,17 +85,8 @@ def __init__(self, *args, **kwargs):
         #: Pointer to the logger object
         self.logger = None
 
-        #: True if using dp
-        self.use_dp = False
-
-        #: True if using ddp
-        self.use_ddp = False
-
-        #: True if using ddp2
-        self.use_ddp2 = False
-
-        # True if on tpu
-        self.use_tpu = False
+        self._distrib_type = None
+        self._device_type = None
 
         #: True if using amp
         self.use_amp = False
 
@@ -23,7 +23,7 @@
 import torch.nn as nn
 from torch.utils.hooks import RemovableHandle
 
-from pytorch_lightning.utilities import AMPType
+from pytorch_lightning.utilities import AMPType, DeviceType
 
 PARAMETER_NUM_UNITS = [" ", "K", "M", "B", "T"]
 UNKNOWN_SIZE = "?"
@@ -229,7 +229,7 @@ def _forward_example_input(self) -> None:
         input_ = model.example_input_array
         input_ = model.transfer_batch_to_device(input_, model.device)
 
-        if trainer is not None and trainer.amp_backend == AMPType.NATIVE and not trainer.use_tpu:
+        if trainer is not None and trainer.amp_backend == AMPType.NATIVE and trainer._device_type != DeviceType.TPU:
             model.forward = torch.cuda.amp.autocast()(model.forward)
 
         mode = model.training