Lightning-AI
diff --git a/‎pytorch_lightning/callbacks/quantization.py‎
Lines changed: 1 addition & 1 deletion b/‎pytorch_lightning/callbacks/quantization.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pytorch_lightning/distributed/dist.py‎
Lines changed: 2 additions & 3 deletions b/‎pytorch_lightning/distributed/dist.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎pytorch_lightning/overrides/torch_distributed.py‎
Lines changed: 99 additions & 0 deletions b/‎pytorch_lightning/overrides/torch_distributed.py‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎pytorch_lightning/plugins/training_type/ddp.py‎
Lines changed: 10 additions & 5 deletions b/‎pytorch_lightning/plugins/training_type/ddp.py‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎pytorch_lightning/plugins/training_type/ddp_spawn.py‎
Lines changed: 10 additions & 6 deletions b/‎pytorch_lightning/plugins/training_type/ddp_spawn.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎pytorch_lightning/trainer/connectors/accelerator_connector.py‎
Lines changed: 4 additions & 1 deletion b/‎pytorch_lightning/trainer/connectors/accelerator_connector.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎pytorch_lightning/utilities/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎pytorch_lightning/utilities/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pytorch_lightning/utilities/auto_restart.py‎
Lines changed: 26 additions & 9 deletions b/‎pytorch_lightning/utilities/auto_restart.py‎
Lines changed: 26 additions & 9 deletions
diff --git a/‎pytorch_lightning/utilities/cloud_io.py‎
Lines changed: 8 additions & 1 deletion b/‎pytorch_lightning/utilities/cloud_io.py‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎pytorch_lightning/utilities/imports.py‎
Lines changed: 2 additions & 1 deletion b/‎pytorch_lightning/utilities/imports.py‎
Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,7 @@
 if _TORCH_GREATER_EQUAL_1_8:
     from torch.quantization import FakeQuantizeBase
 else:
-    # For torch 1.7.
+    # For torch 1.6 and 1.7.
     from torch.quantization import FakeQuantize as FakeQuantizeBase
 
 import pytorch_lightning as pl
 
@@ -13,8 +13,7 @@
 # limitations under the License.
 from typing import Any
 
-import torch.distributed
-
+from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
 from pytorch_lightning.utilities import rank_zero_deprecation
 from pytorch_lightning.utilities.distributed import group as _group
 
@@ -41,6 +40,6 @@ def broadcast(self, obj: Any, group=_group.WORLD):
         if self.rank != 0:
             obj = [None] * len(obj)
 
-        torch.distributed.broadcast_object_list(obj, 0, group=group or _group.WORLD)
+        broadcast_object_list(obj, 0, group=group or _group.WORLD)
 
         return obj[0]
@@ -0,0 +1,99 @@
+import logging
+import pickle
+
+import torch
+
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
+
+log = logging.getLogger(__name__)
+
+if torch.distributed.is_available():
+    from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember
+
+# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py`
+# and enables object broadcasting on PyTorch versions older than 1.8.
+
+
+# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160
+def _rank_not_in_group(group):
+    """Tell whether the current process's rank is excluded from ``group``.
+
+    A ``group`` of ``None`` is never reported as excluded."""
+    return group is not None and group == GroupMember.NON_GROUP_MEMBER
+
+
+# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164
+def _object_to_tensor(obj):
+    """Pickle ``obj`` and return ``(byte_tensor, size_tensor)`` for the serialized payload."""
+    payload = pickle.dumps(obj)
+    storage = torch.ByteStorage.from_buffer(payload)  # type: ignore[attr-defined]
+    data = torch.ByteTensor(storage)
+    return data, torch.LongTensor([data.numel()])
+
+
+# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py
+def _tensor_to_object(tensor, tensor_size):
+    """Unpickle the object stored in the first ``tensor_size`` bytes of ``tensor``."""
+    raw = tensor.numpy().tobytes()
+    return pickle.loads(raw[:tensor_size])
+
+
+# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327
+def _broadcast_object_list(object_list, src=0, group=None):
+    """Broadcast the picklable items of ``object_list`` from rank ``src``; other ranks are overwritten in place."""
+    if _rank_not_in_group(group):
+        return
+
+    my_rank = get_rank()
+    # Serialize object_list elements to tensors on src rank.
+    if my_rank == src:
+        tensor_list, size_list = zip(*(_object_to_tensor(obj) for obj in object_list))
+        object_sizes_tensor = torch.cat(size_list)
+    else:
+        object_sizes_tensor = torch.LongTensor(len(object_list))
+
+    group_backend = get_backend(group)
+    is_nccl_backend = group_backend == Backend.NCCL
+    current_device = torch.device("cpu")
+    if is_nccl_backend:
+        # The NCCL backend path moves the buffers to the current CUDA device.
+        # We cannot simply use my_rank since rank == device is not necessarily
+        # true.
+        current_device = torch.device("cuda", torch.cuda.current_device())
+        object_sizes_tensor = object_sizes_tensor.to(current_device)
+
+    # Broadcast object sizes
+    broadcast(object_sizes_tensor, src=src, group=group)
+
+    # Concatenate and broadcast serialized object tensors
+    if my_rank == src:
+        object_tensor = torch.cat(tensor_list)
+    else:
+        object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item())
+
+    if is_nccl_backend:
+        object_tensor = object_tensor.to(current_device)
+
+    broadcast(object_tensor, src=src, group=group)
+
+    # Deserialize objects using their stored sizes (as plain ints, so slicing never indexes with tensors).
+    offset = 0
+    if my_rank != src:
+        for i, obj_size in enumerate(object_sizes_tensor.tolist()):
+            obj_view = object_tensor[offset : offset + obj_size]
+            obj_view = obj_view.type(torch.ByteTensor)  # type: ignore[call-overload]
+            offset += obj_size
+            object_list[i] = _tensor_to_object(obj_view, obj_size)
+
+
+if not torch.distributed.is_available():
+    # Without `torch.distributed` (e.g. early PyTorch versions for Windows) the functions used
+    # by `broadcast_object_list` cannot be imported, so expose a no-op that returns its input.
+    def _broadcast_noop(obj, *_, **__):
+        return obj
+
+    broadcast_object_list = _broadcast_noop
+elif _TORCH_GREATER_EQUAL_1_8:  # upstream implementation is used from PyTorch 1.8 onwards
+    from torch.distributed.distributed_c10d import broadcast_object_list
+else:  # older PyTorch: use the copy defined in this module
+    broadcast_object_list = _broadcast_object_list
@@ -34,6 +34,7 @@
 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.distributed import prepare_for_backward
+from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
@@ -42,6 +43,7 @@
     _FAIRSCALE_AVAILABLE,
     _HYDRA_AVAILABLE,
     _IS_WINDOWS,
+    _TORCH_GREATER_EQUAL_1_7,
     _TORCH_GREATER_EQUAL_1_8,
     _TORCH_GREATER_EQUAL_1_9,
     _TORCH_GREATER_EQUAL_1_10,
@@ -285,12 +287,15 @@ def pre_configure_ddp(self):
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
         # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
         self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-        if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get(
-            "find_unused_parameters", False
+        # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
+        if (
+            _TORCH_GREATER_EQUAL_1_7
+            and not self.lightning_module.automatic_optimization
+            and not self._ddp_kwargs.get("find_unused_parameters", False)
         ):
-            # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
-                "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
+                "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` "
+                "to properly work with DDP."
             )
             self._ddp_kwargs["find_unused_parameters"] = True
 
@@ -393,7 +398,7 @@ def broadcast(self, obj: object, src: int = 0) -> object:
         obj = [obj]
         if self.global_rank != src:
             obj = [None]
-        torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
+        broadcast_object_list(obj, src, group=_group.WORLD)
         return obj[0]
 
     def pre_backward(self, closure_loss: torch.Tensor) -> None:
 
@@ -27,11 +27,12 @@
 import pytorch_lightning as pl
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.distributed import prepare_for_backward
+from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.trainer.states import TrainerFn
-from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, rank_zero_warn
+from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, rank_zero_warn
 from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
@@ -245,12 +246,15 @@ def pre_configure_ddp(self):
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
         # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
         self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-        if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get(
-            "find_unused_parameters", False
+        # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
+        if (
+            _TORCH_GREATER_EQUAL_1_7
+            and not self.lightning_module.automatic_optimization
+            and not self._ddp_kwargs.get("find_unused_parameters", False)
         ):
-            # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
-                "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
+                "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` "
+                "to properly work with DDP."
             )
             self._ddp_kwargs["find_unused_parameters"] = True
 
@@ -327,7 +331,7 @@ def broadcast(self, obj: object, src: int = 0) -> object:
         obj = [obj]
         if self.global_rank != src:
             obj = [None]
-        torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
+        broadcast_object_list(obj, src, group=_group.WORLD)
         return obj[0]
 
     def model_to_device(self):
 
@@ -74,6 +74,7 @@
 from pytorch_lightning.utilities.imports import (
     _HOROVOD_AVAILABLE,
     _IPU_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_7,
     _TORCH_GREATER_EQUAL_1_8,
     _TPU_AVAILABLE,
 )
@@ -189,8 +190,10 @@ def _init_deterministic(self, deterministic: bool) -> None:
         self.deterministic = deterministic
         if _TORCH_GREATER_EQUAL_1_8:
             torch.use_deterministic_algorithms(deterministic)
-        else:
+        elif _TORCH_GREATER_EQUAL_1_7:
             torch.set_deterministic(deterministic)
+        else:  # the minimum version Lightning supports is PyTorch 1.6
+            torch._set_deterministic(deterministic)
         if deterministic:
             # fixing non-deterministic part of horovod
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
 
@@ -44,6 +44,7 @@
     _OMEGACONF_AVAILABLE,
     _POPTORCH_AVAILABLE,
     _RICH_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_7,
     _TORCH_GREATER_EQUAL_1_8,
     _TORCH_GREATER_EQUAL_1_9,
     _TORCH_GREATER_EQUAL_1_10,
 
@@ -305,6 +305,9 @@ def _wrap_generator_samplers(self) -> None:
         # access wrapped dataset attributes
         dataset_dict = self.dataset.__dict__
 
+        # create a tuple of sampler names
+        samplers_names = tuple(v.__class__.__name__ for k, v in dataset_dict.items() if isinstance(v, Sampler))
+
         # create a dictionary of generator present within the dataset attributes
         dataset_sampler_generators = {k: v for k, v in dataset_dict.items() if isinstance(v, (Generator, Iterator))}
 
@@ -315,17 +318,31 @@ def _wrap_generator_samplers(self) -> None:
             if isinstance(generator, Sampler):
                 continue
 
-            # wrap the generator into a `FastForwardSampler`
-            sampler = FastForwardSampler(generator, attr_name=generator_attr_name)
+            # used to handle a weird behaviour from PyTorch 1.6
+            # where the sampler is converted to a list_iterator
+            is_legacy = False
+
+            if isinstance(generator, Generator):
+                # Generator names have the form `SamplerName.__iter__`
+                generator_name = generator.__qualname__.split(".")[0]
+            else:
+                # assume the retrieved iterator is coming from sampler.
+                is_legacy = True
+
+            # validate the base generator name matches a sampler name.
+            if is_legacy or any(sampler_name == generator_name for sampler_name in samplers_names):
+
+                # wrap the generator into a `FastForwardSampler`
+                sampler = FastForwardSampler(generator, attr_name=generator_attr_name)
 
-            # if `CaptureIterableDataset` was available, the sampler should reload its own state.
-            if self._state_dict is not None:
-                sampler.load_state_dict(self._state_dict[generator_attr_name])
-            # store the samplers
-            self.samplers[generator_attr_name] = sampler
+                # if `CaptureIterableDataset` was available, the sampler should reload its own state.
+                if self._state_dict is not None:
+                    sampler.load_state_dict(self._state_dict[generator_attr_name])
+                # store the samplers
+                self.samplers[generator_attr_name] = sampler
 
-            # replace generator with the generator from the `FastForwardSampler`.
-            dataset_dict[generator_attr_name] = iter(sampler)
+                # replace generator with the generator from the `FastForwardSampler`.
+                dataset_dict[generator_attr_name] = iter(sampler)
 
         self.reset_on_epoch()
 
 
@@ -19,6 +19,7 @@
 import fsspec
 import torch
 from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem
+from packaging.version import Version
 
 
 def load(
@@ -58,6 +59,12 @@ def atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None:
     """
 
     bytesbuffer = io.BytesIO()
-    torch.save(checkpoint, bytesbuffer)
+    # Can't use the new zipfile serialization for 1.6.0 because there's a bug in
+    # torch.hub.load_state_dict_from_url() that prevents it from loading the new files.
+    # More details can be found here: https://github.com/pytorch/pytorch/issues/42239
+    if Version(torch.__version__).release[:3] == (1, 6, 0):
+        torch.save(checkpoint, bytesbuffer, _use_new_zipfile_serialization=False)
+    else:
+        torch.save(checkpoint, bytesbuffer)
     with fsspec.open(filepath, "wb") as f:
         f.write(bytesbuffer.getvalue())
@@ -70,6 +70,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
 
 _IS_WINDOWS = platform.system() == "Windows"
 _IS_INTERACTIVE = hasattr(sys, "ps1")  # https://stackoverflow.com/a/64523765
+_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0")
 _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0")
 _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1")
 _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0")
@@ -111,4 +112,4 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
 
 # experimental feature within PyTorch Lightning.
 def _fault_tolerant_training() -> bool:
-    return bool(int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0)))
+    return _TORCH_GREATER_EQUAL_1_7 and bool(int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0)))  # always a bool