 import copy
 import inspect
 import logging
+import numbers
 import os
 import tempfile
 import types
 from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
+from pytorch_lightning.utilities.distributed import sync_ddp_if_available, tpu_distributed
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, save_hyperparameters
 from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
-from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
+from pytorch_lightning.utilities.types import _METRIC, EPOCH_OUTPUT, STEP_OUTPUT
 from pytorch_lightning.utilities.warnings import WarningCache

 warning_cache = WarningCache()
@@ -336,6 +338,15 @@ def log(
336338 f"Logged key: { name } should not contain information about dataloader_idx."
337339 )
338340
341+ value = self .__sync (
342+ value ,
343+ sync_fn = self .trainer .training_type_plugin .reduce ,
344+ sync_dist = sync_dist ,
345+ sync_dist_op = sync_dist_op ,
346+ sync_dist_group = sync_dist_group ,
347+ device = self .device ,
348+ )
349+
339350 self ._results .log (
340351 name ,
341352 value ,
@@ -345,12 +356,7 @@ def log(
             on_epoch=on_epoch,
             reduce_fx=reduce_fx,
             enable_graph=enable_graph,
-            sync_dist=sync_dist,
-            sync_dist_op=sync_dist_op,
-            sync_dist_group=sync_dist_group,
-            sync_fn=self.trainer.training_type_plugin.reduce,
             dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None),
-            device=self.device,
         )

     def log_dict(
@@ -410,6 +416,31 @@ def log_dict(
                 add_dataloader_idx=add_dataloader_idx
             )

+    @staticmethod
+    def __sync(
+        value: _METRIC,
+        sync_fn: Optional[Callable] = None,
+        sync_dist: bool = False,
+        sync_dist_op: Union[Any, str] = 'mean',
+        sync_dist_group: Optional[Any] = None,
+        device: torch.device = None,
+    ) -> _METRIC:
+        """Sync across workers when using distributed training"""
+        if not isinstance(value, (torch.Tensor, numbers.Number)):
+            return value
+
+        sync_fn = sync_fn or sync_ddp_if_available
+        dist_available = torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed()
+        if not sync_dist or not dist_available:
+            return value
+
+        # TODO: Find a way to make the reduction only once, so we don't need to clone.
+        if isinstance(value, torch.Tensor):
+            value = value.clone()
+        else:
+            value = torch.tensor(value, device=device, dtype=torch.float)
+        return sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)
+
     def write_prediction(
         self, name: str, value: Union[torch.Tensor, List[torch.Tensor]], filename: str = 'predictions.pt'
     ):
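
To make the behavioural change easier to follow, here is a minimal standalone sketch of the gate that the new `__sync` helper applies inside `log()` before the value reaches `self._results.log(...)`. The `_sync_value` function and the lambda reducer below are hypothetical stand-ins, not part of the pytorch_lightning API; the real method uses `self.trainer.training_type_plugin.reduce` as `sync_fn` and also checks `tpu_distributed()`, which is omitted here for brevity.

```python
# Hypothetical sketch mirroring the logic of LightningModule.__sync above.
import numbers
from typing import Any, Callable, Optional, Union

import torch


def _sync_value(
    value: Any,
    sync_fn: Callable,
    sync_dist: bool = False,
    sync_dist_op: Union[Any, str] = 'mean',
    sync_dist_group: Optional[Any] = None,
    device: Optional[torch.device] = None,
) -> Any:
    # Anything that is not a tensor or a plain number (e.g. a torchmetrics.Metric)
    # is passed through untouched and reduced elsewhere.
    if not isinstance(value, (torch.Tensor, numbers.Number)):
        return value

    # Reduce only when the caller asked for it and a process group is actually live.
    dist_available = torch.distributed.is_available() and torch.distributed.is_initialized()
    if not sync_dist or not dist_available:
        return value

    # Clone tensors so the reduction does not mutate the caller's value;
    # promote plain numbers to a float tensor on the logging device first.
    value = value.clone() if isinstance(value, torch.Tensor) else torch.tensor(value, device=device, dtype=torch.float)
    return sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)


# Single-process call: no process group is initialized, so the value passes
# through unchanged and sync_fn is never invoked.
print(_sync_value(0.5, sync_fn=lambda v, group, reduce_op: v, sync_dist=True))  # -> 0.5
```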