
Commit f69bb3a

awaelchli, pre-commit-ci[bot], justusschock, rohitgr7, kaushikb11 committed

Fix BF16 teardown for TPU precision plugin (#10990)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Justus Schock <[email protected]>
Co-authored-by: Rohit Gupta <[email protected]>
Co-authored-by: Kaushik B <[email protected]>
Co-authored-by: thomas chaton <[email protected]>

1 parent 1fda63c, commit f69bb3a

File tree

11 files changed (+41 additions, -5 deletions)


CHANGELOG.md

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed data fetcher selection ([#11294](https://github.com/PyTorchLightning/pytorch-lightning/pull/11294))
 - Fixed a race condition that could result in incorrect (zero) values being observed in prediction writer callbacks ([#11288](https://github.com/PyTorchLightning/pytorch-lightning/pull/11288))
 - Fixed dataloaders not getting reloaded the correct amount of times when setting `reload_dataloaders_every_n_epochs` and `check_val_every_n_epoch` ([#10948](https://github.com/PyTorchLightning/pytorch-lightning/pull/10948))
+- Fixed an issue with the `TPUSpawnPlugin` handling the `XLA_USE_BF16` environment variable incorrectly ([#10990](https://github.com/PyTorchLightning/pytorch-lightning/pull/10990))
+

 ## [1.5.7] - 2021-12-21


pytorch_lightning/accelerators/accelerator.py

Lines changed: 1 addition & 0 deletions

@@ -188,6 +188,7 @@ def teardown(self) -> None:
         It is the right place to release memory and free other resources.
         """
         self.training_type_plugin.teardown()
+        self.precision_plugin.teardown()

     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any:
         """Moves the batch to the correct device. The returned batch is of the same type as the input batch, just

pytorch_lightning/plugins/precision/precision_plugin.py

Lines changed: 6 additions & 0 deletions

@@ -252,3 +252,9 @@ def predict_step_context(self) -> Generator[None, None, None]:
         """A contextmanager for the predict step."""
         with self.forward_context():
             yield
+
+    def teardown(self) -> None:
+        """This method is called to teardown the training process.
+
+        It is the right place to release memory and free other resources.
+        """

pytorch_lightning/plugins/precision/tpu_bf16.py

Lines changed: 4 additions & 1 deletion

@@ -28,5 +28,8 @@ class TPUBf16PrecisionPlugin(TPUPrecisionPlugin):
     def connect(
         self, model: nn.Module, optimizers: List[Optimizer], lr_schedulers: List[Any]
     ) -> Tuple[nn.Module, List[Optimizer], List[Any]]:
-        os.environ["XLA_USE_BF16"] = str(1)
+        os.environ["XLA_USE_BF16"] = "1"
         return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers)
+
+    def teardown(self) -> None:
+        os.environ.pop("XLA_USE_BF16", None)

pytorch_lightning/plugins/training_type/single_device.py

Lines changed: 1 addition & 0 deletions

@@ -81,6 +81,7 @@ def broadcast(self, obj: object, src: int = 0) -> object:
         return obj

     def teardown(self) -> None:
+        super().teardown()
         if self.on_gpu:
             # GPU teardown
             self.lightning_module.cpu()

pytorch_lightning/plugins/training_type/single_tpu.py

Lines changed: 1 addition & 0 deletions

@@ -82,6 +82,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: _PATH) -> None:
         return self.checkpoint_io.save_checkpoint(checkpoint, filepath)

     def teardown(self) -> None:
+        super().teardown()
         # TPU teardown
         os.environ.pop("PT_XLA_DEBUG", None)

pytorch_lightning/plugins/training_type/tpu_spawn.py

Lines changed: 1 addition & 1 deletion

@@ -357,7 +357,7 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra
         return xm.all_gather(tensor)

     def teardown(self) -> None:
-        # TPU teardown
+        super().teardown()
         os.environ.pop("PT_XLA_DEBUG", None)
         self.barrier("teardown")

pytorch_lightning/plugins/training_type/training_type_plugin.py

Lines changed: 0 additions & 1 deletion

@@ -312,7 +312,6 @@ def model_sharded_context(self) -> Generator:
         """
         yield

-    @abstractmethod
     def teardown(self) -> None:
         """This method is called to teardown the training process.
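
Dropping @abstractmethod makes the base teardown an ordinary method, which is what allows the super().teardown() calls added above in SingleDevicePlugin, SingleTPUPlugin and TPUSpawnPlugin: each subclass runs the inherited cleanup and then its own. A minimal sketch of that cooperative chain, with hypothetical classes standing in for the real plugin hierarchy:

class BaseTrainingTypeSketch:
    def teardown(self) -> None:
        # shared cleanup every subclass should run
        print("base cleanup")


class TPUSpawnSketch(BaseTrainingTypeSketch):
    def teardown(self) -> None:
        super().teardown()         # inherited cleanup first
        print("pop PT_XLA_DEBUG")  # then plugin-specific cleanup


TPUSpawnSketch().teardown()
# prints:
#   base cleanup
#   pop PT_XLA_DEBUG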

tests/models/test_tpu.py

Lines changed: 0 additions & 2 deletions

@@ -122,7 +122,6 @@ def test_model_16bit_tpu_cores_1(tmpdir):

     model = BoringModel()
     tpipes.run_model_test(trainer_options, model, on_gpu=False)
-    assert os.environ.get("XLA_USE_BF16") == str(1), "XLA_USE_BF16 was not set in environment variables"


 @pytest.mark.parametrize("tpu_core", [1, 5])

@@ -144,7 +143,6 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core):
     model = BoringModel()
     tpipes.run_model_test(trainer_options, model, on_gpu=False)
     assert torch_xla._XLAC._xla_get_default_device() == f"xla:{tpu_core}"
-    assert os.environ.get("XLA_USE_BF16") == str(1), "XLA_USE_BF16 was not set in environment variables"


 @RunIf(tpu=True)
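
These assertions checked XLA_USE_BF16 after run_model_test returned, but with the new teardown the variable has already been popped by then, so the checks had to go. A hedged sketch of an equivalent standalone check (a hypothetical test, not one of the files in this commit; it assumes TPUBf16PrecisionPlugin needs no constructor arguments and exercises only the teardown shown in the diff above):

import os

from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin


def test_xla_use_bf16_removed_on_teardown():
    # Simulate what connect() does, then check that teardown() undoes it.
    os.environ["XLA_USE_BF16"] = "1"
    TPUBf16PrecisionPlugin().teardown()
    assert "XLA_USE_BF16" not in os.environ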

tests/plugins/precision/__init__.py

Whitespace-only changes.
