Skip to content

Commit 7d35da1

Browse files
awaelchli and tchaton committed
Don't register signal in thread (#10610)
Co-authored-by: tchaton <[email protected]>
1 parent c179a7d commit 7d35da1

File tree

3 files changed

+30
-4
lines changed

3 files changed

+30
-4
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
3232
- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631))
3333

3434

35+
- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610))
36+
37+
3538

3639
## [1.5.2] - 2021-11-16
3740

pytorch_lightning/trainer/connectors/signal_connector.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import signal
44
import sys
5+
import threading
56
from signal import Signals
67
from subprocess import call
78
from types import FrameType, FunctionType
@@ -43,11 +44,11 @@ def register_signal_handlers(self) -> None:
4344

4445
# signal.SIGUSR1 doesn't seem available on windows
4546
if not self._is_on_windows():
46-
if not self._has_already_handler(signal.SIGUSR1):
47-
signal.signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers))
47+
if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1):
48+
self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers))
4849

49-
if not self._has_already_handler(signal.SIGTERM):
50-
signal.signal(signal.SIGTERM, HandlersCompose(sigterm_handlers))
50+
if sigterm_handlers and not self._has_already_handler(signal.SIGTERM):
51+
self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers))
5152

5253
def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None:
5354
if self.trainer.is_global_zero:
@@ -107,3 +108,8 @@ def _has_already_handler(self, signum: Signals) -> bool:
107108
return isinstance(signal.getsignal(signum), FunctionType)
108109
except AttributeError:
109110
return False
111+
112+
@staticmethod
113+
def _register_signal(signum: Signals, handlers: HandlersCompose) -> None:
114+
if threading.current_thread() is threading.main_thread():
115+
signal.signal(signum, handlers)

tests/trainer/connectors/test_signal_connector.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import concurrent.futures
1415
import os
1516
import signal
1617
from time import sleep
@@ -57,3 +58,19 @@ def training_step(self, batch, batch_idx):
5758
else:
5859
trainer.fit(model)
5960
assert trainer._terminate_gracefully == (False if register_handler else terminate_gracefully)
61+
62+
# reset the signal to system defaults
63+
signal.signal(signal.SIGUSR1, signal.SIG_DFL)
64+
65+
66+
def _registering_signals():
67+
trainer = Trainer()
68+
trainer.signal_connector.register_signal_handlers()
69+
70+
71+
@RunIf(skip_windows=True)
72+
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
73+
def test_signal_connector_in_thread():
74+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
75+
for future in concurrent.futures.as_completed([executor.submit(_registering_signals)]):
76+
assert future.exception() is None

0 commit comments

Comments
 (0)