Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9abe28e
add ClusterEnvironment for LSF systems
ajtritt Dec 3, 2020
f2e44c1
update init file
ajtritt Dec 4, 2020
615f08d
add available cluster environments
ajtritt Dec 4, 2020
86f2fa1
clean up LSFEnvironment
ajtritt Dec 9, 2020
b72b42d
add ddp_hpc as a distributed backend
ajtritt Dec 9, 2020
6a9a4ca
clean up SLURMEnvironment
ajtritt Dec 9, 2020
5bbba77
Merge branch 'master' into lsf_env
ajtritt Dec 9, 2020
94e4d4b
remove extra blank line
ajtritt Dec 9, 2020
113e787
init device for DDPHPCAccelerator
ajtritt Dec 10, 2020
d12d652
committing current state
ajtritt Dec 11, 2020
d0ac793
Merge branch 'master' into lsf_env
ajtritt Dec 11, 2020
b53d153
add additional methods to ClusterEnvironments
ajtritt Dec 11, 2020
0b6edfe
add NVIDIA mixin for setting up CUDA envars
ajtritt Dec 11, 2020
f7d87f6
remove troubleshooting prints
ajtritt Dec 12, 2020
3c9edf9
cleanup SLURMEnvironment
ajtritt Dec 12, 2020
77f3b71
fix docstring
ajtritt Dec 12, 2020
eb7d07c
cleanup TorchElasticEnvironment and add documentation
ajtritt Dec 12, 2020
09064e1
PEP8 puts a cork in it
ajtritt Dec 12, 2020
fb30942
Merge branch 'master' into lsf_env
ajtritt Dec 12, 2020
7be8f1d
add set_ranks_to_trainer
ajtritt Feb 11, 2021
5c04b8e
Merge remote-tracking branch 'pl/master' into lsf_env
ajtritt Feb 12, 2021
004daef
Merge remote-tracking branch 'pl/master' into lsf_env
ajtritt Feb 12, 2021
a113210
remove unused import
ajtritt Feb 12, 2021
d17281c
move to new location
ajtritt Feb 12, 2021
b4028a7
Merge branch 'master' into lsf_env
awaelchli Jul 9, 2021
7a23376
update LSF environment
awaelchli Jul 9, 2021
02410ff
remove mixin
awaelchli Jul 9, 2021
7f91740
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
1b3bc7a
changelog
awaelchli Jul 9, 2021
5ec0e9f
Merge remote-tracking branch 'ajtritt/lsf_env' into lsf_env
awaelchli Jul 9, 2021
92215ab
reset slurm env
awaelchli Jul 9, 2021
a613759
add tests
awaelchli Jul 9, 2021
f7c5e0e
add licence
awaelchli Jul 9, 2021
00de88e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
cfd59b8
test node_rank
awaelchli Jul 9, 2021
5ec99e9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
cfe544f
add lsf env to docs
awaelchli Jul 9, 2021
71569de
add auto detection for lsf environment
awaelchli Jul 9, 2021
7c26b41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
077964d
fix is_using_lsf() and test
awaelchli Jul 9, 2021
7f127c8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 9, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307))


- Added `LSFEnvironment` for distributed training with the LSF resource manager `jsrun` ([#5102](https://github.com/PyTorchLightning/pytorch-lightning/pull/5102))


### Changed


Expand Down
1 change: 1 addition & 0 deletions docs/source/extensions/plugins.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ Cluster Environments

ClusterEnvironment
LightningEnvironment
LSFEnvironment
TorchElasticEnvironment
KubeflowEnvironment
SLURMEnvironment
1 change: 1 addition & 0 deletions pytorch_lightning/plugins/environments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.kubeflow_environment import KubeflowEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.lsf_environment import LSFEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment # noqa: F401
160 changes: 160 additions & 0 deletions pytorch_lightning/plugins/environments/lsf_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import socket

from pytorch_lightning import _logger as log
from pytorch_lightning.plugins.environments import ClusterEnvironment


class LSFEnvironment(ClusterEnvironment):
"""
An environment for running on clusters managed by the LSF resource manager.

It is expected that any execution using this ClusterEnvironment was executed
using the Job Step Manager i.e. ``jsrun``.

This plugin expects the following environment variables.

LSB_JOBID:
The LSF assigned job ID

LSB_HOSTS:
The hosts used in the job. This string is expected to have the format "batch <rank_0_host> ...."

JSM_NAMESPACE_LOCAL_RANK:
The node local rank for the task. This environment variable is set by jsrun

JSM_NAMESPACE_SIZE:
The world size for the task. This environment variable is set by jsrun
"""

def __init__(self):
self._master_address = self._get_master_address()
self._master_port = self._get_master_port()
log.debug(f"MASTER_ADDR: {self._master_address}")
log.debug(f"MASTER_PORT: {self._master_port}")

@staticmethod
def is_using_lsf() -> bool:
""" Returns ``True`` if the current process was launched using the jsrun command. """
required_env_vars = (
"LSB_JOBID",
"LSB_HOSTS",
"JSM_NAMESPACE_LOCAL_RANK",
"JSM_NAMESPACE_SIZE",
)
return all(v in os.environ for v in required_env_vars)

def creates_children(self) -> bool:
return True

def master_address(self):
""" The master address is read from a list of hosts contained in the environment variable `LSB_HOSTS`. """
return self._master_address

def master_port(self):
""" THe master port gets calculated from the LSF job ID. """
return self._master_port

def world_size(self):
""" The world size is read from the environment variable `JSM_NAMESPACE_SIZE`. """
var = "JSM_NAMESPACE_SIZE"
world_size = os.environ.get(var)
if world_size is None:
raise ValueError(
f"Cannot determine world size from environment variable {var}."
" Make sure you run your executable with `jsrun`"
)
return int(world_size)

def set_world_size(self, size: int) -> None:
log.debug("LSFEnvironment.set_world_size was called, but setting world size is not allowed. Ignored.")

def global_rank(self):
""" The world size is read from the environment variable `JSM_NAMESPACE_RANK`. """
var = "JSM_NAMESPACE_RANK"
global_rank = os.environ.get(var)
if global_rank is None:
raise ValueError(
f"Cannot determine global rank from environment variable {var}."
" Make sure you run your executable with `jsrun`"
)
return int(global_rank)

def set_global_rank(self, rank: int) -> None:
log.debug("LSFEnvironment.set_global_rank was called, but setting global rank is not allowed. Ignored.")

def local_rank(self):
""" The local rank is read from the environment variable `JSM_NAMESPACE_LOCAL_RANK`. """
var = "JSM_NAMESPACE_LOCAL_RANK"
local_rank = os.environ.get(var)
if local_rank is None:
raise ValueError(
f"Cannot determine local rank from environment variable {var}."
" Make sure you run your executable with `jsrun`"
)
return int(local_rank)

def node_rank(self):
"""
The node rank is determined by the position of the current hostname in the list of hosts stored in
the environment variable `LSB_HOSTS`.
"""
hosts = self._read_hosts()
count = dict()
for host in hosts:
if "batch" in host or "login" in host:
continue
if host not in count:
count[host] = len(count)
return count[socket.gethostname()]

@staticmethod
def _read_hosts():
hosts = os.environ.get("LSB_HOSTS")
if not hosts:
raise ValueError("Could not find hosts in environment variable LSB_HOSTS")
hosts = hosts.split()
if len(hosts) < 2:
raise ValueError(
"Cannot parse hosts from LSB_HOSTS environment variable."
" Expected format: \"batch <rank_0_host> ...\""
)
return hosts

def _get_master_address(self):
hosts = self._read_hosts()
return hosts[1]

@staticmethod
def _get_master_port():
"""
A helper function for accessing the master port.
Uses the LSF job ID so all ranks can compute the master port.
"""
# check for user-specified master port
port = os.environ.get("MASTER_PORT")
if not port:
jobid = os.environ.get("LSB_JOBID")
if not jobid:
raise ValueError("Could not find job id in environment variable LSB_JOBID")
port = int(jobid)
# all ports should be in the 10k+ range
port = int(port) % 1000 + 10000
log.debug(f"calculated LSF master port: {port}")
else:
log.debug(f"using externally specified master port: {port}")
return int(port)
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
ClusterEnvironment,
KubeflowEnvironment,
LightningEnvironment,
LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
)
Expand Down Expand Up @@ -554,6 +555,8 @@ def select_cluster_environment(self) -> ClusterEnvironment:
env = TorchElasticEnvironment()
elif KubeflowEnvironment.is_using_kubeflow():
env = KubeflowEnvironment()
elif LSFEnvironment.is_using_lsf():
env = LSFEnvironment()
else:
env = LightningEnvironment()
return env
Expand Down
13 changes: 13 additions & 0 deletions tests/plugins/environments/test_kubeflow_environment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from unittest import mock
Expand Down
13 changes: 13 additions & 0 deletions tests/plugins/environments/test_lightning_environment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from unittest import mock

Expand Down
89 changes: 89 additions & 0 deletions tests/plugins/environments/test_lsf_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from unittest import mock

import pytest

from pytorch_lightning.plugins.environments import LSFEnvironment


@mock.patch.dict(os.environ, {
"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1",
"LSB_JOBID": "1234",
})
def test_missing_lsb_hosts():
""" Test an error when the lsb hosts list cannot be found. """
del os.environ["LSB_HOSTS"]
with pytest.raises(ValueError, match="Could not find hosts in environment variable LSB_HOSTS"):
LSFEnvironment()


@mock.patch.dict(os.environ, {
"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1",
"LSB_JOBID": "1234",
})
def test_missing_lsb_job_id():
""" Test an error when the job id cannot be found. """
del os.environ["LSB_JOBID"]
with pytest.raises(ValueError, match="Could not find job id in environment variable LSB_JOBID"):
LSFEnvironment()


@mock.patch.dict(
os.environ, {
"MASTER_PORT": "4321",
"LSB_JOBID": "1234",
"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1",
}
)
def test_manual_master_port_and_address():
""" Test a user can set the port manually through the MASTER_PORT env variable. """
env = LSFEnvironment()
assert env.master_port() == 4321


@mock.patch.dict(
os.environ, {
"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1 10.10.10.2 10.10.10.3",
"LSB_JOBID": "1234",
"JSM_NAMESPACE_SIZE": "4",
"JSM_NAMESPACE_RANK": "3",
"JSM_NAMESPACE_LOCAL_RANK": "1"
}
)
def test_attributes_from_environment_variables():
""" Test that the LSF environment takes the attributes from the environment variables. """
env = LSFEnvironment()
assert env.creates_children()
assert env.master_address() == "10.10.10.0"
assert env.master_port() == 10234
assert env.world_size() == 4
assert env.global_rank() == 3
assert env.local_rank() == 1
env.set_global_rank(100)
assert env.global_rank() == 3
env.set_world_size(100)
assert env.world_size() == 4
assert LSFEnvironment.is_using_lsf()


@mock.patch("socket.gethostname", return_value="host2")
@mock.patch.dict(os.environ, {
"LSB_HOSTS": "batch host0 host1 host2 host3",
"LSB_JOBID": "1234",
})
def test_node_rank(_):
env = LSFEnvironment()
assert env.node_rank() == 2
13 changes: 13 additions & 0 deletions tests/plugins/environments/test_slurm_environment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from unittest import mock
Expand Down
13 changes: 13 additions & 0 deletions tests/plugins/environments/test_torchelastic_environment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from unittest import mock
Expand Down