From 68e3f159e38bb41fbe5e2d5982ce38c144faecc3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 15:55:31 -0500 Subject: [PATCH 1/4] Fix inconsistency in internal host representation --- smartsim/_core/launcher/dragon/dragonBackend.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index e98eb0a304..695b8d8772 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -210,10 +210,8 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: - self._hosts: t.List[str] = sorted( - node for node in dragon_machine.System().nodes - ) - self._nodes = [dragon_machine.Node(node) for node in self._hosts] + self._nodes = [dragon_machine.Node(node) for node in dragon_machine.System().nodes] + self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] @@ -452,7 +450,7 @@ def create_run_policy( if run_request.policy.gpu_affinity: affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity - + logger.debug(f"Affinity: {affinity}, {cpu_affinity}, {gpu_affinity}") if affinity != dragon_policy.Policy.Affinity.DEFAULT: return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, From 07c11deb8b8ff6e15c23de609f215b8e2a025aa5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 15:48:25 -0500 Subject: [PATCH 2/4] Changelog --- doc/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/changelog.md b/doc/changelog.md index cc23b703d9..6efeedfaf3 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -15,6 +15,7 @@ To be released at some future point in time Description +- Fix internal host name representation for Dragon backend - Make dependencies more discoverable in setup.py - Add hardware pinning capability when using dragon - Pin NumPy version to 1.x From ec40a3ec21081f01c84fe4fbfcaca3ff2f95f47f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 16:07:57 -0500 Subject: [PATCH 3/4] Style, lint --- smartsim/_core/launcher/dragon/dragonBackend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 695b8d8772..abb212fd04 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -210,8 +210,10 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: - self._nodes = [dragon_machine.Node(node) for node in dragon_machine.System().nodes] - self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) + self._nodes = [ + dragon_machine.Node(node) for node in dragon_machine.System().nodes + ] + self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] From c6ca05746ad5595da0c5c6f7b07751045b650805 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 16:33:52 -0500 Subject: [PATCH 4/4] Improve debug logging --- smartsim/_core/launcher/dragon/dragonBackend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index abb212fd04..4aba60d558 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -452,7 +452,11 @@ def create_run_policy( if run_request.policy.gpu_affinity: affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity - logger.debug(f"Affinity: {affinity}, {cpu_affinity}, {gpu_affinity}") + logger.debug( + f"Affinity strategy: {affinity}, " + f"CPU affinity mask: {cpu_affinity}, " + f"GPU affinity mask: {gpu_affinity}" + ) if affinity != dragon_policy.Policy.Affinity.DEFAULT: return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME,