ats/atsMachines/fluxScheduled.py (14 changes: 11 additions & 3 deletions)
@@ -104,8 +104,16 @@ def init(self):
         log(("DEBUG: FluxScheduled init : self.numNodesAvailable =%i" % (self.numNodesAvailable)), echo=True)
         log(("DEBUG: FluxScheduled init : self.numGPUsAvailable =%i" % (self.numGPUs)), echo=True)

-        # Call get_physical_node to cache the hardware node listing before starting jobs
-        self.get_physical_node(0)
+        # Call get_physical_node to cache the hardware node listing before starting jobs.
+        # This is required for the same_node functionality.
+        try:
+            self.get_physical_node(0)
+        except RuntimeError:
+            # get_physical_node raises RuntimeError when ATS is not running inside
+            # an allocation. Ignore the exception here and leave _cached_nodes as
+            # None; that is fine unless same_node is used, in which case the error
+            # is raised again when the command list is set up.
+            pass

     def expand_nodelist(self, nodelist_field):
         """
@@ -145,7 +153,7 @@ def get_physical_node(self, rel_index):
                 nodelist_field = parts[-1]
                 break
         if nodelist_field is None:
-            raise RuntimeError("Could not find NODELIST field in flux resource list output.")
+            raise RuntimeError("Could not find NODELIST field in flux resource list output. Use of the ATS same_node feature requires running ATS within an allocation.")
         FluxScheduled._cached_nodes = self.expand_nodelist(nodelist_field)
         log(("Info: Physical Hardware Nodes: %s" % FluxScheduled._cached_nodes), echo=True)

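As a rough illustration of the parsing this error message guards, the following standalone sketch locates the NODELIST column in flux resource list style output; the sample text and helper name are assumptions rather than the actual ATS implementation:

# Minimal sketch, not the ATS implementation; the sample output below is
# illustrative and may not match a given flux version exactly.
sample_output = """\
     STATE NNODES NCORES NGPUS NODELIST
      free      2     96     8 rzadams[100-101]
"""

def find_nodelist_field(output):
    nodelist_field = None
    for line in output.splitlines():
        parts = line.split()
        # The header row names the columns; data rows carry the nodelist in
        # the last field, matching the parts[-1] access in the diff above.
        if parts and parts[-1] != "NODELIST":
            nodelist_field = parts[-1]
            break
    if nodelist_field is None:
        raise RuntimeError(
            "Could not find NODELIST field in flux resource list output. "
            "Use of the ATS same_node feature requires running ATS within "
            "an allocation."
        )
    return nodelist_field

print(find_nodelist_field(sample_output))  # -> rzadams[100-101]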
ats/atsMachines/slurmProcessorScheduled.py (14 changes: 11 additions & 3 deletions)
@@ -77,8 +77,16 @@ def init(self):

         super(SlurmProcessorScheduled, self).init()

-        # Call get_physical_node to cache the hardware node listing before starting jobs
-        self.get_physical_node(0)
+        # Call get_physical_node to cache the hardware node listing before starting jobs.
+        # This is required for the same_node functionality.
+        try:
+            self.get_physical_node(0)
+        except RuntimeError:
+            # get_physical_node raises RuntimeError when ATS is not running inside
+            # an allocation. Ignore the exception here and leave _cached_nodes as
+            # None; that is fine unless same_node is used, in which case the error
+            # is raised again when the command list is set up.
+            pass

     def expand_nodelist(self, nodelist_field):
         """
@@ -112,7 +120,7 @@ def get_physical_node(self, rel_index):
         nodelist_str = os.environ.get("SLURM_JOB_NODELIST")
         if not nodelist_str:
             raise RuntimeError(
-                "SLURM_JOB_NODELIST is not set. Are you running inside a Slurm allocation/job?"
+                "SLURM_JOB_NODELIST is not set. Use of the ATS same_node feature requires running ATS within an allocation."
             )

         # Option 1: if your expand_nodelist already handles Slurm-style nodelists,
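The "Option 1" comment hints that expand_nodelist may already cope with Slurm-style nodelists. As an alternative sketch, not what this PR implements, Slurm itself can expand the list through scontrol show hostnames:

import os
import subprocess

def expand_slurm_nodelist_via_scontrol(nodelist_str=None):
    """Alternative expansion path (assumption, not this PR's code): ask Slurm
    to expand SLURM_JOB_NODELIST using 'scontrol show hostnames'."""
    if nodelist_str is None:
        nodelist_str = os.environ.get("SLURM_JOB_NODELIST")
    if not nodelist_str:
        raise RuntimeError(
            "SLURM_JOB_NODELIST is not set. Use of the ATS same_node feature "
            "requires running ATS within an allocation."
        )
    result = subprocess.run(
        ["scontrol", "show", "hostnames", nodelist_str],
        capture_output=True, text=True, check=True,
    )
    # scontrol prints one hostname per line.
    return result.stdout.split()

Either route ends up with the same flat host list that get_physical_node indexes into, which is why the error message above points users back to running inside an allocation.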