diff --git a/ats/atsMachines/fluxScheduled.py b/ats/atsMachines/fluxScheduled.py
index a1492c9..18754fe 100755
--- a/ats/atsMachines/fluxScheduled.py
+++ b/ats/atsMachines/fluxScheduled.py
@@ -104,8 +104,16 @@ def init(self):
         log(("DEBUG: FluxScheduled init : self.numNodesAvailable =%i" % (self.numNodesAvailable)), echo=True)
         log(("DEBUG: FluxScheduled init : self.numGPUsAvailable =%i" % (self.numGPUs)), echo=True)
 
-        # Call get_physical_node to cache the hardware node listing before starting jobs
-        self.get_physical_node(0)
+        # Call get_physical_node to cache the hardware node listing before starting jobs.
+        # This is required for the same_node functionality.
+        try:
+            self.get_physical_node(0)
+        except RuntimeError:
+            # If you are not in an allocation, an exception will be thrown.
+            # We ignore the exception here and allow _cached_nodes to be None.
+            # If you are not using same_node, this is fine. If you are using same_node,
+            # it should throw an exception when setting up the command list.
+            pass
 
     def expand_nodelist(self, nodelist_field):
         """
@@ -145,7 +153,7 @@ def get_physical_node(self, rel_index):
                 nodelist_field = parts[-1]
                 break
         if nodelist_field is None:
-            raise RuntimeError("Could not find NODELIST field in flux resource list output.")
+            raise RuntimeError("Could not find NODELIST field in flux resource list output. Use of ATS same_node feature requires running ATS within an allocation.")
 
         FluxScheduled._cached_nodes = self.expand_nodelist(nodelist_field)
         log(("Info: Physical Hardware Nodes: %s" % FluxScheduled._cached_nodes), echo=True)
diff --git a/ats/atsMachines/slurmProcessorScheduled.py b/ats/atsMachines/slurmProcessorScheduled.py
index 7eceec5..dca3b8e 100644
--- a/ats/atsMachines/slurmProcessorScheduled.py
+++ b/ats/atsMachines/slurmProcessorScheduled.py
@@ -77,8 +77,16 @@ def init(self):
 
         super(SlurmProcessorScheduled, self).init()
 
-        # Call get_physical_node to cache the hardware node listing before starting jobs
-        self.get_physical_node(0)
+        # Call get_physical_node to cache the hardware node listing before starting jobs.
+        # This is required for the same_node functionality.
+        try:
+            self.get_physical_node(0)
+        except RuntimeError:
+            # If you are not in an allocation, an exception will be thrown.
+            # We ignore the exception here and allow _cached_nodes to be None.
+            # If you are not using same_node, this is fine. If you are using same_node,
+            # it should throw an exception when setting up the command list.
+            pass
 
     def expand_nodelist(self, nodelist_field):
         """
@@ -112,7 +120,7 @@ def get_physical_node(self, rel_index):
         nodelist_str = os.environ.get("SLURM_JOB_NODELIST")
         if not nodelist_str:
             raise RuntimeError(
-                "SLURM_JOB_NODELIST is not set. Are you running inside a Slurm allocation/job?"
+                "SLURM_JOB_NODELIST is not set. Use of ATS same_node feature requires running ATS within an allocation."
             )
 
         # Option 1: if your expand_nodelist already handles Slurm-style nodelists,
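
For context on the "Option 1" comment in the last hunk, below is a minimal sketch of what expanding a Slurm-style nodelist (e.g. `rzwhippet[01-04,07],rzgenie12`) into individual hostnames could look like. This is an illustration only and is not part of the patch; the helper name `expand_slurm_nodelist` and the exact bracket grammar handled (a single optional `[...]` group per entry) are assumptions, not taken from the ATS sources.

```python
import re

def expand_slurm_nodelist(nodelist_str):
    """Expand a Slurm-style nodelist such as 'rzwhippet[01-04,07],rzgenie12'
    into a flat list of hostnames. Sketch only: handles one optional bracket
    group per comma-separated entry, which covers the common case."""
    nodes = []
    # Split on commas that are not inside a bracket group.
    for entry in re.split(r",(?![^\[]*\])", nodelist_str):
        m = re.match(r"^(.*?)\[(.*)\]$", entry)
        if not m:
            nodes.append(entry)  # plain hostname, no bracket group
            continue
        prefix, ranges = m.groups()
        for piece in ranges.split(","):
            if "-" in piece:
                lo, hi = piece.split("-")
                width = len(lo)  # preserve zero padding, e.g. 01-04
                nodes.extend(f"{prefix}{i:0{width}d}" for i in range(int(lo), int(hi) + 1))
            else:
                nodes.append(f"{prefix}{piece}")
    return nodes
```

In practice, shelling out to `scontrol show hostnames "$SLURM_JOB_NODELIST"` performs the same expansion and avoids hand-rolling the parser when Slurm tools are available on the node running ATS.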