From c7e9c0a3237d79e89d2432ec3eaad17e06232665 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:20:16 +0200 Subject: [PATCH 01/60] Adds angstrom to bohr conversion factor Adds the angstrom to bohr conversion factor to the constants module. This facilitates easier conversions between these units within the codebase, enhancing usability and reducing potential errors. --- arc/constants.pxd | 2 +- arc/constants.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arc/constants.pxd b/arc/constants.pxd index 4a50c72602..9fc3b9127d 100644 --- a/arc/constants.pxd +++ b/arc/constants.pxd @@ -1 +1 @@ -cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom +cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom, angstrom_to_bohr diff --git a/arc/constants.py b/arc/constants.py index fef8e8f167..dbd161f63d 100644 --- a/arc/constants.py +++ b/arc/constants.py @@ -79,6 +79,7 @@ epsilon_0 = 8.8541878128 bohr_to_angstrom = 0.529177 +angstrom_to_bohr = 1 / bohr_to_angstrom # Cython does not automatically place module-level variables into the module # symbol table when in compiled mode, so we must do this manually so that we @@ -102,4 +103,5 @@ 'F': F, 'epsilon_0': epsilon_0, 'bohr_to_angstrom': bohr_to_angstrom, + 'angstrom_to_bohr': angstrom_to_bohr, }) From c727294d0675084283c9da3f9bd9f64238606d39 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:23:16 +0200 Subject: [PATCH 02/60] Adds CREST settings and installation for transition state search Adds CREST settings and installation to ARC for transition state search. This commit introduces necessary files and updates to enable CREST within the ARC framework, enhancing its capabilities for exploring reaction pathways. It includes: - A script to install CREST via conda. - A module for locating the CREST executable and setting up the environment. 
- Integration of CREST into the settings to allow its use as a TS adapter. --- Makefile | 4 ++ arc/settings/crest.py | 113 +++++++++++++++++++++++++++++++++ arc/settings/crest_test.py | 77 ++++++++++++++++++++++ arc/settings/settings.py | 62 +++++++++++++++++- devtools/crest_environment.yml | 6 ++ devtools/install_all.sh | 35 +++++++--- devtools/install_autotst.sh | 82 +++++++++++++++++++++--- devtools/install_crest.sh | 64 +++++++++++++++++++ devtools/install_gcn.sh | 67 ++++++++++--------- devtools/install_pyrdl.sh | 4 +- devtools/install_torchani.sh | 5 +- 11 files changed, 461 insertions(+), 58 deletions(-) create mode 100644 arc/settings/crest.py create mode 100644 arc/settings/crest_test.py create mode 100644 devtools/crest_environment.yml create mode 100644 devtools/install_crest.sh diff --git a/Makefile b/Makefile index ff5b1e7091..4fd3dfbc35 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ help: @echo " install-kinbot Install KinBot" @echo " install-sella Install Sella" @echo " install-xtb Install xTB" + @echo " install-crest Install CREST" @echo " install-torchani Install TorchANI" @echo " install-ob Install OpenBabel" @echo "" @@ -100,6 +101,9 @@ install-sella: install-xtb: bash $(DEVTOOLS_DIR)/install_xtb.sh +install-crest: + bash $(DEVTOOLS_DIR)/install_crest.sh + install-torchani: bash $(DEVTOOLS_DIR)/install_torchani.sh diff --git a/arc/settings/crest.py b/arc/settings/crest.py new file mode 100644 index 0000000000..ebd227fa53 --- /dev/null +++ b/arc/settings/crest.py @@ -0,0 +1,113 @@ +""" +Utilities for locating CREST executables and activation commands. +""" + +import os +import re +import shutil +import sys +from typing import Optional, Tuple + + +def parse_version(folder_name: str) -> Tuple[int, int, int]: + """ + Parse a version from a folder name. + + Supports patterns such as ``3.0.2``, ``v212``, ``2.1``, ``2``. 
+ """ + version_regex = re.compile(r"(?:v?(\d+)(?:\.(\d+))?(?:\.(\d+))?)", re.IGNORECASE) + match = version_regex.search(folder_name) + if not match: + return 0, 0, 0 + + major = int(match.group(1)) if match.group(1) else 0 + minor = int(match.group(2)) if match.group(2) else 0 + patch = int(match.group(3)) if match.group(3) else 0 + + # Example: v212 -> (2, 1, 2) + if major >= 100 and match.group(2) is None and match.group(3) is None: + s = str(major).rjust(3, "0") + major, minor, patch = int(s[0]), int(s[1]), int(s[2]) + + return major, minor, patch + + +def find_highest_version_in_directory(directory: str, name_contains: str) -> Optional[str]: + """ + Find the ``crest`` executable under the highest-version matching subdirectory. + """ + if not os.path.exists(directory): + return None + + highest_version_path = None + highest_version = () + for folder in os.listdir(directory): + file_path = os.path.join(directory, folder) + if name_contains.lower() in folder.lower() and os.path.isdir(file_path): + crest_path = os.path.join(file_path, "crest") + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + version = parse_version(folder) + if highest_version == () or version > highest_version: + highest_version = version + highest_version_path = crest_path + return highest_version_path + + +def find_crest_executable() -> Tuple[Optional[str], Optional[str]]: + """ + Return ``(crest_path, env_cmd)``. + + ``env_cmd`` is a shell snippet to activate the environment if needed, otherwise ``""``. 
+ """ + # Priority 1: standalone builds in a configurable directory (default: /Local/ce_dana) + standalone_dir = os.getenv("ARC_CREST_STANDALONE_DIR", "/Local/ce_dana") + crest_path = find_highest_version_in_directory(standalone_dir, "crest") + if crest_path and os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + return crest_path, "" + + # Priority 2: Conda/Mamba/Micromamba envs + home = os.path.expanduser("~") + potential_env_paths = [ + os.path.join(home, "anaconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniforge3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, ".conda", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "mambaforge", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "micromamba", "envs", "crest_env", "bin", "crest"), + ] + + current_env_bin = os.path.dirname(sys.executable) + potential_env_paths.insert(0, os.path.join(current_env_bin, "crest")) + + for crest_path in potential_env_paths: + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + env_marker = os.path.join("envs", "crest_env") + os.path.sep + env_root = crest_path.split(env_marker)[0] + if "micromamba" in crest_path: + env_cmd = ( + f"source {env_root}/etc/profile.d/micromamba.sh && " + f"micromamba activate crest_env" + ) + elif any(name in env_root for name in ("anaconda3", "miniconda3", "miniforge3", "mambaforge", ".conda")): + env_cmd = ( + f"source {env_root}/etc/profile.d/conda.sh && " + f"conda activate crest_env" + ) + else: + env_cmd = "" + return crest_path, env_cmd + + # Priority 3: PATH + crest_in_path = shutil.which("crest") + if crest_in_path: + return crest_in_path, "" + + return None, None + + +__all__ = [ + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", +] + diff --git a/arc/settings/crest_test.py b/arc/settings/crest_test.py new file mode 100644 index 0000000000..d7793604ed --- 
/dev/null +++ b/arc/settings/crest_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.settings.crest +""" + +import os +import stat +import tempfile +import unittest +from unittest.mock import patch + +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) + + +class TestCrestSettingsUtils(unittest.TestCase): + + def _make_executable(self, path: str): + with open(path, "w") as f: + f.write("#!/bin/bash\n") + st = os.stat(path) + os.chmod(path, st.st_mode | stat.S_IXUSR) + + def test_parse_version(self): + self.assertEqual(parse_version("crest-3.0.2"), (3, 0, 2)) + self.assertEqual(parse_version("v212"), (2, 1, 2)) + self.assertEqual(parse_version("version-2.1"), (2, 1, 0)) + self.assertEqual(parse_version("foo"), (0, 0, 0)) + + def test_find_highest_version_in_directory(self): + with tempfile.TemporaryDirectory() as td: + low = os.path.join(td, "crest-2.1") + high = os.path.join(td, "crest-3.0.2") + os.makedirs(low) + os.makedirs(high) + self._make_executable(os.path.join(low, "crest")) + self._make_executable(os.path.join(high, "crest")) + + found = find_highest_version_in_directory(td, "crest") + self.assertEqual(found, os.path.join(high, "crest")) + + def test_find_crest_executable_prefers_standalone(self): + with tempfile.TemporaryDirectory() as td: + standalone = os.path.join(td, "crest-3.0.2") + os.makedirs(standalone) + standalone_crest = os.path.join(standalone, "crest") + self._make_executable(standalone_crest) + + with patch.dict(os.environ, {"ARC_CREST_STANDALONE_DIR": td}, clear=False): + path, env_cmd = find_crest_executable() + self.assertEqual(path, standalone_crest) + self.assertEqual(env_cmd, "") + + def test_find_crest_executable_env_detection(self): + with tempfile.TemporaryDirectory() as td: + fake_home = os.path.join(td, "home") + os.makedirs(fake_home) + crest_path = os.path.join(fake_home, "miniforge3", "envs", "crest_env", "bin", "crest") 
+ os.makedirs(os.path.dirname(crest_path), exist_ok=True) + self._make_executable(crest_path) + + with patch("arc.settings.crest.os.path.expanduser", return_value=fake_home): + with patch("arc.settings.crest.sys.executable", os.path.join(td, "python")): + with patch("arc.settings.crest.shutil.which", return_value=None): + path, env_cmd = find_crest_executable() + self.assertEqual(path, crest_path) + self.assertIn("conda activate crest_env", env_cmd) + + +if __name__ == "__main__": + unittest.main() + diff --git a/arc/settings/settings.py b/arc/settings/settings.py index ea2c90a9cc..ff39426617 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -9,6 +9,12 @@ import os import string import sys +import shutil +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) # Users should update the following server dictionary. # Instructions for RSA key generation can be found here: @@ -88,7 +94,7 @@ supported_ess = ['cfour', 'gaussian', 'mockter', 'molpro', 'orca', 'qchem', 'terachem', 'onedmin', 'xtb', 'torchani', 'openbabel'] # TS methods to try when appropriate for a reaction (other than user guesses which are always allowed): -ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm'] +ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm', 'crest'] # List here job types to execute by default default_job_types = {'conf_opt': True, # defaults to True if not specified @@ -427,3 +433,57 @@ def add_rmg_db_candidates(prefix: str) -> None: if path and os.path.isdir(path): RMG_DB_PATH = path break + +CREST_PATH, CREST_ENV_PATH = find_crest_executable() + +__all__ = [ + "servers", + "global_ess_settings", + "supported_ess", + "ts_adapters", + "default_job_types", + "levels_ess", + "check_status_command", + "submit_command", + "delete_command", + "list_available_nodes_command", + "submit_filenames", + "t_max_format", + "input_filenames", + "output_filenames", + "default_levels_of_theory", + 
"orca_default_options_dict", + "tani_default_options_dict", + "ob_default_settings", + "xtb_gsm_settings", + "valid_chars", + "rotor_scan_resolution", + "maximum_barrier", + "minimum_barrier", + "inconsistency_az", + "inconsistency_ab", + "max_rotor_trsh", + "preserve_params_in_scan", + "workers_coeff", + "default_job_settings", + "ARC_FAMILIES_PATH", + "home", + "TANI_PYTHON", + "OB_PYTHON", + "TS_GCN_PYTHON", + "AUTOTST_PYTHON", + "ARC_PYTHON", + "RMG_ENV_NAME", + "RMG_PYTHON", + "XTB", + "exported_rmg_path", + "exported_rmg_db_path", + "gw", + "find_executable", + "add_rmg_db_candidates", + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", + "CREST_PATH", + "CREST_ENV_PATH", +] diff --git a/devtools/crest_environment.yml b/devtools/crest_environment.yml new file mode 100644 index 0000000000..2291e72d37 --- /dev/null +++ b/devtools/crest_environment.yml @@ -0,0 +1,6 @@ +name: crest_env +channels: + - conda-forge +dependencies: + - python>=3.7 + - crest=2.12 diff --git a/devtools/install_all.sh b/devtools/install_all.sh index c958fdd548..c9de207ef7 100644 --- a/devtools/install_all.sh +++ b/devtools/install_all.sh @@ -26,6 +26,8 @@ run_devtool () { bash "$DEVTOOLS_DIR/$1" "${@:2}"; } SKIP_CLEAN=false SKIP_EXT=false SKIP_ARC=false +SKIP_RMG=false +ARC_INSTALLED=false RMG_ARGS=() ARC_ARGS=() EXT_ARGS=() @@ -36,6 +38,7 @@ while [[ $# -gt 0 ]]; do --no-clean) SKIP_CLEAN=true ;; --no-ext) SKIP_EXT=true ;; --no-arc) SKIP_ARC=true ;; + --no-rmg) SKIP_RMG=true ;; --rmg-*) RMG_ARGS+=("--${1#--rmg-}") ;; --arc-*) ARC_ARGS+=("--${1#--arc-}") ;; --ext-*) EXT_ARGS+=("--${1#--ext-}") ;; @@ -44,6 +47,7 @@ while [[ $# -gt 0 ]]; do Usage: $0 [global-flags] [--rmg-xxx] [--arc-yyy] [--ext-zzz] --no-clean Skip micromamba/conda cache cleanup --no-ext Skip external tools (AutoTST, KinBot, …) + --no-rmg Skip RMG-Py entirely --rmg-path Forward '--path' to RMG installer --rmg-pip Forward '--pip' to RMG installer ... 
@@ -67,16 +71,15 @@ echo " EXT sub-flags : ${EXT_ARGS[*]:-(none)}" echo ">>> Beginning full ARC external repo installation…" pushd . >/dev/null -# 1) RMG -echo "=== Installing RMG ===" -run_devtool install_rmg.sh "${RMG_ARGS[@]}" - - - # 2) PyRDL - echo "=== Installing PyRDL ===" - bash devtools/install_pyrdl.sh +# 1) RMG (optional) +if [[ $SKIP_RMG == false ]]; then + echo "=== Installing RMG ===" + run_devtool install_rmg.sh "${RMG_ARGS[@]}" +else + echo "ℹ️ --no-rmg flag set. Skipping RMG installation." +fi -# 3) ARC itself (skip env creation in CI or if user requests it) +# 2) ARC itself (skip env creation in CI or if user requests it) if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then if [[ $SKIP_CLEAN == false ]]; then echo "=== Cleaning up old ARC build artifacts ===" @@ -88,10 +91,23 @@ if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then echo "=== Installing ARC ===" run_devtool install_arc.sh "${ARC_ARGS[@]}" + ARC_INSTALLED=true else + ARC_INSTALLED=false echo ":information_source: CI detected or --no-arc flag set. Skip cleaning ARC installation." fi +# 3) PyRDL (needs arc_env, but not ARC install) +if [[ "${CI:-false}" == "true" ]]; then + echo "=== Installing PyRDL (CI) ===" + bash devtools/install_pyrdl.sh +elif [[ $ARC_INSTALLED == true ]]; then + echo "=== Installing PyRDL ===" + bash devtools/install_pyrdl.sh +else + echo "ℹ️ Skipping PyRDL install because ARC installation was skipped." 
+fi + if [[ $SKIP_EXT == false ]]; then # map of friendly names → installer scripts declare -A EXT_INSTALLERS=( @@ -100,6 +116,7 @@ if [[ $SKIP_EXT == false ]]; then [KinBot]=install_kinbot.sh [OpenBabel]=install_ob.sh [xtb]=install_xtb.sh + [CREST]=install_crest.sh [Sella]=install_sella.sh [TorchANI]=install_torchani.sh ) diff --git a/devtools/install_autotst.sh b/devtools/install_autotst.sh index 5e3bc35288..e71e42d035 100644 --- a/devtools/install_autotst.sh +++ b/devtools/install_autotst.sh @@ -31,6 +31,8 @@ done # where "$(pwd)" is the path to the AutoTST repository. write_hook () { local env="$1" repo_path="$2" # repo_path="$(pwd)" in AutoTST + local repo_path_escaped + repo_path_escaped=$(printf '%q' "$repo_path") $COMMAND_PKG env list | awk '{print $1}' | grep -qx "$env" || return 0 # env prefix @@ -50,16 +52,37 @@ write_hook () { # --- activation -------------------------------------------------------- cat >"$act" <>"$act" <<'EOF' +# Remove RMG-Py from PATH/PYTHONPATH to avoid clashes while AutoTST is active. 
+if [[ -n "${RMG_PY_PATH:-}" ]]; then + export PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + export PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" +fi +EOF + fi + + cat >>"$act" <<'EOF' case ":\$PYTHONPATH:" in *":\$AUTOTST_ROOT:"*) ;; \ *) export PYTHONPATH="\$AUTOTST_ROOT:\${PYTHONPATH:-}" ;; esac EOF # --- de-activation ----------------------------------------------------- cat >"$deact" <<'EOF' -_strip () { local n=":$1:"; local s=":$2:"; echo "${s//$n/:}" | sed 's/^://;s/:$//'; } -export PYTHONPATH=$(_strip "$AUTOTST_ROOT" ":${PYTHONPATH:-}:") -unset AUTOTST_ROOT +export PATH="${AUTOTST_OLD_PATH:-$PATH}" +if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" +else + unset PYTHONPATH +fi +unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH EOF echo "🔗 AutoTST hook refreshed in $env" } @@ -115,12 +138,53 @@ fi if [[ $MODE == "path" ]]; then - AUTO_PATH_LINE="export PYTHONPATH=\"\$PYTHONPATH:$(pwd)\"" - if ! grep -Fqx "$AUTO_PATH_LINE" ~/.bashrc; then - echo "$AUTO_PATH_LINE" >> ~/.bashrc - echo "✔️ Added AutoTST path to ~/.bashrc" + HOOK_SENTINEL="# AutoTST path-mode hook" + if ! 
grep -Fqx "$HOOK_SENTINEL" ~/.bashrc; then + cat <<'EOF' >> ~/.bashrc +# AutoTST path-mode hook +_strip_path () { + local needle=":$1:" + local haystack=":$2:" + echo "${haystack//$needle/:}" | sed 's/^://;s/:$//' +} + +autotst_on () { + export AUTOTST_ROOT="__AUTOTST_PATH__" + export AUTOTST_OLD_PATH="$PATH" + export AUTOTST_OLD_PYTHONPATH="${PYTHONPATH:-}" + if [[ -n "${RMG_PY_PATH:-}" ]]; then + PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" + fi + + case ":$PYTHONPATH:" in *":$AUTOTST_ROOT:"*) ;; \ + *) PYTHONPATH="$AUTOTST_ROOT:${PYTHONPATH:-}" ;; esac + export PATH PYTHONPATH +} + +autotst_off () { + export PATH="${AUTOTST_OLD_PATH:-$PATH}" + if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" + else + unset PYTHONPATH + fi + unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH +} + +# Enable AutoTST by default in new shells and keep RMG-Py out of the way. +autotst_on +EOF + # replace placeholder with actual path (portable across GNU/BSD sed) + AUTOTST_ESCAPED_PATH="$(printf '%q' "$(pwd)" | sed 's#/#\\\\/#g')" + if sed --version >/dev/null 2>&1; then + sed -i "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + else + sed -i '' "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + fi + echo "✔️ Added AutoTST path-mode hook to ~/.bashrc" else - echo "ℹ️ AutoTST path already exists in ~/.bashrc" + echo "ℹ️ AutoTST path-mode hook already exists in ~/.bashrc" fi elif [[ $MODE == "conda" ]]; then write_hook tst_env "$(pwd)" diff --git a/devtools/install_crest.sh b/devtools/install_crest.sh new file mode 100644 index 0000000000..1086ec9db2 --- /dev/null +++ b/devtools/install_crest.sh @@ -0,0 +1,64 @@ +#!/bin/bash -l +set -eo pipefail + +if command -v micromamba &> /dev/null; then + echo "✔️ Micromamba is installed." + COMMAND_PKG=micromamba +elif command -v mamba &> /dev/null; then + echo "✔️ Mamba is installed." 
+ COMMAND_PKG=mamba +elif command -v conda &> /dev/null; then + echo "✔️ Conda is installed." + COMMAND_PKG=conda +else + echo "❌ Micromamba, Mamba, or Conda is required. Please install one." + exit 1 +fi + +if [ "$COMMAND_PKG" = "micromamba" ]; then + eval "$(micromamba shell hook --shell=bash)" +else + BASE=$(conda info --base) + . "$BASE/etc/profile.d/conda.sh" +fi + +ENV_FILE="devtools/crest_environment.yml" + +if [ ! -f "$ENV_FILE" ]; then + echo "❌ File not found: $ENV_FILE" + exit 1 +fi + +if $COMMAND_PKG env list | grep -q '^crest_env\s'; then + echo ">>> Updating existing crest_env..." + $COMMAND_PKG env update -n crest_env -f "$ENV_FILE" --prune +else + echo ">>> Creating new crest_env..." + $COMMAND_PKG env create -n crest_env -f "$ENV_FILE" -y +fi + +echo ">>> Checking CREST installation..." + +if [ "$COMMAND_PKG" = "micromamba" ]; then + CREST_RUNNER="micromamba run -n crest_env" + CREST_LISTER="micromamba list -n crest_env" +else + CREST_RUNNER="conda run -n crest_env" + CREST_LISTER="conda list -n crest_env" +fi + +if $CREST_RUNNER crest --version &> /dev/null; then + version_output=$($CREST_RUNNER crest --version 2>&1) + echo "$version_output" + installed_version=$(printf '%s' "$version_output" | tr '\n' ' ' | sed -n 's/.*Version[[:space:]]\+\([0-9.][0-9.]*\).*/\1/p') + if [ "$installed_version" != "2.12" ]; then + echo "❌ CREST version mismatch (expected 2.12)." + exit 1 + fi + echo "✔️ CREST 2.12 is successfully installed." +else + echo "❌ CREST is not found in PATH. Please check the environment." + exit 1 +fi + +echo "✅ Done installing CREST (crest_env)." 
diff --git a/devtools/install_gcn.sh b/devtools/install_gcn.sh index 8f83a2cda1..5273353d77 100644 --- a/devtools/install_gcn.sh +++ b/devtools/install_gcn.sh @@ -93,12 +93,12 @@ write_hook() { # env_name repo_path rm -f "$act" "$deact" # --- activation hook ----------------------------------------------------- - cat <<'ACTHOOK' >"$act" + cat <"$act" # TS-GCN hook – $(date +%F) export TSGCN_ROOT="$repo" -case ":$PYTHONPATH:" in - *":$TSGCN_ROOT:") ;; \ - *) export PYTHONPATH="$TSGCN_ROOT:\${PYTHONPATH:-}" ;; +case ":\$PYTHONPATH:" in + *":\$TSGCN_ROOT:") ;; \ + *) export PYTHONPATH="\$TSGCN_ROOT:\${PYTHONPATH:-}" ;; esac ACTHOOK @@ -182,46 +182,43 @@ CORE_PKGS=( # ── inline env creation & unified PyTorch install -------------------------- if $COMMAND_PKG env list | awk '{print $1}' | grep -qx ts_gcn; then - $COMMAND_PKG env update -n ts_gcn \ + $COMMAND_PKG install -n ts_gcn \ -c schrodinger -c conda-forge \ --channel-priority flexible \ "${CORE_PKGS[@]}" \ - --prune -y + --yes else - $COMMAND_PKG env create -n ts_gcn \ + $COMMAND_PKG create -n ts_gcn \ -c schrodinger -c conda-forge \ --channel-priority flexible \ "${CORE_PKGS[@]}" \ - -y + --yes fi - # 2) activate it - we set +u to avoid printing variable names - # that are not set yet - set +u; $COMMAND_PKG activate ts_gcn; set -u - - # 3) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index) - WHEEL=https://download.pytorch.org/whl/torch_stable.html - if [[ $CUDA_VERSION == cpu ]]; then -pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL - else - pip install torch==1.7.1+${CUDA_VERSION} \ - torchvision==0.8.2+${CUDA_VERSION} \ - torchaudio==0.7.2+${CUDA_VERSION} \ - -f $WHEEL - fi - # for PyG wheels use the official PyG index—with a real '+' in the URL - TORCH_VER=1.7.1 - WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html" - - # install ONLY the prebuilt binaries, never fall back to source - pip install torch-scatter -f "$WHEEL_URL" 
--only-binary torch-scatter - pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse - pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster - pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv - - # finally the meta‐package (this one can install from PyPI) - pip install torch-geometric - echo "✅ ts_gcn environment ready" +# 2) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index) +PIP_RUN=("$COMMAND_PKG" run -n ts_gcn) +WHEEL=https://download.pytorch.org/whl/torch_stable.html +if [[ $CUDA_VERSION == cpu ]]; then + "${PIP_RUN[@]}" pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL +else + "${PIP_RUN[@]}" pip install torch==1.7.1+${CUDA_VERSION} \ + torchvision==0.8.2+${CUDA_VERSION} \ + torchaudio==0.7.2+${CUDA_VERSION} \ + -f $WHEEL +fi +# for PyG wheels use the official PyG index—with a real '+' in the URL +TORCH_VER=1.7.1 +WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html" + +# install ONLY the prebuilt binaries, never fall back to source +"${PIP_RUN[@]}" pip install torch-scatter -f "$WHEEL_URL" --only-binary torch-scatter +"${PIP_RUN[@]}" pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse +"${PIP_RUN[@]}" pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster +"${PIP_RUN[@]}" pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv + +# finally the meta‐package (this one can install from PyPI) +"${PIP_RUN[@]}" pip install torch-geometric +echo "✅ ts_gcn environment ready" # ── write hooks into conda envs if required ------------------------------- if [[ $MODE == conda ]]; then diff --git a/devtools/install_pyrdl.sh b/devtools/install_pyrdl.sh index 529d9d5dc3..edcb5ed9da 100644 --- a/devtools/install_pyrdl.sh +++ b/devtools/install_pyrdl.sh @@ -51,8 +51,8 @@ fi # Ensure CMake is installed in the environment if ! command -v cmake &> /dev/null; then - echo "Installing CMake..." 
- "$COMMAND_PKG" install -y cmake + echo "Installing CMake into arc_env..." + "$COMMAND_PKG" install -n arc_env -c conda-forge -y cmake fi # Clone and build RingDecomposerLib diff --git a/devtools/install_torchani.sh b/devtools/install_torchani.sh index 5410e88658..992031d014 100644 --- a/devtools/install_torchani.sh +++ b/devtools/install_torchani.sh @@ -2,9 +2,10 @@ set -eo pipefail # Enable tracing of each command, but tee it to a logfile +LOGFILE="tani_env_setup.log" exec 3>&1 4>&2 trap 'exec 2>&4 1>&3' EXIT -exec 1> >(tee .log) 2>&1 +exec 1> >(tee "$LOGFILE") 2>&1 set -x echo ">>> Starting TANI environment setup at $(date)" @@ -53,7 +54,7 @@ fi echo ">>> Creating conda env from $ENV_YAML (name=$ENV_NAME)" if ! $COMMAND_PKG env create -n "$ENV_NAME" -f "$ENV_YAML" -v; then echo "❌ Environment creation failed. Dumping last 200 lines of log:" - tail -n 200 tani_env_setup.log + tail -n 200 "$LOGFILE" echo "---- Disk usage at failure ----" df -h . exit 1 From e265cddbcdde4e43e43af75e670bdffb53c56381 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:24:20 +0200 Subject: [PATCH 03/60] Adds CREST TS search adapter Adds a CREST adapter for transition state (TS) conformer searches, leveraging heuristics-generated guesses to find suitable TS structures. This facilitates more comprehensive TS exploration, particularly for reaction families supported by heuristics but potentially refined through CREST's conformer searching capabilities. Also introduces a TS seed hub, which centralizes requests to base TS-search adapters, and provides wrapper adapters (e.g., CREST) family-specific constraints for a seed. 
--- arc/job/adapters/ts/__init__.py | 2 + arc/job/adapters/ts/crest.py | 520 +++++++++++++++++++++++++ arc/job/adapters/ts/crest_test.py | 146 +++++++ arc/job/adapters/ts/heuristics.py | 436 +++++++++++++-------- arc/job/adapters/ts/heuristics_test.py | 59 +++ arc/job/adapters/ts/seed_hub.py | 168 ++++++++ 6 files changed, 1164 insertions(+), 167 deletions(-) create mode 100644 arc/job/adapters/ts/crest.py create mode 100644 arc/job/adapters/ts/crest_test.py create mode 100644 arc/job/adapters/ts/seed_hub.py diff --git a/arc/job/adapters/ts/__init__.py b/arc/job/adapters/ts/__init__.py index 29444e0ed4..fba9ebf26e 100644 --- a/arc/job/adapters/ts/__init__.py +++ b/arc/job/adapters/ts/__init__.py @@ -1,5 +1,7 @@ import arc.job.adapters.ts.autotst_ts +import arc.job.adapters.ts.crest import arc.job.adapters.ts.gcn_ts import arc.job.adapters.ts.heuristics import arc.job.adapters.ts.kinbot_ts +import arc.job.adapters.ts.seed_hub import arc.job.adapters.ts.xtb_gsm diff --git a/arc/job/adapters/ts/crest.py b/arc/job/adapters/ts/crest.py new file mode 100644 index 0000000000..6396a968da --- /dev/null +++ b/arc/job/adapters/ts/crest.py @@ -0,0 +1,520 @@ +""" +Utilities for running CREST within ARC. + +Separated from heuristics so CREST can be conditionally imported and reused. 
+""" + +import datetime +import os +import time +from typing import TYPE_CHECKING, List, Optional, Union + +from arc.common import almost_equal_coords, get_logger +from arc.imports import settings, submit_scripts +from arc.job.adapter import JobAdapter +from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family +from arc.job.adapters.ts.heuristics import DIHEDRAL_INCREMENT +from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints +from arc.job.factory import register_job_adapter +from arc.job.local import check_job_status, submit_job +from arc.plotter import save_geo +from arc.species.converter import reorder_xyz_string, str_to_xyz, xyz_to_str +from arc.species.species import ARCSpecies, TSGuess + +if TYPE_CHECKING: + from arc.level import Level + from arc.reaction import ARCReaction + +logger = get_logger() + +MAX_CHECK_INTERVAL_SECONDS = 100 + +CREST_PATH = settings.get("CREST_PATH", None) +CREST_ENV_PATH = settings.get("CREST_ENV_PATH", None) +SERVERS = settings.get("servers", {}) + + +def crest_available() -> bool: + """ + Return whether CREST is configured for use. + """ + return bool(SERVERS.get("local")) and bool(CREST_PATH or CREST_ENV_PATH) + + +class CrestAdapter(JobAdapter): + """ + A class for executing CREST TS conformer searches based on heuristics-generated guesses. 
+ """ + + def __init__(self, + project: str, + project_directory: str, + job_type: Union[List[str], str], + args: Optional[dict] = None, + bath_gas: Optional[str] = None, + checkfile: Optional[str] = None, + conformer: Optional[int] = None, + constraints: Optional[List] = None, + cpu_cores: Optional[str] = None, + dihedral_increment: Optional[float] = None, + dihedrals: Optional[List[float]] = None, + directed_scan_type: Optional[str] = None, + ess_settings: Optional[dict] = None, + ess_trsh_methods: Optional[List[str]] = None, + execution_type: Optional[str] = None, + fine: bool = False, + initial_time: Optional[Union['datetime.datetime', str]] = None, + irc_direction: Optional[str] = None, + job_id: Optional[int] = None, + job_memory_gb: float = 14.0, + job_name: Optional[str] = None, + job_num: Optional[int] = None, + job_server_name: Optional[str] = None, + job_status: Optional[List[Union[dict, str]]] = None, + level: Optional['Level'] = None, + max_job_time: Optional[float] = None, + run_multi_species: bool = False, + reactions: Optional[List['ARCReaction']] = None, + rotor_index: Optional[int] = None, + server: Optional[str] = None, + server_nodes: Optional[list] = None, + queue: Optional[str] = None, + attempted_queues: Optional[List[str]] = None, + species: Optional[List[ARCSpecies]] = None, + testing: bool = False, + times_rerun: int = 0, + torsions: Optional[List[List[int]]] = None, + tsg: Optional[int] = None, + xyz: Optional[dict] = None, + ): + + self.incore_capacity = 50 + self.job_adapter = 'crest' + self.command = None + self.execution_type = execution_type or 'incore' + + if reactions is None: + raise ValueError('Cannot execute TS CREST without ARCReaction object(s).') + + dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT + + _initialize_adapter(obj=self, + is_ts=True, + project=project, + project_directory=project_directory, + job_type=job_type, + args=args, + bath_gas=bath_gas, + checkfile=checkfile, + conformer=conformer, + 
constraints=constraints, + cpu_cores=cpu_cores, + dihedral_increment=dihedral_increment, + dihedrals=dihedrals, + directed_scan_type=directed_scan_type, + ess_settings=ess_settings, + ess_trsh_methods=ess_trsh_methods, + fine=fine, + initial_time=initial_time, + irc_direction=irc_direction, + job_id=job_id, + job_memory_gb=job_memory_gb, + job_name=job_name, + job_num=job_num, + job_server_name=job_server_name, + job_status=job_status, + level=level, + max_job_time=max_job_time, + run_multi_species=run_multi_species, + reactions=reactions, + rotor_index=rotor_index, + server=server, + server_nodes=server_nodes, + queue=queue, + attempted_queues=attempted_queues, + species=species, + testing=testing, + times_rerun=times_rerun, + torsions=torsions, + tsg=tsg, + xyz=xyz, + ) + + def write_input_file(self) -> None: + pass + + def set_files(self) -> None: + pass + + def set_additional_file_paths(self) -> None: + pass + + def set_input_file_memory(self) -> None: + pass + + def execute_incore(self): + self._log_job_execution() + self.initial_time = self.initial_time if self.initial_time else datetime.datetime.now() + + supported_families = [key for key, val in ts_adapters_by_rmg_family.items() if 'crest' in val] + + self.reactions = [self.reactions] if not isinstance(self.reactions, list) else self.reactions + for rxn in self.reactions: + if rxn.family not in supported_families: + logger.warning(f'The CREST TS search adapter does not support the {rxn.family} reaction family.') + continue + if any(spc.get_xyz() is None for spc in rxn.r_species + rxn.p_species): + logger.warning(f'The CREST TS search adapter cannot process a reaction if 3D coordinates of ' + f'some/all of its reactants/products are missing.\nNot processing {rxn}.') + continue + if not crest_available(): + logger.warning('CREST is not available. 
Skipping CREST TS search.') + break + + if rxn.ts_species is None: + rxn.ts_species = ARCSpecies(label='TS', + is_ts=True, + charge=rxn.charge, + multiplicity=rxn.multiplicity, + ) + + tsg = TSGuess(method='CREST') + tsg.tic() + + crest_job_dirs = [] + xyz_guesses = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + if not xyz_guesses: + logger.warning(f'CREST TS search failed to generate any seed guesses for {rxn.label}.') + tsg.tok() + continue + + for iteration, xyz_entry in enumerate(xyz_guesses): + xyz_guess = xyz_entry.get("xyz") + family = xyz_entry.get("family", rxn.family) + if xyz_guess is None: + continue + + crest_constraint_atoms = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=xyz_entry, + ) + if not crest_constraint_atoms: + logger.warning( + f"Could not determine CREST constraint atoms for {rxn.label} crest seed {iteration} " + f"(family: {family}). Skipping this CREST seed." + ) + continue + + crest_job_dir = crest_ts_conformer_search( + xyz_guess, + crest_constraint_atoms["A"], + crest_constraint_atoms["H"], + crest_constraint_atoms["B"], + path=self.local_path, + xyz_crest_int=iteration, + ) + crest_job_dirs.append(crest_job_dir) + + if not crest_job_dirs: + logger.warning(f'CREST TS search failed to prepare any jobs for {rxn.label}.') + tsg.tok() + continue + + crest_jobs = submit_crest_jobs(crest_job_dirs) + monitor_crest_jobs(crest_jobs) + xyz_guesses_crest = process_completed_jobs(crest_jobs) + tsg.tok() + + for method_index, xyz in enumerate(xyz_guesses_crest): + if xyz is None: + continue + unique = True + for other_tsg in rxn.ts_species.ts_guesses: + if almost_equal_coords(xyz, other_tsg.initial_xyz): + if hasattr(other_tsg, "method_sources"): + other_tsg.method_sources = other_tsg._normalize_method_sources( + (other_tsg.method_sources or []) + ["crest"] + ) + unique = False + break + if unique: + ts_guess = TSGuess(method='CREST', + 
index=len(rxn.ts_species.ts_guesses), + method_index=method_index, + t0=tsg.t0, + execution_time=tsg.execution_time, + success=True, + family=rxn.family, + xyz=xyz, + ) + rxn.ts_species.ts_guesses.append(ts_guess) + save_geo(xyz=xyz, + path=self.local_path, + filename=f'CREST_{method_index}', + format_='xyz', + comment=f'CREST {method_index}, family: {rxn.family}', + ) + + if len(self.reactions) < 5: + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'crest' in tsg.method.lower()] + if successes: + logger.info(f'CREST successfully found {len(successes)} TS guesses for {rxn.label}.') + else: + logger.info(f'CREST did not find any successful TS guesses for {rxn.label}.') + + self.final_time = datetime.datetime.now() + + def execute_queue(self): + self.execute_incore() + + +def crest_ts_conformer_search( + xyz_guess: dict, + a_atom: int, + h_atom: int, + b_atom: int, + path: str = "", + xyz_crest_int: int = 0, +) -> str: + """ + Prepare a CREST TS conformer search job: + - Write coords.ref and constraints.inp + - Write a PBS/HTCondor submit script using submit_scripts["local"]["crest"] + - Return the CREST job directory path + """ + path = os.path.join(path, f"crest_{xyz_crest_int}") + os.makedirs(path, exist_ok=True) + + # --- coords.ref --- + symbols = xyz_guess["symbols"] + converted_coords = reorder_xyz_string( + xyz_str=xyz_to_str(xyz_guess), + reverse_atoms=True, + convert_to="bohr", + ) + coords_ref_content = f"$coord\n{converted_coords}\n$end\n" + coords_ref_path = os.path.join(path, "coords.ref") + with open(coords_ref_path, "w") as f: + f.write(coords_ref_content) + + # --- constraints.inp --- + num_atoms = len(symbols) + # CREST uses 1-based indices + a_atom += 1 + h_atom += 1 + b_atom += 1 + + # All atoms not directly involved in A–H–B go into the metadynamics atom list + list_of_atoms_numbers_not_participating_in_reaction = [ + i for i in range(1, num_atoms + 1) if i not in [a_atom, h_atom, b_atom] + ] + + constraints_path = 
os.path.join(path, "constraints.inp") + with open(constraints_path, "w") as f: + f.write("$constrain\n") + f.write(f" atoms: {a_atom}, {h_atom}, {b_atom}\n") + f.write(" force constant: 0.5\n") + f.write(" reference=coords.ref\n") + f.write(f" distance: {a_atom}, {h_atom}, auto\n") + f.write(f" distance: {h_atom}, {b_atom}, auto\n") + f.write("$metadyn\n") + if list_of_atoms_numbers_not_participating_in_reaction: + f.write( + f' atoms: {", ".join(map(str, list_of_atoms_numbers_not_participating_in_reaction))}\n' + ) + f.write("$end\n") + + # --- build CREST command string --- + # Example: crest coords.ref --cinp constraints.inp --noreftopo -T 8 + local_server = SERVERS.get("local", {}) + cpus = int(local_server.get("cpus", 8)) + if CREST_ENV_PATH: + crest_exe = "crest" + else: + crest_exe = CREST_PATH if CREST_PATH is not None else "crest" + + commands = [ + crest_exe, + "coords.ref", + "--cinp constraints.inp", + "--noreftopo", + f"-T {cpus}", + ] + command = " ".join(commands) + + # --- activation line (optional) --- + activation_line = CREST_ENV_PATH or "" + + if SERVERS.get("local") is not None: + cluster_soft = SERVERS["local"]["cluster_soft"].lower() + local_templates = submit_scripts.get("local", {}) + crest_template = local_templates.get("crest") + crest_job_template = local_templates.get("crest_job") + + if cluster_soft in ["condor", "htcondor"]: + # HTCondor branch with a built-in fallback template. 
+ if crest_template is None: + crest_template = ( + "universe = vanilla\n" + "executable = job.sh\n" + "output = out.txt\n" + "error = err.txt\n" + "log = log.txt\n" + "request_cpus = {cpus}\n" + "request_memory = {memory}\n" + "JobBatchName = {name}\n" + "queue\n" + ) + if crest_job_template is None: + crest_job_template = ( + "#!/bin/bash -l\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + "memory": int(SERVERS["local"].get("memory", 32.0) * 1024), + } + sub_job = sub_job.format(**format_params) + + with open( + os.path.join(path, settings["submit_filenames"]["HTCondor"]), "w" + ) as f: + f.write(sub_job) + + crest_job = crest_job_template.format( + path=path, + activation_line=activation_line, + commands=command, + ) + + with open(os.path.join(path, "job.sh"), "w") as f: + f.write(crest_job) + os.chmod(os.path.join(path, "job.sh"), 0o700) + + # Pre-create out/err for any status checkers that expect them + for fname in ("out.txt", "err.txt"): + fpath = os.path.join(path, fname) + if not os.path.exists(fpath): + with open(fpath, "w") as f: + f.write("") + os.chmod(fpath, 0o600) + + elif cluster_soft == "pbs": + # PBS branch with a built-in fallback template. 
+ if crest_template is None: + crest_template = ( + "#!/bin/bash -l\n" + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + "#PBS -o out.txt\n" + "#PBS -e err.txt\n\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "queue": SERVERS["local"].get("queue", "alon_q"), + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + # 'memory' is in GB for the template: mem={memory}gb + "memory": int( + SERVERS["local"].get("memory", 32) + if SERVERS["local"].get("memory", 32) < 60 + else 40 + ), + "activation_line": activation_line, + "path": path, + "commands": command, + } + sub_job = sub_job.format(**format_params) + + submit_filename = settings["submit_filenames"]["PBS"] # usually 'submit.sh' + submit_path = os.path.join(path, submit_filename) + with open(submit_path, "w") as f: + f.write(sub_job) + os.chmod(submit_path, 0o700) + + else: + raise ValueError(f"Unsupported cluster_soft for CREST: {cluster_soft!r}") + + return path + + +def submit_crest_jobs(crest_paths: List[str]) -> dict: + """ + Submit CREST jobs to the server. + + Args: + crest_paths (List[str]): List of paths to the CREST directories. + + Returns: + dict: A dictionary containing job IDs as keys and their statuses as values. + """ + crest_jobs = {} + for crest_path in crest_paths: + job_status, job_id = submit_job(path=crest_path) + logger.info(f"CREST job {job_id} submitted for {crest_path}") + crest_jobs[job_id] = {"path": crest_path, "status": job_status} + return crest_jobs + + +def monitor_crest_jobs(crest_jobs: dict, check_interval: int = 300) -> None: + """ + Monitor CREST jobs until they are complete. + + Args: + crest_jobs (dict): Dictionary containing job information (job ID, path, and status). + check_interval (int): Time interval (in seconds) to wait between status checks. 
+ """ + while True: + all_done = True + for job_id, job_info in crest_jobs.items(): + if job_info["status"] not in ["done", "failed"]: + try: + job_info["status"] = check_job_status(job_id) # Update job status + except Exception as e: + logger.error(f"Error checking job status for job {job_id}: {e}") + job_info["status"] = "failed" + if job_info["status"] not in ["done", "failed"]: + all_done = False + if all_done: + break + time.sleep(min(check_interval, MAX_CHECK_INTERVAL_SECONDS)) + + +def process_completed_jobs(crest_jobs: dict) -> list: + """ + Process the completed CREST jobs and update XYZ guesses. + + Args: + crest_jobs (dict): Dictionary containing job information. + """ + xyz_guesses = [] + for job_id, job_info in crest_jobs.items(): + crest_path = job_info["path"] + if job_info["status"] == "done": + crest_best_path = os.path.join(crest_path, "crest_best.xyz") + if os.path.exists(crest_best_path): + with open(crest_best_path, "r") as f: + content = f.read() + xyz_guess = str_to_xyz(content) + xyz_guesses.append(xyz_guess) + else: + logger.error(f"crest_best.xyz not found in {crest_path}") + elif job_info["status"] == "failed": + logger.error(f"CREST job failed for {crest_path}") + + return xyz_guesses + +register_job_adapter('crest', CrestAdapter) diff --git a/arc/job/adapters/ts/crest_test.py b/arc/job/adapters/ts/crest_test.py new file mode 100644 index 0000000000..e243d8d43d --- /dev/null +++ b/arc/job/adapters/ts/crest_test.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.job.adapters.ts.crest +""" + +import os +import tempfile +import unittest + +from arc.species.converter import str_to_xyz + + +class TestCrestAdapter(unittest.TestCase): + """ + Tests for CREST input generation. 
+ """ + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmpdir.cleanup() + + def test_creates_valid_input_files(self): + """ + Ensure CREST inputs are written with expected content/format. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = { + "local": { + "crest": ( + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + ), + "crest_job": "{activation_line}\ncd {path}\n{commands}\n", + } + } + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=0 + ) + + coords_path = os.path.join(crest_dir, "coords.ref") + constraints_path = os.path.join(crest_dir, "constraints.inp") + submit_path = os.path.join(crest_dir, "submit.sh") + + self.assertTrue(os.path.exists(coords_path)) + self.assertTrue(os.path.exists(constraints_path)) + self.assertTrue(os.path.exists(submit_path)) + + with open(coords_path) as f: + coords = f.read().strip().splitlines() + self.assertEqual(coords[0].strip(), "$coord") + self.assertEqual(coords[-1].strip(), "$end") + self.assertEqual(len(coords) - 2, len(xyz["symbols"])) + + with open(constraints_path) as f: + constraints = f.read() + self.assertIn("atoms: 1, 2, 3", constraints) + self.assertIn("force constant: 0.5", constraints) + self.assertIn("reference=coords.ref", constraints) + 
self.assertIn("distance: 1, 2, auto", constraints) + self.assertIn("distance: 2, 3, auto", constraints) + self.assertIn("$metadyn", constraints) + self.assertTrue(constraints.strip().endswith("$end")) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + def test_creates_submit_file_without_crest_templates(self): + """ + Ensure fallback submit template generation works when submit.py has no CREST templates. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = {"local": {}} + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=1 + ) + + submit_path = os.path.join(crest_dir, "submit.sh") + self.assertTrue(os.path.exists(submit_path)) + with open(submit_path) as f: + submit_text = f.read() + self.assertIn("#PBS -q testq", submit_text) + self.assertIn("coords.ref --cinp constraints.inp --noreftopo -T 4", submit_text) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + +if __name__ == "__main__": + unittest.main() 
diff --git a/arc/job/adapters/ts/heuristics.py b/arc/job/adapters/ts/heuristics.py index 9031aa9ec3..8582735cab 100644 --- a/arc/job/adapters/ts/heuristics.py +++ b/arc/job/adapters/ts/heuristics.py @@ -21,28 +21,44 @@ import os from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from arc.common import (ARC_PATH, almost_equal_coords, get_angle_in_180_range, get_logger, is_angle_linear, - is_xyz_linear, key_by_val, read_yaml_file) +from arc.common import ( + ARC_PATH, + almost_equal_coords, + get_angle_in_180_range, + get_logger, + is_angle_linear, + is_xyz_linear, + key_by_val, + read_yaml_file, +) from arc.family import get_reaction_family_products from arc.job.adapter import JobAdapter from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family from arc.job.factory import register_job_adapter from arc.plotter import save_geo -from arc.species.converter import (compare_zmats, relocate_zmat_dummy_atoms_to_the_end, zmat_from_xyz, zmat_to_xyz, - add_atom_to_xyz_using_internal_coords, sorted_distances_of_atom) +from arc.species.converter import ( + add_atom_to_xyz_using_internal_coords, + compare_zmats, + relocate_zmat_dummy_atoms_to_the_end, + sorted_distances_of_atom, + zmat_from_xyz, + zmat_to_xyz, +) from arc.mapping.engine import map_two_species from arc.molecule.molecule import Molecule from arc.species.species import ARCSpecies, TSGuess, SpeciesError, colliding_atoms from arc.species.zmat import get_parameter_from_atom_indices, remove_zmat_atom_0, up_param, xyz_to_zmat from arc.species.vectors import calculate_angle +from arc.job.adapters.ts.seed_hub import get_ts_seeds if TYPE_CHECKING: from arc.level import Level from arc.reaction import ARCReaction - -FAMILY_SETS = {'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], - 'hydrolysis_set_2': ['nitrile_hydrolysis']} +FAMILY_SETS = { + 'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], + 'hydrolysis_set_2': 
['nitrile_hydrolysis'], +} DIHEDRAL_INCREMENT = 30 @@ -258,56 +274,60 @@ def execute_incore(self): multiplicity=rxn.multiplicity, ) - xyzs = list() - tsg, families = None, None - if rxn.family == 'H_Abstraction': - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs = h_abstraction(reaction=rxn, dihedral_increment=self.dihedral_increment) - tsg.tok() - + tsg = TSGuess(method='Heuristics') + tsg.tic() + xyzs = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + tsg.tok() if rxn.family in FAMILY_SETS['hydrolysis_set_1'] or rxn.family in FAMILY_SETS['hydrolysis_set_2']: - try: - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs, families, indices = hydrolysis(reaction=rxn) - tsg.tok() - if not xyzs: - logger.warning(f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.') - continue - except ValueError: + if not xyzs: + logger.warning( + f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.' 
+ ) continue - for method_index, xyz in enumerate(xyzs): + for method_index, xyz_entry in enumerate(xyzs): + xyz = xyz_entry.get("xyz") + method_label = xyz_entry.get("method", "Heuristics") + family = xyz_entry.get("family", rxn.family) + if xyz is None: + continue unique = True for other_tsg in rxn.ts_species.ts_guesses: if almost_equal_coords(xyz, other_tsg.initial_xyz): - if 'heuristics' not in other_tsg.method.lower(): - other_tsg.method += ' and Heuristics' + existing_sources = getattr(other_tsg, "method_sources", None) + if existing_sources is not None: + combined_sources = list(existing_sources) + [method_label] + else: + combined_sources = [other_tsg.method, method_label] + other_tsg.method_sources = TSGuess._normalize_method_sources(combined_sources) unique = False break if unique: - ts_guess = TSGuess(method='Heuristics', + ts_guess = TSGuess(method=method_label, index=len(rxn.ts_species.ts_guesses), method_index=method_index, t0=tsg.t0, execution_time=tsg.execution_time, success=True, - family=rxn.family if families is None else families[method_index], + family=family, xyz=xyz, ) rxn.ts_species.ts_guesses.append(ts_guess) save_geo(xyz=xyz, path=self.local_path, - filename=f'Heuristics_{method_index}', + filename=f'{method_label}_{method_index}', format_='xyz', - comment=f'Heuristics {method_index}, family: {rxn.family}', + comment=f'{method_label} {method_index}, family: {rxn.family}', ) if len(self.reactions) < 5: - successes = len([tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'heuristics' in tsg.method]) + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success] if successes: - logger.info(f'Heuristics successfully found {successes} TS guesses for {rxn.label}.') + logger.info(f'Heuristics successfully found {len(successes)} TS guesses for {rxn.label}.') else: logger.info(f'Heuristics did not find any successful TS guesses for {rxn.label}.') @@ -873,7 +893,7 @@ def h_abstraction(reaction: 'ARCReaction', dihedral_increment 
(int, optional): The dihedral increment to use for B-H-A-C and D-B-H-C dihedral scans. Returns: List[dict] - Entries are Cartesian coordinates of TS guesses for all reactions. + Entries hold Cartesian coordinates of TS guesses and the generating method label. """ xyz_guesses = list() dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT @@ -952,7 +972,8 @@ def h_abstraction(reaction: 'ARCReaction', else: # This TS is unique, and has no atom collisions. zmats.append(zmat_guess) - xyz_guesses.append(xyz_guess) + xyz_guesses.append({"xyz": xyz_guess, "method": "Heuristics"}) + return xyz_guesses @@ -987,9 +1008,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -999,9 +1022,19 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in } adjustments_to_try = [False, True] if dihedrals_to_change_num == 1 else [True] for adjust_dihedral in adjustments_to_try: - chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices(initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters,reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=adjust_dihedral) + chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=adjust_dihedral, + ) 
max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) if xyz_guesses: xyz_guesses_total.extend(xyz_guesses) @@ -1015,8 +1048,8 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in condition_met = len(xyz_guesses_total) > 0 nitrile_in_inputs = any( - (pd.get("family") == "nitrile_hydrolysis") or - (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) + (pd.get("family") == "nitrile_hydrolysis") + or (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) for pd in product_dicts ) nitrile_already_found = any(fam == "nitrile_hydrolysis" for fam in reaction_families) @@ -1032,9 +1065,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -1048,10 +1083,18 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in break dihedrals_to_change_num += 1 chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( - initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters, reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=True, - allow_nitrile_dihedrals=True + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=True, + allow_nitrile_dihedrals=True, ) max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) @@ -1083,11 
+1126,13 @@ def get_products_and_check_families(reaction: 'ARCReaction') -> Tuple[List[dict] consider_arc_families=True, ) carbonyl_based_present = any( - "carbonyl_based_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "carbonyl_based_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) ether_present = any( - "ether_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "ether_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) @@ -1118,9 +1163,11 @@ def has_carbonyl_based_hydrolysis(reaction_families: List[dict]) -> bool: return any(family == "carbonyl_based_hydrolysis" for family in reaction_families) -def extract_reactant_and_indices(reaction: 'ARCReaction', - product_dict: dict, - is_set_1: bool) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: +def extract_reactant_and_indices( + reaction: 'ARCReaction', + product_dict: dict, + is_set_1: bool, +) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: """ Extract the reactant molecules and relevant atomic indices (a,b,e,d,o,h1) for the hydrolysis reaction. 
@@ -1163,11 +1210,13 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', main_reactant, a_xyz_index, b_xyz_index, - two_neighbors + two_neighbors, ) except ValueError as e: - raise ValueError(f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " - f"in species {main_reactant.label}: {e}") + raise ValueError( + f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " + f"in species {main_reactant.label}: {e}" + ) o_index = len(main_reactant.mol.atoms) h1_index = o_index + 1 @@ -1178,25 +1227,26 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', "e": e_xyz_index, "d": d_xyz_indices, "o": o_index, - "h1": h1_index + "h1": h1_index, } return main_reactant, water, initial_xyz, xyz_indices -def process_chosen_d_indices(initial_xyz: dict, - base_xyz_indices: dict, - xyz_indices: dict, - hydrolysis_parameters: dict, - reaction_family: str, - water: 'ARCSpecies', - zmats_total: List[dict], - is_set_1: bool, - is_set_2: bool, - dihedrals_to_change_num: int, - should_adjust_dihedral: bool, - allow_nitrile_dihedrals: bool = False - ) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: +def process_chosen_d_indices( + initial_xyz: dict, + base_xyz_indices: dict, + xyz_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, + water: 'ARCSpecies', + zmats_total: List[dict], + is_set_1: bool, + is_set_2: bool, + dihedrals_to_change_num: int, + should_adjust_dihedral: bool, + allow_nitrile_dihedrals: bool = False, +) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: """ Iterates over the 'd' indices to process TS guess generation. @@ -1214,7 +1264,6 @@ def process_chosen_d_indices(initial_xyz: dict, should_adjust_dihedral (bool): Whether to adjust dihedral angles. allow_nitrile_dihedrals (bool, optional): Force-enable dihedral adjustments for nitriles. Defaults to False. 
- Returns: Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]]]: - Chosen indices for TS generation. @@ -1224,11 +1273,18 @@ def process_chosen_d_indices(initial_xyz: dict, """ max_dihedrals_found = 0 for d_index in xyz_indices.get("d", []) or [None]: - chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else {**base_xyz_indices, - "d": None} + chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else { + **base_xyz_indices, + "d": None, + } current_zmat, zmat_indices = setup_zmat_indices(initial_xyz, chosen_xyz_indices) - matches = get_matching_dihedrals(current_zmat, zmat_indices['a'], zmat_indices['b'], - zmat_indices['e'], zmat_indices['d']) + matches = get_matching_dihedrals( + current_zmat, + zmat_indices['a'], + zmat_indices['b'], + zmat_indices['e'], + zmat_indices['d'], + ) max_dihedrals_found = max(max_dihedrals_found, len(matches)) if should_adjust_dihedral and dihedrals_to_change_num > len(matches): continue @@ -1246,22 +1302,28 @@ def process_chosen_d_indices(initial_xyz: dict, zmat_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors) if zmat_variants: adjusted_zmats.extend(zmat_variants) - if not adjusted_zmats: - pass - else: + if adjusted_zmats: zmats_to_process = adjusted_zmats ts_guesses_list = [] for zmat_to_process in zmats_to_process: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total) + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, + ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) if attempted_dihedral_adjustments and not ts_guesses_list and ( - reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals): - flipped_zmats= [] + reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals + ): + 
flipped_zmats = [] adjustment_factors = [15, 25, 35, 45, 55] for indices in indices_list: flipped_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors, flip=True) @@ -1269,8 +1331,14 @@ def process_chosen_d_indices(initial_xyz: dict, for zmat_to_process in flipped_zmats: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) @@ -1311,10 +1379,12 @@ def get_main_reactant_and_water_from_hydrolysis_reaction(reaction: 'ARCReaction' return arc_reactant, water -def get_neighbors_by_electronegativity(spc: 'ARCSpecies', - atom_index: int, - exclude_index: int, - two_neighbors: bool = True) -> Tuple[int, List[int]]: +def get_neighbors_by_electronegativity( + spc: 'ARCSpecies', + atom_index: int, + exclude_index: int, + two_neighbors: bool = True, +) -> Tuple[int, List[int]]: """ Retrieve the top two neighbors of a given atom in a species, sorted by their effective electronegativity, excluding a specified neighbor. @@ -1340,8 +1410,11 @@ def get_neighbors_by_electronegativity(spc: 'ARCSpecies', Raises: ValueError: If the atom has no valid neighbors. 
""" - neighbors = [neighbor for neighbor in spc.mol.atoms[atom_index].edges.keys() - if spc.mol.atoms.index(neighbor) != exclude_index] + neighbors = [ + neighbor + for neighbor in spc.mol.atoms[atom_index].edges.keys() + if spc.mol.atoms.index(neighbor) != exclude_index + ] if not neighbors: raise ValueError(f"Atom at index {atom_index} has no valid neighbors.") @@ -1355,12 +1428,17 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: float: The total electronegativity of the neighbor """ return sum( - ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order - for n in neighbor.edges.keys() + ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order for n in neighbor.edges.keys() ) - effective_electronegativities = [(ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, - get_neighbor_total_electronegativity(n), n ) for n in neighbors] + effective_electronegativities = [ + ( + ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, + get_neighbor_total_electronegativity(n), + n, + ) + for n in neighbors + ] effective_electronegativities.sort(reverse=True, key=lambda x: (x[0], x[1])) sorted_neighbors = [spc.mol.atoms.index(n[2]) for n in effective_electronegativities] most_electronegative = sorted_neighbors[0] @@ -1368,8 +1446,7 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: return most_electronegative, remaining_neighbors -def setup_zmat_indices(initial_xyz: dict, - xyz_indices: dict) -> Tuple[dict, dict]: +def setup_zmat_indices(initial_xyz: dict, xyz_indices: dict) -> Tuple[dict, dict]: """ Convert XYZ coordinates to Z-matrix format and set up corresponding indices. 
@@ -1387,26 +1464,28 @@ def setup_zmat_indices(initial_xyz: dict, 'a': key_by_val(initial_zmat.get('map', {}), xyz_indices['a']), 'b': key_by_val(initial_zmat.get('map', {}), xyz_indices['b']), 'e': key_by_val(initial_zmat.get('map', {}), xyz_indices['e']), - 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None + 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None, } return initial_zmat, zmat_indices -def generate_dihedral_variants(zmat: dict, - indices: List[int], - adjustment_factors: List[float], - flip: bool = False, - tolerance_degrees: float = 10.0) -> List[dict]: +def generate_dihedral_variants( + zmat: dict, + indices: List[int], + adjustment_factors: List[float], + flip: bool = False, + tolerance_degrees: float = 10.0, +) -> List[dict]: """ - Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. + Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. This function creates variants of the Z-matrix using different adjustment factors: - 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. - 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid - unstable, boundary configurations. - 3. If `flip=True`, the same procedure is applied starting from a flipped - (180°-shifted) baseline angle. - 4. Each adjusted or flipped variant is deep-copied to ensure independence. + 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. + 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid + unstable, boundary configurations. + 3. If `flip=True`, the same procedure is applied starting from a flipped + (180°-shifted) baseline angle. + 4. Each adjusted or flipped variant is deep-copied to ensure independence. Args: zmat (dict): The initial Z-matrix. 
@@ -1414,7 +1493,8 @@ def generate_dihedral_variants(zmat: dict, adjustment_factors (List[float], optional): List of factors to try. flip (bool, optional): Whether to start from a flipped (180°) baseline dihedral angle. Defaults to False. - tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. Defaults to 10.0. + tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. + Defaults to 10.0. Returns: List[dict]: List of Z-matrix variants with adjusted dihedral angles. @@ -1440,8 +1520,9 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: seed_value = normalized_value if flip: seed_value = get_angle_in_180_range(normalized_value + 180.0) - boundary_like = ((abs(seed_value) < tolerance_degrees) - or (180 - tolerance_degrees <= abs(seed_value) <= 180+tolerance_degrees)) + boundary_like = (abs(seed_value) < tolerance_degrees) or ( + 180 - tolerance_degrees <= abs(seed_value) <= 180 + tolerance_degrees + ) if boundary_like: for factor in adjustment_factors: variant = copy.deepcopy(zmat) @@ -1450,11 +1531,13 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: return variants -def get_matching_dihedrals(zmat: dict, - a: int, - b: int, - e: int, - d: Optional[int]) -> List[List[int]]: +def get_matching_dihedrals( + zmat: dict, + a: int, + b: int, + e: int, + d: Optional[int], +) -> List[List[int]]: """ Retrieve all dihedral angles in the Z-matrix that match the given atom indices. 
This function scans the Z-matrix for dihedral parameters (keys starting with 'D_' or 'DX_') @@ -1484,11 +1567,13 @@ def get_matching_dihedrals(zmat: dict, return matches -def stretch_ab_bond(initial_zmat: 'dict', - xyz_indices: 'dict', - zmat_indices: 'dict', - hydrolysis_parameters: 'dict', - reaction_family: str) -> None: +def stretch_ab_bond( + initial_zmat: dict, + xyz_indices: dict, + zmat_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, +) -> None: """ Stretch the bond between atoms a and b in the Z-matrix based on the reaction family parameters. @@ -1519,16 +1604,18 @@ def stretch_ab_bond(initial_zmat: 'dict', stretch_zmat_bond(zmat=initial_zmat, indices=indices, stretch=stretch_degree) -def process_family_specific_adjustments(is_set_1: bool, - is_set_2: bool, - reaction_family: str, - hydrolysis_parameters: dict, - initial_zmat: dict, - water: 'ARCSpecies', - xyz_indices: dict, - zmats_total: List[dict]) -> Tuple[List[dict], List[dict]]: +def process_family_specific_adjustments( + is_set_1: bool, + is_set_2: bool, + reaction_family: str, + hydrolysis_parameters: dict, + initial_zmat: dict, + water: 'ARCSpecies', + xyz_indices: dict, + zmats_total: List[dict], +) -> Tuple[List[dict], List[dict]]: """ - Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses . + Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses. Args: is_set_1 (bool): Whether the reaction belongs to parameter set 1. @@ -1546,38 +1633,52 @@ def process_family_specific_adjustments(is_set_1: bool, Raises: ValueError: If the reaction family is not supported. 
""" - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices.values() + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices.values() r_atoms = [a_xyz, o_xyz, o_xyz] a_atoms = [[b_xyz, a_xyz], [a_xyz, o_xyz], [h1_xyz, o_xyz]] - d_atoms = ([[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] - if d_xyz is not None else - [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]]) + d_atoms = ( + [[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + if d_xyz is not None + else [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + ) r_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['r_value'] a_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['a_value'] d_values = hydrolysis_parameters['family_parameters'][str(reaction_family)]['d_values'] if is_set_1 or is_set_2: initial_xyz = zmat_to_xyz(initial_zmat) - return generate_hydrolysis_ts_guess(initial_xyz, xyz_indices.values(), water, r_atoms, a_atoms, d_atoms, - r_value, a_value, d_values, zmats_total, is_set_1, - threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8) + return generate_hydrolysis_ts_guess( + initial_xyz, + xyz_indices.values(), + water, + r_atoms, + a_atoms, + d_atoms, + r_value, + a_value, + d_values, + zmats_total, + is_set_1, + threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8, + ) else: raise ValueError(f"Family {reaction_family} not supported for hydrolysis TS guess generation.") -def generate_hydrolysis_ts_guess(initial_xyz: dict, - xyz_indices: List[int], - water: 'ARCSpecies', - r_atoms: List[int], - a_atoms: List[List[int]], - d_atoms: List[List[int]], - r_value: List[float], - a_value: List[float], - d_values: List[List[float]], - zmats_total: List[dict], - is_set_1: bool, - threshold: float - ) -> Tuple[List[dict], List[dict]]: +def generate_hydrolysis_ts_guess( + initial_xyz: dict, + xyz_indices: List[int], + water: 'ARCSpecies', + r_atoms: 
List[int], + a_atoms: List[List[int]], + d_atoms: List[List[int]], + r_value: List[float], + a_value: List[float], + d_values: List[List[float]], + zmats_total: List[dict], + is_set_1: bool, + threshold: float, +) -> Tuple[List[dict], List[dict]]: """ Generate Z-matrices and Cartesian coordinates for transition state (TS) guesses. @@ -1600,7 +1701,7 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, """ xyz_guesses = [] - for index, d_value in enumerate(d_values): + for d_value in d_values: xyz_guess = copy.deepcopy(initial_xyz) for i in range(3): xyz_guess = add_atom_to_xyz_using_internal_coords( @@ -1611,23 +1712,22 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, d_indices=d_atoms[i], r_value=r_value[i], a_value=a_value[i], - d_value=d_value[i] + d_value=d_value[i], ) - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices - are_valid_bonds=check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz+1, a_xyz, b_xyz]) - colliding=colliding_atoms(xyz_guess, threshold=threshold) + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices + are_valid_bonds = check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz + 1, a_xyz, b_xyz]) + colliding = colliding_atoms(xyz_guess, threshold=threshold) duplicate = any(compare_zmats(existing, xyz_to_zmat(xyz_guess)) for existing in zmats_total) if is_set_1: - dihedral_edao=[e_xyz, d_xyz, a_xyz, o_xyz] - dao_is_linear=check_dao_angle(dihedral_edao, xyz_guess) + dihedral_edao = [e_xyz, d_xyz, a_xyz, o_xyz] + dao_is_linear = check_dao_angle(dihedral_edao, xyz_guess) else: - dao_is_linear=False + dao_is_linear = False if xyz_guess is not None and not colliding and not duplicate and are_valid_bonds and not dao_is_linear: xyz_guesses.append(xyz_guess) zmats_total.append(xyz_to_zmat(xyz_guess)) - return xyz_guesses, zmats_total @@ -1644,7 +1744,7 @@ def check_dao_angle(d_indices: List[int], xyz_guess: dict) -> bool: """ angle_indices = [d_indices[1], d_indices[2], d_indices[3]] angle_value = calculate_angle(xyz_guess, angle_indices) - 
norm_value=(angle_value + 180) % 180 + norm_value = (angle_value + 180) % 180 return (norm_value < 10) or (norm_value > 170) @@ -1659,7 +1759,7 @@ def check_ts_bonds(transition_state_xyz: dict, tested_atom_indices: list) -> boo Returns: bool: Whether the transition state guess has the expected water-related bonds. """ - oxygen_index, h1_index, h2_index, a_index, b_index= tested_atom_indices + oxygen_index, h1_index, h2_index, a_index, b_index = tested_atom_indices oxygen_bonds = sorted_distances_of_atom(transition_state_xyz, oxygen_index) h1_bonds = sorted_distances_of_atom(transition_state_xyz, h1_index) h2_bonds = sorted_distances_of_atom(transition_state_xyz, h2_index) @@ -1678,10 +1778,12 @@ def check_oxygen_bonds(bonds): return rel_error <= 0.1 return False - oxygen_has_valid_bonds = (oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds)) - h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}and h1_bonds[1][0] in {oxygen_index, b_index}) + oxygen_has_valid_bonds = oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds) + h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}) and ( + h1_bonds[1][0] in {oxygen_index, b_index} + ) h2_has_valid_bonds = h2_bonds[0][0] == oxygen_index return oxygen_has_valid_bonds and h1_has_valid_bonds and h2_has_valid_bonds -register_job_adapter('heuristics', HeuristicsAdapter) +register_job_adapter("heuristics", HeuristicsAdapter) diff --git a/arc/job/adapters/ts/heuristics_test.py b/arc/job/adapters/ts/heuristics_test.py index 250e10d852..fba89e9462 100644 --- a/arc/job/adapters/ts/heuristics_test.py +++ b/arc/job/adapters/ts/heuristics_test.py @@ -10,6 +10,8 @@ import os import shutil import unittest +from types import SimpleNamespace +from unittest.mock import patch from arc.common import ARC_TESTING_PATH, almost_equal_coords from arc.family import get_reaction_family_products @@ -31,6 +33,7 @@ check_dao_angle, check_ts_bonds, ) +from arc.job.adapters.ts.seed_hub import 
get_ts_seeds, get_wrapper_constraints from arc.reaction import ARCReaction from arc.species.converter import str_to_xyz, zmat_to_xyz, zmat_from_xyz from arc.species.species import ARCSpecies @@ -2258,5 +2261,61 @@ def tearDownClass(cls): shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'heuristics_1'), ignore_errors=True) +class TestHeuristicsHub(unittest.TestCase): + """Unit tests for shared heuristic seed and CREST-constraint helpers.""" + + def test_get_ts_seeds_h_abstraction(self): + rxn = SimpleNamespace(family='H_Abstraction') + with patch('arc.job.adapters.ts.heuristics.h_abstraction', + return_value=[{'xyz': {'symbols': ('H',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (1,)}, + 'method': 'Heuristics'}]): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics', dihedral_increment=60) + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'H_Abstraction') + self.assertEqual(seeds[0]['method'], 'Heuristics') + self.assertEqual(seeds[0]['source_adapter'], 'heuristics') + + def test_get_ts_seeds_hydrolysis(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = {'symbols': ('O',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (16,)} + with patch('arc.job.adapters.ts.heuristics.hydrolysis', + return_value=([xyz], ['carbonyl_based_hydrolysis'], [[0, 1, 2]])): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics') + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'carbonyl_based_hydrolysis') + self.assertEqual(seeds[0]['xyz'], xyz) + self.assertEqual(seeds[0]['metadata'], {'indices': [0, 1, 2]}) + + def test_get_wrapper_constraints_crest(self): + rxn = SimpleNamespace(family='H_Abstraction') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsInstance(atoms, dict) + self.assertSetEqual(set(atoms.keys()), {'A', 'H', 
'B'}) + self.assertTrue(all(isinstance(v, int) for v in atoms.values())) + + def test_get_wrapper_constraints_crest_unsupported_family(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsNone(atoms) + + def test_get_ts_seeds_unsupported_adapter(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_ts_seeds(reaction=rxn, base_adapter='gcn') + + def test_get_wrapper_constraints_unsupported_wrapper(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_wrapper_constraints(wrapper='foo_wrapper', reaction=rxn, seed={}) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/ts/seed_hub.py b/arc/job/adapters/ts/seed_hub.py new file mode 100644 index 0000000000..4a38254cdb --- /dev/null +++ b/arc/job/adapters/ts/seed_hub.py @@ -0,0 +1,168 @@ +""" +Shared TS-seed and wrapper-constraint hub. + +This module centralizes: +1. How TS seeds are requested from a base TS-search adapter. +2. How wrapper adapters (e.g., CREST) request family-specific constraints for a seed. +""" + +from typing import Dict, List, Optional + +from arc.common import get_logger +from arc.species.converter import xyz_to_dmat + +logger = get_logger() + + +def get_ts_seeds(reaction: 'ARCReaction', + base_adapter: str = 'heuristics', + dihedral_increment: Optional[int] = None, + ) -> List[dict]: + """ + Return TS seed entries from a base TS-search adapter. + + Seed schema: + - ``xyz`` (dict): Cartesian coordinates. + - ``family`` (str): The family associated with this seed. + - ``method`` (str): Human-readable generator label. + - ``source_adapter`` (str): Adapter id that generated the seed. 
+ - ``metadata`` (dict, optional): Adapter-specific auxiliary fields. + + Args: + reaction: The ARC reaction object. + base_adapter: The underlying TS-search adapter providing seeds. + dihedral_increment: Optional scan increment used by adapters that support it. + """ + adapter = (base_adapter or '').lower() + if adapter != 'heuristics': + raise ValueError(f'Unsupported TS seed base adapter: {base_adapter}') + + # Lazily import to avoid circular imports with heuristics.py. + from arc.job.adapters.ts.heuristics import FAMILY_SETS, h_abstraction, hydrolysis + + xyz_entries = list() + if reaction.family == 'H_Abstraction': + xyzs = h_abstraction(reaction=reaction, dihedral_increment=dihedral_increment) + for entry in xyzs: + xyz = entry.get('xyz') if isinstance(entry, dict) else entry + method = entry.get('method', 'Heuristics') if isinstance(entry, dict) else 'Heuristics' + if xyz is not None: + xyz_entries.append({ + 'xyz': xyz, + 'method': method, + 'family': reaction.family, + 'source_adapter': 'heuristics', + 'metadata': {}, + }) + elif reaction.family in FAMILY_SETS['hydrolysis_set_1'] or reaction.family in FAMILY_SETS['hydrolysis_set_2']: + try: + xyzs_raw, families, indices = hydrolysis(reaction=reaction) + xyz_entries = [{ + 'xyz': xyz, + 'method': 'Heuristics', + 'family': family, + 'source_adapter': 'heuristics', + 'metadata': {'indices': idx}, + } for xyz, family, idx in zip(xyzs_raw, families, indices)] + except ValueError: + xyz_entries = list() + return xyz_entries + + +def get_wrapper_constraints(wrapper: str, + reaction: 'ARCReaction', + seed: dict, + ) -> Optional[dict]: + """ + Return wrapper-specific constraints for a TS seed. + + Args: + wrapper: Wrapper adapter id (e.g., ``crest``). + reaction: The ARC reaction object. + seed: A seed entry returned by :func:`get_ts_seeds`. 
+ """ + wrapper_name = (wrapper or '').lower() + if wrapper_name != 'crest': + raise ValueError(f'Unsupported wrapper adapter: {wrapper}') + return _get_crest_constraints(reaction=reaction, seed=seed) + + +def _get_crest_constraints(reaction: 'ARCReaction', seed: dict) -> Optional[Dict[str, int]]: + """ + Return CREST constraints for a seed. + + Currently, only H_Abstraction is supported. + """ + family = seed.get('family') or reaction.family + xyz = seed.get('xyz') + if family != 'H_Abstraction' or xyz is None: + return None + return _get_h_abs_atoms_from_xyz(xyz) + + +def _get_h_abs_atoms_from_xyz(xyz: dict) -> Optional[Dict[str, int]]: + """ + Determine H-abstraction atoms from a TS guess. + + Returns: + Optional[Dict[str, int]]: ``{'H': int, 'A': int, 'B': int}``, or ``None``. + """ + symbols = xyz.get('symbols') if isinstance(xyz, dict) else None + if not symbols: + return None + dmat = xyz_to_dmat(xyz) + if dmat is None: + return None + + closest_atoms = dict() + for i in range(len(symbols)): + nearest = sorted( + ((dmat[i][j], j) for j in range(len(symbols)) if j != i), + key=lambda x: x[0], + )[:2] + closest_atoms[i] = [idx for _, idx in nearest] + + hydrogen_indices = [i for i, symbol in enumerate(symbols) if symbol.startswith('H')] + condition_occurrences = list() + + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + is_heavy_present = any(not symbols[atom].startswith('H') for atom in atom_neighbors) + if_hydrogen_present = any(symbols[atom].startswith('H') and atom != hydrogen_index for atom in atom_neighbors) + + if is_heavy_present and if_hydrogen_present: + condition_occurrences.append({'H': hydrogen_index, 'A': atom_neighbors[0], 'B': atom_neighbors[1]}) + + if condition_occurrences: + if len(condition_occurrences) > 1: + occurrence_distances = list() + for occurrence in condition_occurrences: + h_atom = occurrence['H'] + a_atom = occurrence['A'] + b_atom = occurrence['B'] + 
occurrence_distances.append((occurrence, dmat[h_atom][a_atom] + dmat[h_atom][b_atom])) + best_occurrence = min(occurrence_distances, key=lambda x: x[1])[0] + return {'H': best_occurrence['H'], 'A': best_occurrence['A'], 'B': best_occurrence['B']} + single_occurrence = condition_occurrences[0] + return {'H': single_occurrence['H'], 'A': single_occurrence['A'], 'B': single_occurrence['B']} + + min_distance = float('inf') + selected_hydrogen = None + selected_heavy_atoms = None + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + heavy_atoms = [atom for atom in atom_neighbors if not symbols[atom].startswith('H')] + if len(heavy_atoms) < 2: + continue + distances = dmat[hydrogen_index][heavy_atoms[0]] + dmat[hydrogen_index][heavy_atoms[1]] + if distances < min_distance: + min_distance = distances + selected_hydrogen = hydrogen_index + selected_heavy_atoms = heavy_atoms + + if selected_hydrogen is not None and selected_heavy_atoms is not None: + return {'H': selected_hydrogen, 'A': selected_heavy_atoms[0], 'B': selected_heavy_atoms[1]} + + logger.warning('No valid hydrogen atom found for CREST H-abstraction atoms.') + return None + From 432e594ef2371afb7c8efa5a880536a7069f33cc Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:24:40 +0200 Subject: [PATCH 04/60] Adds CREST documentation Adds documentation for the CREST adapter, including a guide for extending CREST-based TS workflows and minimal usage examples. The documentation covers current family support, external references, and extension instructions for adding new families to CREST or enabling CREST to wrap a new TS seed adapter. Also includes seed schema contract. 
--- docs/source/TS_search.rst | 86 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/docs/source/TS_search.rst b/docs/source/TS_search.rst index 73513c9afb..21c6315ddb 100644 --- a/docs/source/TS_search.rst +++ b/docs/source/TS_search.rst @@ -54,4 +54,90 @@ A detailed description of the methodology, design choices, and validation benchm L. Fahoum, A. Grinberg Dana, *“Automated Reaction Transition State Search for Neutral Hydrolysis Reactions”*, Digital Discovery, 2026. +CREST +^^^^^ + +CREST is an external conformational sampling tool used by ARC as a TS-search wrapper stage. +In ARC's current flow, CREST is applied to TS seeds generated by base TS search methods and uses +family-specific constraints from ARC. + +Current ARC family support for CREST: + +- ``H_Abstraction`` only (RMG family reference: + `H_Abstraction `_). + +External references: + +- `CREST documentation `_ +- `CREST constrained sampling example `_ + +Wrapper Extension Guide +""""""""""""""""""""""" + +Use this guide when extending CREST-based TS workflows in ARC (for example, adding hydrolysis support to CREST, +or allowing CREST to wrap a new TS seed source adapter). + +ARC uses a neutral wrapper hub API for TS seed generation and wrapper-specific constraints: + +- ``arc.job.adapters.ts.seed_hub.get_ts_seeds(...)`` +- ``arc.job.adapters.ts.seed_hub.get_wrapper_constraints(...)`` + +Current status +"""""""""""""" + +- ``CrestAdapter`` requests seeds using ``base_adapter='heuristics'``. +- ``CrestAdapter`` requests constraints using ``wrapper='crest'``. +- CREST constraints are currently implemented for ``H_Abstraction`` only. +- Hydrolysis seeds can be generated by heuristics, but CREST constraints for hydrolysis are not implemented yet. + +Seed schema contract +"""""""""""""""""""" + +``get_ts_seeds(...)`` returns a list of seed dictionaries with the following fields: + +- ``xyz``: Cartesian coordinates dictionary. 
+- ``family``: Reaction family associated with the seed. +- ``method``: Method label for provenance. +- ``source_adapter``: TS-search adapter id that generated the seed. +- ``metadata``: Optional adapter-specific metadata dictionary. + +Extension instructions: Add a new family to CREST +""""""""""""""""""""""""""""""""""""""""""""""""" + +1. Update ``get_ts_seeds(...)`` logic in ``arc/job/adapters/ts/seed_hub.py`` only if the seed generation path changes. +2. Add family-specific CREST constraints in ``_get_crest_constraints(...)`` (or family helper it calls) in + ``arc/job/adapters/ts/seed_hub.py``. +3. Add/update tests in ``arc/job/adapters/ts/heuristics_test.py`` (``TestHeuristicsHub``). +4. Update ``ts_adapters_by_rmg_family`` mapping if CREST should be enabled for that family. + +Extension instructions: Let CREST wrap a new TS seed adapter +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +1. Add a ``base_adapter`` branch in ``get_ts_seeds(...)``. +2. Ensure the returned seed objects satisfy the seed schema contract. +3. Reuse ``get_wrapper_constraints(wrapper='crest', ...)`` with those seeds. +4. Add tests for the new adapter branch and constraints compatibility. + +Minimal usage pattern +""""""""""""""""""""" + +.. code-block:: python + + from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints + + seeds = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=30, + ) + for seed in seeds: + crest_constraints = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=seed, + ) + if crest_constraints is None: + continue + # run CREST with crest_constraints["A"], crest_constraints["H"], crest_constraints["B"] + .. include:: links.txt From 5d1612df5e1b0376a888c2abb6b78eba82150785 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:25:00 +0200 Subject: [PATCH 05/60] Normalizes TSGuess method sources Introduces a `method_sources` attribute to the TSGuess class. 
This attribute stores all methods that produced an equivalent xyz guess. Normalizes the method sources to a unique, ordered, lowercase list, ensuring consistency and avoiding duplicates. This allows for better tracking of the origins of TS guesses and simplifies the clustering logic. --- arc/species/species.py | 31 ++++++++++++++++++++++++++++--- arc/species/species_test.py | 11 +++++++---- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/arc/species/species.py b/arc/species/species.py index 0fe014d080..f453882436 100644 --- a/arc/species/species.py +++ b/arc/species/species.py @@ -1556,10 +1556,11 @@ def cluster_tsgs(self): for tsg in self.ts_guesses: for cluster_tsg in cluster_tsgs: if cluster_tsg.almost_equal_tsgs(tsg): + logger.debug(f"Similar TSGuesses found: {tsg.index} is similar to {cluster_tsg.index}") cluster_tsg.cluster.append(tsg.index) - if tsg.method not in cluster_tsg.method: - cluster_tsg.method += f' + {tsg.method}' - cluster_tsg.execution_time = f'{cluster_tsg.execution_time} + {tsg.execution_time}' + cluster_tsg.method_sources = TSGuess._normalize_method_sources( + (cluster_tsg.method_sources or []) + (tsg.method_sources or []) + ) break else: tsg.cluster = [tsg.index] @@ -2193,6 +2194,7 @@ class TSGuess(object): initial_xyz (dict): The 3D coordinates guess. opt_xyz (dict): The 3D coordinates after optimization at the ts_guesses level. method (str): The method/source used for the xyz guess. + method_sources (List[str]): All methods/sources that produced an equivalent xyz guess. method_index (int): A subindex, used for cases where a single method generates several guesses. Counts separately for each direction, 'F' and 'R'. method_direction (str): The reaction direction used for generating the guess ('F' or 'R'). 
@@ -2237,6 +2239,7 @@ def __init__(self, # Not reading from a dictionary self.index = index self.method = method.lower() if method is not None else 'user guess' + self.method_sources = self._normalize_method_sources([self.method]) self.method_index = method_index self.method_direction = method_direction self.constraints = constraints @@ -2293,6 +2296,22 @@ def opt_xyz(self, value): """Allow setting the initial coordinate guess""" self._opt_xyz = check_xyz_dict(value) + @staticmethod + def _normalize_method_sources(method_sources: Optional[List[str]]) -> List[str]: + """ + Normalize method_sources to a unique, ordered, lowercase list. + """ + if not method_sources: + return [] + normalized = [] + for method in method_sources: + if method is None: + continue + method = method.lower() + if method not in normalized: + normalized.append(method) + return normalized + def as_dict(self, for_report: bool = False) -> dict: """ A helper function for dumping this object as a dictionary. @@ -2306,6 +2325,8 @@ def as_dict(self, for_report: bool = False) -> dict: """ ts_dict = dict() ts_dict['method'] = self.method + if self.method_sources: + ts_dict['method_sources'] = list(self.method_sources) ts_dict['method_index'] = self.method_index if self.method_direction is not None: ts_dict['method_direction'] = self.method_direction @@ -2354,6 +2375,10 @@ def from_dict(self, ts_dict: dict): and isinstance(ts_dict['execution_time'], str) \ else ts_dict['execution_time'] if 'execution_time' in ts_dict else None self.method = ts_dict['method'].lower() if 'method' in ts_dict else 'user guess' + if 'method_sources' in ts_dict and isinstance(ts_dict['method_sources'], list): + self.method_sources = self._normalize_method_sources(ts_dict['method_sources']) + else: + self.method_sources = self._normalize_method_sources([self.method]) self.method_index = ts_dict['method_index'] if 'method_index' in ts_dict else None self.method_direction = ts_dict['method_direction'] if 'method_direction' in 
ts_dict else None self.imaginary_freqs = ts_dict['imaginary_freqs'] if 'imaginary_freqs' in ts_dict else None diff --git a/arc/species/species_test.py b/arc/species/species_test.py index 8074dd8c96..ebb300ea64 100644 --- a/arc/species/species_test.py +++ b/arc/species/species_test.py @@ -2225,8 +2225,9 @@ def test_cluster_tsgs(self): self.assertEqual(len(spc_1.ts_guesses), 4) spc_1.cluster_tsgs() self.assertEqual(len(spc_1.ts_guesses), 2) - self.assertEqual(spc_1.ts_guesses[0].method, 'user guess 0 + kinbot') - self.assertEqual(spc_1.ts_guesses[0].execution_time, '00:00:02 + 00:00:02') + self.assertEqual(spc_1.ts_guesses[0].method, 'user guess 0') + self.assertEqual(spc_1.ts_guesses[0].method_sources, ['user guess 0', 'kinbot']) + self.assertEqual(spc_1.ts_guesses[0].execution_time, '00:00:02') self.assertEqual(spc_1.ts_guesses[0].index, 0) self.assertEqual(spc_1.ts_guesses[1].method, 'gcn') self.assertEqual(spc_1.ts_guesses[1].execution_time, '00:00:02') @@ -2888,6 +2889,7 @@ def test_as_dict(self): """Test TSGuess.as_dict()""" tsg_dict = self.tsg1.as_dict() expected_dict = {'method': 'autotst', + 'method_sources': ['autotst'], 'conformer_index': None, 'family': 'H_Abstraction', 'index': None, @@ -2906,9 +2908,10 @@ def test_from_dict(self): ts_dict = self.tsg1.as_dict() tsg = TSGuess(ts_dict=ts_dict) self.assertEqual(tsg.method, 'autotst') + self.assertEqual(tsg.method_sources, ['autotst']) ts_dict_for_report = self.tsg1.as_dict(for_report=True) - self.assertEqual(list(ts_dict_for_report.keys()), ['method', 'method_index', 'success', 'index', - 'conformer_index', 'initial_xyz', 'opt_xyz']) + self.assertEqual(list(ts_dict_for_report.keys()), ['method', 'method_sources', 'method_index', 'success', + 'index', 'conformer_index', 'initial_xyz', 'opt_xyz']) def test_process_xyz(self): """Test the process_xyz() method""" From 8891252755cc5511bd6992cc9d5281a33857ca59 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:25:59 +0200 Subject: [PATCH 06/60] 
Adds CREST to available TS search methods Adds 'crest' as a valid option for transition state search methods. Adds a new job to the test suite for coverage of the wall time exceeded functionality. Updates testing path for wall_exceeded fixture Updates the path used to locate the `wall_exceeded.txt` fixture in the `TestJobAdapter` test class. This ensures that the test can correctly access the fixture file, regardless of the execution environment. --- arc/job/adapter.py | 1 + arc/job/adapter_test.py | 6 +++ .../calcs/Species/spc1/spc1/input.gjf | 12 ------ .../calcs/Species/spc1/spc1/submit.sub | 37 ------------------- .../spc1/err.txt => trsh/wall_exceeded.txt} | 0 5 files changed, 7 insertions(+), 49 deletions(-) delete mode 100644 arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/input.gjf delete mode 100644 arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/submit.sub rename arc/testing/{test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/err.txt => trsh/wall_exceeded.txt} (100%) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index fbef435827..5aeaba802b 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -97,6 +97,7 @@ class JobEnum(str, Enum): # TS search methods autotst = 'autotst' # AutoTST, 10.1021/acs.jpca.7b07361, 10.26434/chemrxiv.13277870.v2 heuristics = 'heuristics' # ARC's heuristics + crest = 'crest' # CREST conformer/TS search kinbot = 'kinbot' # KinBot, 10.1016/j.cpc.2019.106947 gcn = 'gcn' # Graph neural network for isomerization, https://doi.org/10.1021/acs.jpclett.0c00500 user = 'user' # user guesses diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 2df3fc1d26..939c7753c1 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -207,6 +207,12 @@ def setUpClass(cls): server='server3', testing=True, ) + os.makedirs(cls.job_5.local_path, exist_ok=True) + fixture_path = os.path.join(ARC_TESTING_PATH, 'trsh', 'wall_exceeded.txt') + with open(fixture_path, 'r') as f: 
+ log_content = f.read() + with open(os.path.join(cls.job_5.local_path, 'out.txt'), 'w') as f: + f.write(log_content) cls.job_6 = GaussianAdapter(execution_type='queue', job_name='spc1', job_type='opt', diff --git a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/input.gjf b/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/input.gjf deleted file mode 100644 index 36f9d855ac..0000000000 --- a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/input.gjf +++ /dev/null @@ -1,12 +0,0 @@ -%chk=check.chk -%mem=14336mb -%NProcShared=8 - -#P opt=(calcfc) cbs-qb3 IOp(2/9=2000) - -spc1 - -0 3 -O 0.00000000 0.00000000 1.00000000 - - diff --git a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/submit.sub b/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/submit.sub deleted file mode 100644 index 00b840cd67..0000000000 --- a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/submit.sub +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -l -#SBATCH -p normal -#SBATCH -J server1 -#SBATCH -N 1 -#SBATCH -n 8 -#SBATCH --time=120:00:00 -#SBATCH --mem-per-cpu=15770 -#SBATCH -o out.txt -#SBATCH -e err.txt - -export g16root=/home/gridsan/groups/GRPAPI/Software -export PATH=$g16root/g16/:$g16root/gv:$PATH -which g16 - -echo "============================================================" -echo "Job ID : $SLURM_JOB_ID" -echo "Job Name : $SLURM_JOB_NAME" -echo "Starting on : $(date)" -echo "Running on node : $SLURMD_NODENAME" -echo "Current directory : $(pwd)" -echo "============================================================" - -touch initial_time - -GAUSS_SCRDIR=/state/partition1/user//$SLURM_JOB_NAME-$SLURM_JOB_ID -export $GAUSS_SCRDIR -. 
$g16root/g16/bsd/g16.profile - -mkdir -p $GAUSS_SCRDIR - -g16 < input.gjf > input.log - -rm -rf $GAUSS_SCRDIR - -touch final_time - - \ No newline at end of file diff --git a/arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/err.txt b/arc/testing/trsh/wall_exceeded.txt similarity index 100% rename from arc/testing/test_JobAdapter_ServerTimeLimit/calcs/Species/spc1/spc1/err.txt rename to arc/testing/trsh/wall_exceeded.txt From 146a145d5c7f4fdf53ffd45ab85c3f31860b3b50 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:26:14 +0200 Subject: [PATCH 07/60] Adds CREST as TS adapter option Includes CREST as a valid TS adapter option for H_Abstraction reactions. This allows users to utilize CREST for transition state searches, expanding the available methods. --- arc/job/adapters/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index 8fb331522c..0256a300bf 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -41,7 +41,7 @@ 'Cyclic_Ether_Formation': ['kinbot'], 'Cyclopentadiene_scission': ['gcn', 'xtb_gsm'], 'Diels_alder_addition': ['kinbot'], - 'H_Abstraction': ['heuristics', 'autotst'], + 'H_Abstraction': ['heuristics', 'autotst', 'crest'], 'carbonyl_based_hydrolysis': ['heuristics'], 'ether_hydrolysis': ['heuristics'], 'nitrile_hydrolysis': ['heuristics'], @@ -77,7 +77,8 @@ adapters_that_do_not_require_a_level_arg = ['xtb', 'torchani'] # Default is "queue", "pipe" will be called whenever needed. So just list 'incore'. 
-default_incore_adapters = ['autotst', 'gcn', 'heuristics', 'kinbot', 'psi4', 'xtb', 'xtb_gsm', 'torchani', 'openbabel'] +default_incore_adapters = ['autotst', 'crest', 'gcn', 'heuristics', 'kinbot', 'psi4', 'xtb', 'xtb_gsm', 'torchani', + 'openbabel'] def _initialize_adapter(obj: 'JobAdapter', From abd9201b6bb9696767121382715296b4c732c705 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:26:25 +0200 Subject: [PATCH 08/60] Adds function to reorder XYZ strings. Adds a function to reorder and convert XYZ strings between ``ATOM X Y Z`` and ``X Y Z ATOM`` formats, with optional unit conversion. Also adds a backwards-compatible wrapper with a deprecation warning. --- arc/species/converter.py | 98 +++++++++++++++++++++++++++++++++++ arc/species/converter_test.py | 32 ++++++++++++ 2 files changed, 130 insertions(+) diff --git a/arc/species/converter.py b/arc/species/converter.py index ce0d484541..9d19a4b13d 100644 --- a/arc/species/converter.py +++ b/arc/species/converter.py @@ -5,6 +5,7 @@ import math import numpy as np import os +import warnings from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union from ase import Atoms @@ -48,6 +49,103 @@ DIST_PRECISION = 0.01 # Angstrom ANGL_PRECISION = 0.1 # rad (for both bond angle and dihedral) +def reorder_xyz_string(xyz_str: str, + reverse_atoms: bool = False, + units: str = 'angstrom', + convert_to: str = 'angstrom', + project_directory: Optional[str] = None + ) -> str: + """ + Reorder an XYZ string between ``ATOM X Y Z`` and ``X Y Z ATOM`` with optional unit conversion. + + Args: + xyz_str (str): The string xyz format to be converted. + reverse_atoms (bool, optional): Whether to reverse the atoms and coordinates. + units (str, optional): Units of the input coordinates ('angstrom' or 'bohr'). + convert_to (str, optional): The units to convert to (either 'angstrom' or 'bohr'). + project_directory (str, optional): The path to the project directory. 
+ + Raises: + ConverterError: If xyz_str is not a string or does not have four space-separated entries per non-empty line. + + Returns: str + The converted string xyz format. + """ + if isinstance(xyz_str, tuple): + xyz_str = '\n'.join(xyz_str) + if isinstance(xyz_str, list): + xyz_str = '\n'.join(xyz_str) + if not isinstance(xyz_str, str): + raise ConverterError(f'Expected a string input, got {type(xyz_str)}') + if project_directory is not None: + file_path = os.path.join(project_directory, xyz_str) + if os.path.isfile(file_path): + with open(file_path, 'r') as f: + xyz_str = f.read() + + + if units.lower() == 'angstrom' and convert_to.lower() == 'angstrom': + conversion_factor = 1 + elif units.lower() == 'bohr' and convert_to.lower() == 'bohr': + conversion_factor = 1 + elif units.lower() == 'angstrom' and convert_to.lower() == 'bohr': + conversion_factor = constants.angstrom_to_bohr + elif units.lower() == 'bohr' and convert_to.lower() == 'angstrom': + conversion_factor = constants.bohr_to_angstrom + else: + raise ConverterError("Invalid target unit. 
Choose 'angstrom' or 'bohr'.") + + processed_lines = list() + # Split the string into lines + lxyz = xyz_str.strip().splitlines() + # Determine whether the atom label appears first or last in each line + first_line_tokens = lxyz[0].strip().split() + atom_first = not is_str_float(first_line_tokens[0]) + + for item in lxyz: + parts = item.strip().split() + + if len(parts) != 4: + raise ConverterError(f'xyz_str has an incorrect format, expected 4 elements in each line, ' + f'got "{item}" in:\n{xyz_str}') + if atom_first: + atom, x_str, y_str, z_str = parts + else: + x_str, y_str, z_str, atom = parts + + try: + x = float(x_str) * conversion_factor + y = float(y_str) * conversion_factor + z = float(z_str) * conversion_factor + + except ValueError as e: + raise ConverterError(f'Could not convert {x_str}, {y_str}, or {z_str} to floats.') from e + + if reverse_atoms and atom_first: + formatted_line = f'{x} {y} {z} {atom}' + elif reverse_atoms and not atom_first: + formatted_line = f'{atom} {x} {y} {z}' + elif not reverse_atoms and atom_first: + formatted_line = f'{atom} {x} {y} {z}' + elif not reverse_atoms and not atom_first: + formatted_line = f'{x} {y} {z} {atom}' + + processed_lines.append(formatted_line) + + return '\n'.join(processed_lines) + + +def str_to_str(*args, **kwargs) -> str: + """ + Backwards compatible wrapper for reorder_xyz_string. 
+ """ + warnings.warn( + "str_to_str was renamed to reorder_xyz_string and will be removed in a future ARC release", + DeprecationWarning, + stacklevel=2, + ) + return reorder_xyz_string(*args, **kwargs) + def str_to_xyz(xyz_str: str, project_directory: Optional[str] = None, diff --git a/arc/species/converter_test.py b/arc/species/converter_test.py index aa881bdcac..f423c4d500 100644 --- a/arc/species/converter_test.py +++ b/arc/species/converter_test.py @@ -18,6 +18,7 @@ import arc.species.converter as converter from arc.common import ARC_PATH, ARC_TESTING_PATH, almost_equal_coords, almost_equal_coords_lists, almost_equal_lists +from arc.constants import angstrom_to_bohr from arc.exceptions import ConverterError from arc.molecule.molecule import Molecule from arc.species.perceive import perceive_molecule_from_xyz @@ -700,6 +701,37 @@ def test_str_to_xyz(self): xyz = converter.str_to_xyz(xyz_format) self.assertEqual(xyz, expected_xyz) + def test_reorder_xyz_string_atom_first(self): + """Test reordering atom-first XYZ strings with unit conversion""" + xyz_format = "C 0.0 1.0 2.0\nH -1.0 0.5 0.0" + converted = converter.reorder_xyz_string(xyz_str=xyz_format, reverse_atoms=True, convert_to="bohr") + converted_lines = converted.splitlines() + self.assertEqual(len(converted_lines), 2) + + x1, y1, z1, s1 = converted_lines[0].split() + self.assertEqual(s1, "C") + self.assertAlmostEqual(float(x1), 0.0) + self.assertAlmostEqual(float(y1), 1.0 * angstrom_to_bohr) + self.assertAlmostEqual(float(z1), 2.0 * angstrom_to_bohr) + + x2, y2, z2, s2 = converted_lines[1].split() + self.assertEqual(s2, "H") + self.assertAlmostEqual(float(x2), -1.0 * angstrom_to_bohr) + self.assertAlmostEqual(float(y2), 0.5 * angstrom_to_bohr) + self.assertAlmostEqual(float(z2), 0.0) + + def test_reorder_xyz_string_coordinate_first(self): + """Test reordering coordinate-first XYZ strings back to atom-last order with conversion""" + xyz_format = "0.0 0.0 0.0 N\n1.0 0.0 0.0 H" + converted = 
converter.reorder_xyz_string( + xyz_str=xyz_format, + reverse_atoms=False, + units="bohr", + convert_to="angstrom", + ) + expected = "0.0 0.0 0.0 N\n0.529177 0.0 0.0 H" + self.assertEqual(converted, expected) + def test_xyz_to_str(self): """Test converting an ARC xyz format to a string xyz format""" xyz_str1 = converter.xyz_to_str(xyz_dict=self.xyz1['dict']) From 87f3c51acd5c0a7877f18796117283c2a95821d0 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 9 Feb 2026 13:26:37 +0200 Subject: [PATCH 09/60] Fixes restart tests for parallel execution Modifies restart tests to generate unique project names when running in parallel using pytest-xdist. This avoids collisions during cleanup of project directories. Updates restart tests to use ARC_TESTING_PATH Modifies restart tests to utilize the ARC_TESTING_PATH constant for specifying test directories. This change ensures consistency and simplifies path management within the testing framework. --- functional/restart_test.py | 48 ++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/functional/restart_test.py b/functional/restart_test.py index d49c2e945c..3437d384a7 100644 --- a/functional/restart_test.py +++ b/functional/restart_test.py @@ -12,10 +12,18 @@ from arc.molecule.molecule import Molecule -from arc.common import ARC_PATH, read_yaml_file +from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file from arc.main import ARC +def _project_name(base: str) -> str: + """Return a per-xdist-worker project name to avoid parallel cleanup collisions.""" + worker_id = os.environ.get('PYTEST_XDIST_WORKER') + if worker_id: + return f'{base}_{worker_id}' + return base + + class TestRestart(unittest.TestCase): """ Contains unit tests for restarting ARC. @@ -34,9 +42,9 @@ def test_restart_thermo(self): Test restarting ARC through the ARC class in main.py via the input_dict argument of the API Rather than through ARC.py. 
Check that all files are in place and the log file content. """ - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '1_restart_thermo') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '1_restart_thermo') restart_path = os.path.join(restart_dir, 'restart.yml') - project = 'arc_project_for_testing_delete_after_usage_restart_thermo' + project = _project_name('arc_project_for_testing_delete_after_usage_restart_thermo') project_directory = os.path.join(ARC_PATH, 'Projects', project) os.makedirs(os.path.dirname(project_directory), exist_ok=True) shutil.copytree(os.path.join(restart_dir, 'calcs'), os.path.join(project_directory, 'calcs', 'Species'), dirs_exist_ok=True) @@ -55,7 +63,7 @@ def test_restart_thermo(self): break self.assertTrue(thermo_dft_ccsdtf12_bac) - with open(os.path.join(project_directory, 'arc_project_for_testing_delete_after_usage_restart_thermo.info'), 'r') as f: + with open(os.path.join(project_directory, f'{project}.info'), 'r') as f: sts, n2h3, oet, lot, ap = False, False, False, False, False for line in f.readlines(): if 'Considered the following species and TSs:' in line: @@ -66,7 +74,7 @@ def test_restart_thermo(self): oet = True elif 'Levels of theory used:' in line: lot = True - elif 'ARC project arc_project_for_testing_delete_after_usage_restart_thermo' in line: + elif f'ARC project {project}' in line: ap = True self.assertTrue(sts) self.assertTrue(n2h3) @@ -131,9 +139,9 @@ def test_restart_thermo(self): def test_restart_rate_1(self): """Test restarting ARC and attaining a reaction rate coefficient""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '2_restart_rate') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '2_restart_rate') restart_path = os.path.join(restart_dir, 'restart.yml') - project = 'arc_project_for_testing_delete_after_usage_restart_rate_1' + project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_1') project_directory = os.path.join(ARC_PATH, 
'Projects', project) os.makedirs(os.path.dirname(project_directory), exist_ok=True) shutil.copytree(os.path.join(restart_dir, 'calcs'), os.path.join(project_directory, 'calcs'), dirs_exist_ok=True) @@ -154,9 +162,9 @@ def test_restart_rate_1(self): def test_restart_rate_2(self): """Test restarting ARC and attaining a reaction rate coefficient""" - project = 'arc_project_for_testing_delete_after_usage_restart_rate_2' + project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_2') project_directory = os.path.join(ARC_PATH, 'Projects', project) - base_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '5_TS1') + base_path = os.path.join(ARC_TESTING_PATH, 'restart', '5_TS1') restart_path = os.path.join(base_path, 'restart.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['output']['TS0']['paths']['freq'] = os.path.join(ARC_PATH, input_dict['output']['TS0']['paths']['freq']) @@ -181,9 +189,9 @@ def test_restart_rate_2(self): def test_restart_bde (self): """Test restarting ARC and attaining a BDE for anilino_radical.""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '3_restart_bde') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '3_restart_bde') restart_path = os.path.join(restart_dir, 'restart.yml') - project = 'test_restart_bde' + project = _project_name('test_restart_bde') project_directory = os.path.join(ARC_PATH, 'Projects', project) os.makedirs(os.path.dirname(project_directory), exist_ok=True) shutil.copytree(os.path.join(restart_dir, 'calcs'), os.path.join(project_directory, 'calcs'), dirs_exist_ok=True) @@ -192,7 +200,7 @@ def test_restart_bde (self): arc1 = ARC(**input_dict) arc1.execute() - report_path = os.path.join(ARC_PATH, 'Projects', 'test_restart_bde', 'output', 'BDE_report.txt') + report_path = os.path.join(ARC_PATH, 'Projects', project, 'output', 'BDE_report.txt') with open(report_path, 'r') as f: lines = f.readlines() self.assertIn(' BDE 
report for anilino_radical:\n', lines) @@ -200,7 +208,7 @@ def test_restart_bde (self): def test_globalize_paths(self): """Test modifying a YAML file's contents to correct absolute file paths""" - project_directory = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths') + project_directory = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths') restart_path = os.path.join(project_directory, 'restart_paths.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['project_directory'] = project_directory @@ -218,25 +226,25 @@ def tearDownClass(cls): A function that is run ONCE after all unit tests in this class. Delete all project directories created during these unit tests """ - projects = ['arc_project_for_testing_delete_after_usage_restart_thermo', - 'arc_project_for_testing_delete_after_usage_restart_rate_1', - 'arc_project_for_testing_delete_after_usage_restart_rate_2', - 'test_restart_bde', + projects = [_project_name('arc_project_for_testing_delete_after_usage_restart_thermo'), + _project_name('arc_project_for_testing_delete_after_usage_restart_rate_1'), + _project_name('arc_project_for_testing_delete_after_usage_restart_rate_2'), + _project_name('test_restart_bde'), ] for project in projects: project_directory = os.path.join(ARC_PATH, 'Projects', project) shutil.rmtree(project_directory, ignore_errors=True) - shutil.rmtree(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', + shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', 'log_and_restart_archive'), ignore_errors=True) for file_name in ['arc.log', 'restart_paths_globalized.yml']: - file_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', file_name) + file_path = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', file_name) if os.path.isfile(file_path): os.remove(file_path) file_paths = [os.path.join(ARC_PATH, 'functional', 'nul'), 
os.path.join(ARC_PATH, 'functional', 'run.out')] project_names = ['1_restart_thermo', '2_restart_rate', '3_restart_bde', '5_TS1'] for project_name in project_names: - file_paths.append(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', project_name, 'restart_globalized.yml')) + file_paths.append(os.path.join(ARC_TESTING_PATH, 'restart', project_name, 'restart_globalized.yml')) for file_path in file_paths: if os.path.isfile(file_path): os.remove(file_path) From 0baa098ce648af79dc33abb4132b7050857bb1fd Mon Sep 17 00:00:00 2001 From: Alon Grinberg Dana Date: Fri, 3 Apr 2026 15:23:08 +0300 Subject: [PATCH 10/60] Added the job pipe sub-module with state, coordinate, planner and run for pipe --- arc/job/pipe/__init__.py | 9 + arc/job/pipe/pipe_coordinator.py | 205 +++++++ arc/job/pipe/pipe_coordinator_test.py | 236 ++++++++ arc/job/pipe/pipe_planner.py | 312 ++++++++++ arc/job/pipe/pipe_planner_test.py | 278 +++++++++ arc/job/pipe/pipe_run.py | 826 ++++++++++++++++++++++++++ arc/job/pipe/pipe_run_test.py | 416 +++++++++++++ arc/job/pipe/pipe_state.py | 551 +++++++++++++++++ arc/job/pipe/pipe_state_test.py | 290 +++++++++ 9 files changed, 3123 insertions(+) create mode 100644 arc/job/pipe/__init__.py create mode 100644 arc/job/pipe/pipe_coordinator.py create mode 100644 arc/job/pipe/pipe_coordinator_test.py create mode 100644 arc/job/pipe/pipe_planner.py create mode 100644 arc/job/pipe/pipe_planner_test.py create mode 100644 arc/job/pipe/pipe_run.py create mode 100644 arc/job/pipe/pipe_run_test.py create mode 100644 arc/job/pipe/pipe_state.py create mode 100644 arc/job/pipe/pipe_state_test.py diff --git a/arc/job/pipe/__init__.py b/arc/job/pipe/__init__.py new file mode 100644 index 0000000000..88934eb54c --- /dev/null +++ b/arc/job/pipe/__init__.py @@ -0,0 +1,9 @@ +""" +ARC pipe subpackage — distributed HPC execution via job arrays. 
+ +Submodules: + - ``pipe_state``: task/run state primitives, data models, file-level locking + - ``pipe_run``: PipeRun orchestrator, task builders, ingestion helpers + - ``pipe_coordinator``: active pipe lifecycle management (eligibility, submission, polling, ingestion) + - ``pipe_planner``: family-specific routing from ARC objects to pipe task batches +""" diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py new file mode 100644 index 0000000000..1efab8be65 --- /dev/null +++ b/arc/job/pipe/pipe_coordinator.py @@ -0,0 +1,205 @@ +""" +Pipe run lifecycle coordinator. + +Manages the active pipe run registry, eligibility checks, submission, +reconstruction, polling, resubmission, and ingestion dispatch. + +This module owns the lifecycle of pipe runs once they are created. +Family-specific task planning lives in ``pipe_planner.py``. +""" + +import time +from typing import TYPE_CHECKING, Dict, List + +from arc.common import get_logger +from arc.imports import settings + +from arc.job.pipe.pipe_run import PipeRun, ingest_completed_task +from arc.job.pipe.pipe_state import PipeRunState, TaskState, TaskSpec, read_task_state + +if TYPE_CHECKING: + from arc.scheduler import Scheduler + +logger = get_logger() + +pipe_settings = settings['pipe_settings'] + + +class PipeCoordinator: + """ + Manages the lifecycle of active pipe runs for a Scheduler instance. + + Owns: + - pipe eligibility checks + - run creation / submission / reconstruction + - polling / resubmission + - terminal ingestion dispatch + + Args: + sched: The owning Scheduler instance, providing ``project_directory``, + ``species_dict``, and ``output``. + """ + + def __init__(self, sched: 'Scheduler'): + self.sched = sched + self.active_pipes: Dict[str, PipeRun] = {} + self._pipe_poll_failures: Dict[str, int] = {} + + def should_use_pipe(self, tasks: List[TaskSpec]) -> bool: + """ + Determine whether a list of tasks is eligible for pipe-mode execution. + + Returns ``True`` only if: + 1. 
Pipe mode is enabled. + 2. There are at least ``min_tasks`` tasks. + 3. All tasks are homogeneous in engine, task_family, owner_type, + level, required_cores, and required_memory_mb. + """ + if not pipe_settings.get('enabled', True): + return False + if not tasks: + return False + min_tasks = pipe_settings.get('min_tasks', 10) + if len(tasks) < min_tasks: + return False + ref = tasks[0] + return all(t.engine == ref.engine + and t.task_family == ref.task_family + and t.owner_type == ref.owner_type + and t.level == ref.level + and t.required_cores == ref.required_cores + and t.required_memory_mb == ref.required_memory_mb + for t in tasks[1:]) + + def submit_pipe_run(self, run_id: str, tasks: List[TaskSpec], + cluster_software: str = 'slurm') -> PipeRun: + """ + Create, stage, and register a new pipe run. + + Attempts to write a submit script and submit the array job. + On submission failure, the run is still registered as STAGED. + + Returns: + PipeRun: The created pipe run. + """ + pipe = PipeRun( + project_directory=self.sched.project_directory, + run_id=run_id, + tasks=tasks, + cluster_software=cluster_software, + max_workers=pipe_settings.get('max_workers', 100), + max_attempts=pipe_settings.get('max_attempts', 3), + ) + pipe.stage() + try: + pipe.write_submit_script() + except NotImplementedError: + logger.warning(f'Pipe run {run_id}: submit script generation not yet implemented ' + f'for {cluster_software}. Tasks are staged but must be submitted manually.') + self.active_pipes[run_id] = pipe + return pipe + try: + job_status, job_id = pipe.submit_to_scheduler() + if job_status == 'submitted' and job_id: + pipe.scheduler_job_id = job_id + pipe.status = PipeRunState.SUBMITTED + pipe.submitted_at = time.time() + pipe._save_run_metadata() + logger.info(f'Pipe run {run_id} submitted as job {job_id}.') + else: + logger.warning(f'Pipe run {run_id}: submission returned status={job_status}. 
' + f'Tasks are staged at {pipe.pipe_root}.') + except Exception as e: + logger.warning(f'Pipe run {run_id}: submission failed ({e}). ' + f'Tasks are staged at {pipe.pipe_root} but not running.') + self.active_pipes[run_id] = pipe + return pipe + + def register_pipe_run_from_dir(self, pipe_root: str) -> PipeRun: + """Reconstruct and register an existing pipe run from disk.""" + pipe = PipeRun.from_dir(pipe_root) + self.active_pipes[pipe.run_id] = pipe + return pipe + + def poll_pipes(self) -> None: + """ + Reconcile all active pipe runs. + + Detects orphans, schedules retries, resubmits if needed, ingests + terminal runs, and removes completed/failed runs from the registry. + + Tolerates up to 3 consecutive reconciliation failures per run before + marking it as FAILED and removing it. + """ + max_consecutive_failures = 3 + for run_id in list(self.active_pipes.keys()): + pipe = self.active_pipes[run_id] + try: + counts = pipe.reconcile() + except Exception: + n_failures = self._pipe_poll_failures.get(run_id, 0) + 1 + self._pipe_poll_failures[run_id] = n_failures + logger.error(f'Pipe run {run_id}: reconciliation failed ' + f'({n_failures}/{max_consecutive_failures})', exc_info=True) + if n_failures >= max_consecutive_failures: + logger.error(f'Pipe run {run_id}: {max_consecutive_failures} consecutive polling failures. 
' + f'Marking as FAILED and removing from active pipes.') + try: + pipe.status = PipeRunState.FAILED + pipe._save_run_metadata() + except Exception as e: + logger.debug(f'Pipe run {run_id}: best-effort FAILED persist failed: {e}') + del self.active_pipes[run_id] + self._pipe_poll_failures.pop(run_id, None) + continue + self._pipe_poll_failures.pop(run_id, None) + summary = ', '.join(f'{state}: {n}' for state, n in sorted(counts.items()) if n > 0) + logger.info(f'Pipe run {run_id}: {summary}') + if pipe.needs_resubmission: + logger.info(f'Pipe run {run_id}: resubmitting to pick up retried tasks.') + try: + job_status, job_id = pipe.submit_to_scheduler() + if job_status == 'submitted' and job_id: + pipe.scheduler_job_id = job_id + pipe.status = PipeRunState.SUBMITTED + pipe.submitted_at = time.time() + pipe._needs_resubmission = False + pipe._save_run_metadata() + logger.info(f'Pipe run {run_id}: resubmitted as job {job_id}.') + else: + pipe._needs_resubmission = False + except Exception: + logger.warning(f'Pipe run {run_id}: resubmission failed.', exc_info=True) + if pipe.status in (PipeRunState.COMPLETED, PipeRunState.COMPLETED_PARTIAL): + self.ingest_pipe_results(pipe) + del self.active_pipes[run_id] + elif pipe.status == PipeRunState.FAILED: + logger.error(f'Pipe run {run_id} has FAILED status. ' + f'Ingesting any available results and removing from active pipes.') + self.ingest_pipe_results(pipe) + del self.active_pipes[run_id] + + def ingest_pipe_results(self, pipe: PipeRun) -> None: + """ + Ingest results from a terminal pipe run. + + Dispatches by task_family. One broken task does not abort + ingestion of remaining tasks. 
+ """ + for spec in pipe.tasks: + try: + state = read_task_state(pipe.pipe_root, spec.task_id) + except (FileNotFoundError, ValueError, KeyError): + logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'could not read state, skipping.') + continue + if state.status == TaskState.COMPLETED.value: + ingest_completed_task(pipe.run_id, pipe.pipe_root, spec, state, + self.sched.species_dict, self.sched.output) + elif state.status == TaskState.FAILED_TERMINAL.value: + logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'failed terminally (failure_class={state.failure_class}). ' + f'Manual troubleshooting required.') + elif state.status == TaskState.CANCELLED.value: + logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'was cancelled.') diff --git a/arc/job/pipe/pipe_coordinator_test.py b/arc/job/pipe/pipe_coordinator_test.py new file mode 100644 index 0000000000..fe26ea0998 --- /dev/null +++ b/arc/job/pipe/pipe_coordinator_test.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +This module contains unit tests for the arc.job.pipe.pipe_coordinator module +""" + +import os +import shutil +import tempfile +import time +import unittest +from unittest.mock import MagicMock, patch + +from arc.job.pipe.pipe_coordinator import PipeCoordinator +from arc.job.pipe.pipe_run import PipeRun +from arc.job.pipe.pipe_state import ( + PipeRunState, + TaskState, + TaskSpec, + update_task_state, +) +from arc.species import ARCSpecies + + +def _make_spec(task_id, task_family='conf_opt', engine='mockter', level=None, + species_label='H2O', conformer_index=0, cores=4, mem=2048): + """Helper to create a TaskSpec for testing.""" + spc = ARCSpecies(label=species_label, smiles='O') + return TaskSpec( + task_id=task_id, + task_family=task_family, + owner_type='species', + owner_key=species_label, + input_fingerprint=f'{task_id}_fp', + engine=engine, + level=level or {'method': 'mock', 'basis': 'mock'}, + required_cores=cores, + 
required_memory_mb=mem, + input_payload={'species_dicts': [spc.as_dict()]}, + ingestion_metadata={'conformer_index': conformer_index}, + ) + + +def _make_mock_sched(project_directory): + """Create a mock Scheduler with the attributes PipeCoordinator needs.""" + sched = MagicMock() + sched.project_directory = project_directory + spc = ARCSpecies(label='H2O', smiles='O') + spc.conformers = [None] * 5 + spc.conformer_energies = [None] * 5 + sched.species_dict = {'H2O': spc} + sched.output = {'H2O': {'paths': {}, 'job_types': {}}} + return sched + + +def _complete_task(pipe_root, task_id): + """Drive a task through the full lifecycle to COMPLETED.""" + now = time.time() + update_task_state(pipe_root, task_id, new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe_root, task_id, new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe_root, task_id, new_status=TaskState.COMPLETED, ended_at=now) + + +class TestShouldUsePipe(unittest.TestCase): + """Tests for PipeCoordinator.should_use_pipe().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_test_') + self.coord = PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_true_for_homogeneous_batch(self): + tasks = [_make_spec(f't_{i}') for i in range(15)] + self.assertTrue(self.coord.should_use_pipe(tasks)) + + def test_false_below_threshold(self): + tasks = [_make_spec(f't_{i}') for i in range(5)] + self.assertFalse(self.coord.should_use_pipe(tasks)) + + def test_false_for_empty_list(self): + self.assertFalse(self.coord.should_use_pipe([])) + + def test_false_for_heterogeneous_engine(self): + tasks = [_make_spec(f't_{i}') for i in range(15)] + tasks[0] = _make_spec('t_0', engine='gaussian') + self.assertFalse(self.coord.should_use_pipe(tasks)) + + def test_false_for_heterogeneous_level(self): + tasks = [_make_spec(f't_{i}') 
for i in range(15)] + tasks[3] = _make_spec('t_3', level={'method': 'b3lyp', 'basis': 'sto-3g'}) + self.assertFalse(self.coord.should_use_pipe(tasks)) + + def test_false_for_heterogeneous_family(self): + tasks = [_make_spec(f't_{i}') for i in range(15)] + tasks[0] = _make_spec('t_0', task_family='conf_sp') + self.assertFalse(self.coord.should_use_pipe(tasks)) + + @patch('arc.job.pipe.pipe_coordinator.pipe_settings', {'enabled': False, 'min_tasks': 10}) + def test_false_when_disabled(self): + tasks = [_make_spec(f't_{i}') for i in range(15)] + self.assertFalse(self.coord.should_use_pipe(tasks)) + + +class TestSubmitPipeRun(unittest.TestCase): + """Tests for PipeCoordinator.submit_pipe_run().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_submit_') + self.coord = PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_submit_returns_pipe_run(self): + tasks = [_make_spec(f't_{i}') for i in range(3)] + pipe = self.coord.submit_pipe_run('run_001', tasks) + self.assertIsInstance(pipe, PipeRun) + self.assertIn('run_001', self.coord.active_pipes) + self.assertIs(self.coord.active_pipes['run_001'], pipe) + + def test_submit_stages_on_disk(self): + tasks = [_make_spec(f't_{i}') for i in range(2)] + pipe = self.coord.submit_pipe_run('run_disk', tasks) + self.assertTrue(os.path.isdir(pipe.pipe_root)) + for t in tasks: + self.assertTrue(os.path.isfile( + os.path.join(pipe.pipe_root, 'tasks', t.task_id, 'spec.json'))) + + def test_submit_uses_explicit_cluster_software(self): + tasks = [_make_spec('t_0')] + pipe = self.coord.submit_pipe_run('run_pbs', tasks, cluster_software='pbs') + self.assertEqual(pipe.cluster_software, 'pbs') + + +class TestRegisterFromDir(unittest.TestCase): + """Tests for PipeCoordinator.register_pipe_run_from_dir().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_register_') + self.coord = 
PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_register_reconstructs(self): + tasks = [_make_spec(f't_{i}') for i in range(2)] + original = self.coord.submit_pipe_run('run_restore', tasks, cluster_software='pbs') + pipe_root = original.pipe_root + del self.coord.active_pipes['run_restore'] + restored = self.coord.register_pipe_run_from_dir(pipe_root) + self.assertIn('run_restore', self.coord.active_pipes) + self.assertEqual(restored.run_id, 'run_restore') + self.assertEqual(restored.cluster_software, 'pbs') + + +class TestPollPipes(unittest.TestCase): + """Tests for PipeCoordinator.poll_pipes().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_poll_') + self.coord = PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_poll_removes_completed_pipe(self): + pipe = self.coord.submit_pipe_run('run_done', [_make_spec('t_done')]) + _complete_task(pipe.pipe_root, 't_done') + self.coord.poll_pipes() + self.assertNotIn('run_done', self.coord.active_pipes) + + def test_poll_keeps_pending_pipe(self): + self.coord.submit_pipe_run('run_pending', [_make_spec('t_pending')]) + self.coord.poll_pipes() + self.assertIn('run_pending', self.coord.active_pipes) + + def test_poll_removes_failed_pipe(self): + pipe = self.coord.submit_pipe_run('run_fail', [_make_spec('t_fail')]) + pipe.status = PipeRunState.FAILED + pipe._save_run_metadata() + self.coord.poll_pipes() + self.assertNotIn('run_fail', self.coord.active_pipes) + + def test_poll_removes_after_repeated_reconcile_failures(self): + pipe = self.coord.submit_pipe_run('run_stuck', [_make_spec('t_stuck')]) + with patch.object(pipe, 'reconcile', side_effect=RuntimeError('corrupt')): + for _ in range(3): + self.coord.poll_pipes() + self.assertNotIn('run_stuck', self.coord.active_pipes) + + def test_poll_resets_failure_count_on_success(self): + pipe 
= self.coord.submit_pipe_run('run_flaky', [_make_spec('t_flaky')]) + with patch.object(pipe, 'reconcile', side_effect=RuntimeError('transient')): + self.coord.poll_pipes() + self.assertEqual(self.coord._pipe_poll_failures.get('run_flaky'), 1) + self.coord.poll_pipes() # succeeds this time + self.assertNotIn('run_flaky', self.coord._pipe_poll_failures) + + +class TestIngestPipeResults(unittest.TestCase): + """Tests for PipeCoordinator.ingest_pipe_results().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_ingest_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_ingest_completed_task(self): + task = _make_spec('t_ingest', conformer_index=2) + pipe = self.coord.submit_pipe_run('run_ingest', [task]) + _complete_task(pipe.pipe_root, 't_ingest') + with patch('arc.job.pipe.pipe_coordinator.ingest_completed_task') as mock_ingest: + self.coord.ingest_pipe_results(pipe) + mock_ingest.assert_called_once() + + def test_ingest_skips_unreadable_state(self): + """Ingestion continues when a task's state.json is missing.""" + task = _make_spec('t_missing') + pipe = PipeRun(project_directory=self.tmpdir, run_id='run_missing', + tasks=[task], cluster_software='slurm') + pipe.stage() + # Remove state.json to simulate corruption + os.remove(os.path.join(pipe.pipe_root, 'tasks', 't_missing', 'state.json')) + self.coord.ingest_pipe_results(pipe) # should not raise + + +if __name__ == '__main__': + unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/pipe/pipe_planner.py b/arc/job/pipe/pipe_planner.py new file mode 100644 index 0000000000..d01206db2a --- /dev/null +++ b/arc/job/pipe/pipe_planner.py @@ -0,0 +1,312 @@ +""" +Pipe task planner — family-specific routing from ARC objects to pipe task batches. 
+ +Translates scheduler-level decisions ("should we pipe these conformers?") into +homogeneous ``TaskSpec`` batches and submits them through a ``PipeCoordinator``. + +Each ``try_pipe_*`` method returns the **exact subset of items it handled** +(e.g., rotor indices, species labels, conformer indices). The scheduler +uses this to skip only the work that was actually piped, and immediately +falls back for the remainder. + +This module owns the family-specific logic for: + - choosing level / adapter + - rejecting incore adapters + - building TaskSpecs + - deriving cluster software + - checking pipe eligibility and submitting + +The Scheduler decides *when* to try pipe mode; this module decides *how*. + +Note on TSG: + ``try_pipe_tsg`` is implemented but **not wired** into ``spawn_ts_jobs()`` + because TSG methods are typically few per reaction (3-5 adapters), rarely + hitting ``min_tasks``. Wire when workload stats justify it. +""" + +from collections import Counter +from typing import TYPE_CHECKING, Callable, List, Set, Tuple + +from arc.common import get_logger +from arc.imports import settings +from arc.job.adapters.common import default_incore_adapters +from arc.level import Level + +from arc.job.pipe.pipe_run import ( + build_conformer_pipe_tasks, + build_rotor_scan_1d_tasks, + build_species_leaf_task, + build_ts_opt_tasks, + build_tsg_tasks, + derive_cluster_software, +) +from arc.job.pipe.pipe_state import TaskSpec + +if TYPE_CHECKING: + from arc.job.pipe.pipe_coordinator import PipeCoordinator + from arc.reaction import ARCReaction + from arc.scheduler import Scheduler + +logger = get_logger() + +pipe_settings = settings['pipe_settings'] + + +class PipePlanner: + """ + Family-specific pipe routing from ARC objects to pipe task batches. + + Each ``try_pipe_*`` method returns the handled subset so the scheduler + can fall back only for the remainder. 
The generic ``_try_pipe_job`` + captures the repeated routing pattern; individual methods supply the + task-building callable and family-specific preconditions. + + Args: + sched: The owning Scheduler instance. + coordinator: The PipeCoordinator that owns active pipe runs. + """ + + def __init__(self, sched: 'Scheduler', coordinator: 'PipeCoordinator'): + self.sched = sched + self.coordinator = coordinator + + @property + def _memory_mb(self) -> int: + return int(self.sched.memory * 1024) + + def _level_dict(self, level) -> dict: + return level.as_dict() if isinstance(level, Level) else Level(repr=level).as_dict() + + # ------------------------------------------------------------------ + # Generic routing helper + # ------------------------------------------------------------------ + + def _try_pipe_job(self, + run_id: str, + level, + job_type: str, + build_tasks_fn: Callable[..., List[TaskSpec]], + log_msg: str, + ) -> bool: + """ + Generic pipe routing: deduce adapter, reject incore, build tasks, + check eligibility, derive cluster software, log, and submit. + + Returns ``True`` if the batch was submitted, ``False`` otherwise. + Family wrappers translate this bool into the appropriate handled-subset + return value (all-or-nothing for families routed through this helper). 
+ """ + job_adapter = self.sched.deduce_job_adapter(level=Level(repr=level), job_type=job_type) + if job_adapter in default_incore_adapters: + return False + tasks = build_tasks_fn(job_adapter) + if not self.coordinator.should_use_pipe(tasks): + return False + cs = derive_cluster_software(self.sched.ess_settings, job_adapter) + logger.info(f'{log_msg} (engine={job_adapter}, cluster={cs}).') + self.coordinator.submit_pipe_run(run_id, tasks, cluster_software=cs) + return True + + # ------------------------------------------------------------------ + # Family-specific routing — each returns the handled subset + # ------------------------------------------------------------------ + + def try_pipe_conformers(self, label: str) -> Set[int]: + """ + Route conformer optimization through pipe mode. + + Returns: + set[int]: Conformer indices that were piped (all or empty). + """ + level = self.sched.conformer_opt_level + n_conformers = len(self.sched.species_dict[label].conformers) + submitted = self._try_pipe_job( + run_id=f'{label}_conf_opt', + level=level, + job_type='conf_opt', + build_tasks_fn=lambda adapter: build_conformer_pipe_tasks( + self.sched.species_dict[label], label, 'conf_opt', + self._level_dict(level), adapter, self._memory_mb), + log_msg=f'Routing {n_conformers} conformer optimizations for {label} to pipe mode', + ) + return set(range(n_conformers)) if submitted else set() + + def try_pipe_conf_sp(self, label: str, conformer_indices: List[int]) -> Set[int]: + """ + Route conformer SP jobs through pipe mode for the given candidate indices. + + Args: + label: The species label. + conformer_indices: The exact conformer indices to consider for piping. + Only these indices will be built into tasks; the returned handled + set is always a subset of this input. + + Returns: + set[int]: Conformer indices that were piped (all supplied or empty). 
+ """ + if not conformer_indices: + return set() + if not self.sched.job_types.get('conf_sp') or self.sched.conformer_sp_level is None: + return set() + if self.sched.conformer_sp_level == self.sched.conformer_opt_level: + return set() + level = self.sched.conformer_sp_level + candidate_set = set(conformer_indices) + submitted = self._try_pipe_job( + run_id=f'{label}_conf_sp', + level=level, + job_type='conf_sp', + build_tasks_fn=lambda adapter: build_conformer_pipe_tasks( + self.sched.species_dict[label], label, 'conf_sp', + self._level_dict(level), adapter, self._memory_mb, + conformer_indices=sorted(candidate_set)), + log_msg=f'Routing {len(candidate_set)} conformer SP jobs for {label} to pipe mode', + ) + return candidate_set if submitted else set() + + def try_pipe_tsg(self, rxn: 'ARCReaction', methods: List[str]) -> Set[str]: + """ + Route TSG methods through pipe mode, grouped by method. + + TSG is a special case: it loops over methods and may create multiple + pipe runs, so it does not use ``_try_pipe_job``. + + **Intentionally not wired** into ``Scheduler.spawn_ts_jobs()``. + This is not an omission. TSG methods are typically few per reaction + (3-5 adapters), so per-method counts rarely reach ``min_tasks``. + Future multi-reaction or global TSG batching could revisit this + decision if workload statistics show enough same-method TSG tasks + across reactions to justify pipe-mode submission. + + Args: + rxn: The reaction whose TS guesses are being generated. + methods: The exact list of TSG method names to consider. + + Returns: + set[str]: Method names that were piped (subset of ``methods``). 
+ """ + ts_label = rxn.ts_label + method_counts = Counter(methods) + piped_methods = set() + for method, count in method_counts.items(): + if count < pipe_settings.get('min_tasks', 10): + continue + tasks = build_tsg_tasks(ts_label, method, count, rxn.as_dict(), self._memory_mb) + if not self.coordinator.should_use_pipe(tasks): + continue + cs = derive_cluster_software(self.sched.ess_settings, method) + logger.info(f'Routing {count} TSG {method} tasks for {ts_label} to pipe mode.') + self.coordinator.submit_pipe_run(f'{ts_label}_tsg_{method}', tasks, cluster_software=cs) + piped_methods.add(method) + return piped_methods + + def try_pipe_ts_opt(self, label: str, xyzs: List[dict], level) -> Set[int]: + """ + Route TS optimization jobs through pipe mode. + + Returns: + set[int]: TS guess indices that were piped (all or empty). + """ + submitted = self._try_pipe_job( + run_id=f'{label}_ts_opt', + level=level, + job_type='opt', + build_tasks_fn=lambda adapter: build_ts_opt_tasks( + self.sched.species_dict[label], label, xyzs, + self._level_dict(level), adapter, self._memory_mb), + log_msg=f'Routing {len(xyzs)} TS opt jobs for {label} to pipe mode', + ) + return set(range(len(xyzs))) if submitted else set() + + def try_pipe_species_sp(self, labels: List[str]) -> Set[str]: + """ + Batch species SP jobs through pipe mode. + + Returns: + set[str]: Species labels that were piped (all or empty). + """ + level = self.sched.sp_level + submitted = self._try_pipe_job( + run_id='species_sp_batch', + level=level, + job_type='sp', + build_tasks_fn=lambda adapter: [ + build_species_leaf_task(self.sched.species_dict[lbl], lbl, 'species_sp', + self._level_dict(level), adapter, self._memory_mb) + for lbl in labels], + log_msg=f'Routing {len(labels)} species SP jobs to pipe mode', + ) + return set(labels) if submitted else set() + + def try_pipe_species_freq(self, labels: List[str]) -> Set[str]: + """ + Batch species freq jobs through pipe mode. 
+ + Returns: + set[str]: Species labels that were piped (all or empty). + """ + level = self.sched.freq_level + submitted = self._try_pipe_job( + run_id='species_freq_batch', + level=level, + job_type='freq', + build_tasks_fn=lambda adapter: [ + build_species_leaf_task(self.sched.species_dict[lbl], lbl, 'species_freq', + self._level_dict(level), adapter, self._memory_mb) + for lbl in labels], + log_msg=f'Routing {len(labels)} species freq jobs to pipe mode', + ) + return set(labels) if submitted else set() + + def try_pipe_irc(self, labels_and_directions: List[Tuple[str, str]]) -> Set[Tuple[str, str]]: + """ + Batch IRC jobs through pipe mode. + + Returns: + set[tuple[str, str]]: ``(label, direction)`` pairs that were piped (all or empty). + """ + level = self.sched.irc_level + if not level: + return set() + + def _build_irc_tasks(adapter): + tasks = [] + for label, direction in labels_and_directions: + task = build_species_leaf_task( + self.sched.species_dict[label], label, 'irc', + self._level_dict(level), adapter, self._memory_mb, + extra_ingestion={'irc_direction': direction}) + task.task_id = f'{label}_irc_{direction}' + task.input_fingerprint = f'{label}_irc_{direction}' + tasks.append(task) + return tasks + + submitted = self._try_pipe_job( + run_id='irc_batch', + level=level, + job_type='irc', + build_tasks_fn=_build_irc_tasks, + log_msg=f'Routing {len(labels_and_directions)} IRC jobs to pipe mode', + ) + return set(labels_and_directions) if submitted else set() + + def try_pipe_rotor_scans_1d(self, label: str, rotor_indices: List[int]) -> Set[int]: + """ + Batch 1D rotor scan jobs through pipe mode. + + Returns: + set[int]: Rotor indices that were piped (all or empty). 
+ """ + level = self.sched.scan_level + if level is None: + return set() + submitted = self._try_pipe_job( + run_id=f'{label}_scan_1d', + level=level, + job_type='scan', + build_tasks_fn=lambda adapter: build_rotor_scan_1d_tasks( + self.sched.species_dict[label], label, rotor_indices, + self._level_dict(level), adapter, self._memory_mb), + log_msg=f'Routing {len(rotor_indices)} 1D rotor scans for {label} to pipe mode', + ) + return set(rotor_indices) if submitted else set() diff --git a/arc/job/pipe/pipe_planner_test.py b/arc/job/pipe/pipe_planner_test.py new file mode 100644 index 0000000000..f550556728 --- /dev/null +++ b/arc/job/pipe/pipe_planner_test.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +This module contains unit tests for the arc.job.pipe.pipe_planner module +""" + +import shutil +import tempfile +import unittest +from unittest.mock import MagicMock + +from arc.job.pipe.pipe_coordinator import PipeCoordinator +from arc.job.pipe.pipe_planner import PipePlanner +from arc.level import Level +from arc.species import ARCSpecies + + +def _make_mock_sched(project_directory): + """Create a mock Scheduler with attributes the planner needs.""" + sched = MagicMock() + sched.project_directory = project_directory + sched.memory = 14.0 + sched.conformer_opt_level = Level(method='b97d3', basis='6-31+g(d,p)') + sched.conformer_sp_level = Level(method='wb97xd', basis='def2-tzvp') + sched.sp_level = Level(method='wb97xd', basis='def2-tzvp') + sched.freq_level = Level(method='wb97xd', basis='def2-tzvp') + sched.scan_level = Level(method='wb97xd', basis='def2-tzvp') + sched.irc_level = Level(method='wb97xd', basis='def2-tzvp') + sched.ess_settings = {'gaussian': ['server1']} + sched.job_types = {'conf_opt': True, 'conf_sp': True, 'opt': True, + 'freq': True, 'sp': True, 'rotors': True} + spc = ARCSpecies(label='H2O', smiles='O') + spc.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(12)] + 
spc.conformer_energies = [None] * 12 + spc.rotors_dict = {i: {'torsion': [0, 1, 2, 3], 'success': None} + for i in range(12)} + sched.species_dict = {'H2O': spc} + sched.output = {'H2O': {'paths': {}, 'job_types': {}}} + sched.deduce_job_adapter = MagicMock(return_value='gaussian') + return sched + + +class TestTryPipeConformers(unittest.TestCase): + """Tests for PipePlanner.try_pipe_conformers().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_test_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_when_enough_conformers(self): + """12 conformers exceeds threshold, all indices should be piped.""" + handled = self.planner.try_pipe_conformers('H2O') + self.assertEqual(handled, set(range(12))) + self.assertEqual(len(self.coord.active_pipes), 1) + run_id = list(self.coord.active_pipes.keys())[0] + self.assertIn('H2O', run_id) + self.assertIn('conf_opt', run_id) + + def test_no_pipe_for_few_conformers(self): + """5 conformers is below threshold.""" + self.sched.species_dict['H2O'].conformers = [None] * 5 + handled = self.planner.try_pipe_conformers('H2O') + self.assertEqual(handled, set()) + self.assertEqual(len(self.coord.active_pipes), 0) + + def test_no_pipe_for_incore_adapter(self): + """Incore adapters should not use pipe.""" + self.sched.deduce_job_adapter.return_value = 'torchani' + handled = self.planner.try_pipe_conformers('H2O') + self.assertEqual(handled, set()) + + def test_task_specs_have_correct_metadata(self): + """Verify built TaskSpecs have the expected fields.""" + self.planner.try_pipe_conformers('H2O') + pipe = list(self.coord.active_pipes.values())[0] + spec = pipe.tasks[0] + self.assertEqual(spec.task_family, 'conf_opt') + self.assertEqual(spec.owner_type, 'species') + self.assertEqual(spec.owner_key, 'H2O') + 
self.assertIn('conformer_index', spec.ingestion_metadata) + self.assertIsNotNone(spec.level) + self.assertIn('species_dicts', spec.input_payload) + + +class TestTryPipeConfSp(unittest.TestCase): + """Tests for PipePlanner.try_pipe_conf_sp().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_confsp_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_conf_sp(self): + handled = self.planner.try_pipe_conf_sp('H2O', list(range(12))) + self.assertEqual(handled, set(range(12))) + + def test_no_pipe_when_disabled(self): + self.sched.job_types['conf_sp'] = False + handled = self.planner.try_pipe_conf_sp('H2O', list(range(12))) + self.assertEqual(handled, set()) + + def test_no_pipe_when_same_level(self): + self.sched.conformer_sp_level = self.sched.conformer_opt_level + handled = self.planner.try_pipe_conf_sp('H2O', list(range(12))) + self.assertEqual(handled, set()) + + def test_no_pipe_for_empty_indices(self): + handled = self.planner.try_pipe_conf_sp('H2O', []) + self.assertEqual(handled, set()) + + +class TestTryPipeTsOpt(unittest.TestCase): + """Tests for PipePlanner.try_pipe_ts_opt().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_tsopt_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_ts_opt(self): + xyzs = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(12)] + level = Level(method='wb97xd', basis='def2-tzvp') + handled = self.planner.try_pipe_ts_opt('H2O', xyzs, level) + self.assertEqual(handled, set(range(12))) + pipe = list(self.coord.active_pipes.values())[0] + 
self.assertEqual(pipe.tasks[0].task_family, 'ts_opt') + + def test_no_pipe_below_threshold(self): + xyzs = [{'symbols': ('O',), 'isotopes': (16,), 'coords': ((0, 0, 0),)}] * 5 + level = Level(method='wb97xd', basis='def2-tzvp') + handled = self.planner.try_pipe_ts_opt('H2O', xyzs, level) + self.assertEqual(handled, set()) + + +class TestTryPipeSpeciesSp(unittest.TestCase): + """Tests for PipePlanner.try_pipe_species_sp().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_sp_') + self.sched = _make_mock_sched(self.tmpdir) + # Add more species to exceed threshold + for i in range(12): + lbl = f'spc_{i}' + self.sched.species_dict[lbl] = ARCSpecies(label=lbl, smiles='O') + self.sched.output[lbl] = {'paths': {}, 'job_types': {}} + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_species_sp(self): + labels = [f'spc_{i}' for i in range(12)] + handled = self.planner.try_pipe_species_sp(labels) + self.assertEqual(handled, set(labels)) + pipe = list(self.coord.active_pipes.values())[0] + self.assertEqual(pipe.tasks[0].task_family, 'species_sp') + + def test_no_pipe_below_threshold(self): + handled = self.planner.try_pipe_species_sp(['spc_0', 'spc_1']) + self.assertEqual(handled, set()) + + +class TestTryPipeIrc(unittest.TestCase): + """Tests for PipePlanner.try_pipe_irc().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_irc_') + self.sched = _make_mock_sched(self.tmpdir) + for i in range(12): + lbl = f'ts_{i}' + self.sched.species_dict[lbl] = ARCSpecies(label=lbl, smiles='O', is_ts=True) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_irc(self): + pairs = [(f'ts_{i}', 'forward') for i in range(12)] + handled = self.planner.try_pipe_irc(pairs) + 
self.assertEqual(handled, set(pairs)) + pipe = list(self.coord.active_pipes.values())[0] + self.assertEqual(pipe.tasks[0].task_family, 'irc') + self.assertEqual(pipe.tasks[0].ingestion_metadata['irc_direction'], 'forward') + + def test_no_pipe_when_no_irc_level(self): + self.sched.irc_level = None + handled = self.planner.try_pipe_irc([(f'ts_{i}', 'forward') for i in range(12)]) + self.assertEqual(handled, set()) + + +class TestTryPipeRotorScans(unittest.TestCase): + """Tests for PipePlanner.try_pipe_rotor_scans_1d().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_scan_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_scans(self): + handled = self.planner.try_pipe_rotor_scans_1d('H2O', list(range(12))) + self.assertEqual(handled, set(range(12))) + pipe = list(self.coord.active_pipes.values())[0] + self.assertEqual(pipe.tasks[0].task_family, 'rotor_scan_1d') + self.assertIn('torsions', pipe.tasks[0].input_payload) + self.assertIn('rotor_index', pipe.tasks[0].ingestion_metadata) + + def test_no_pipe_below_threshold(self): + handled = self.planner.try_pipe_rotor_scans_1d('H2O', [0, 1, 2]) + self.assertEqual(handled, set()) + + def test_no_pipe_when_no_scan_level(self): + self.sched.scan_level = None + handled = self.planner.try_pipe_rotor_scans_1d('H2O', list(range(12))) + self.assertEqual(handled, set()) + + +class TestTryPipeTsg(unittest.TestCase): + """Tests for PipePlanner.try_pipe_tsg().""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_planner_tsg_') + self.sched = _make_mock_sched(self.tmpdir) + self.coord = PipeCoordinator(self.sched) + self.planner = PipePlanner(self.sched, self.coord) + self.rxn = MagicMock() + self.rxn.ts_label = 'TS0' + self.rxn.as_dict.return_value = {'label': 'rxn_1'} + self.sched.species_dict['TS0'] = 
ARCSpecies(label='TS0', smiles='O', is_ts=True) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_tsg_when_enough_same_method(self): + """10+ instances of the same method triggers pipe.""" + methods = ['heuristics'] * 12 + handled = self.planner.try_pipe_tsg(self.rxn, methods) + self.assertEqual(handled, {'heuristics'}) + + def test_no_pipe_for_few_methods(self): + """Typical 3-method list stays below threshold.""" + methods = ['heuristics', 'kinbot', 'autotst'] + handled = self.planner.try_pipe_tsg(self.rxn, methods) + self.assertEqual(handled, set()) + + def test_mixed_methods_only_pipe_large_groups(self): + """Only the method with 12 instances gets piped.""" + methods = ['heuristics'] * 12 + ['kinbot'] * 3 + handled = self.planner.try_pipe_tsg(self.rxn, methods) + self.assertEqual(handled, {'heuristics'}) + self.assertNotIn('kinbot', handled) + + +if __name__ == '__main__': + unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py new file mode 100644 index 0000000000..4f23951e8e --- /dev/null +++ b/arc/job/pipe/pipe_run.py @@ -0,0 +1,826 @@ +""" +A module for the PipeRun orchestrator, task-spec routing, and result ingestion. + +Contains: + - ``PipeRun``: manages the lifecycle of a pipe run (staging, submit-script + generation, reconciliation with orphan detection and retry scheduling). + - Ingestion helpers: dispatch completed pipe task results back into ARC's + species/output state by task family. + - Routing helpers: build ``TaskSpec`` objects and decide whether to submit + a pipe run for various task families. + +All QA, troubleshooting, and downstream branching remain in mother ARC. 
+""" + +import json +import os +import stat +import sys +import time +from typing import Dict, List, Optional + +import arc.parser.parser as parser +from arc.common import get_logger +from arc.imports import pipe_submit, settings + +from arc.job.pipe.pipe_state import ( + PipeRunState, + TaskState, + TaskSpec, + get_task_attempt_dir, + initialize_task, + read_task_state, + update_task_state, +) + +logger = get_logger() + +pipe_settings = settings['pipe_settings'] +default_job_settings = settings['default_job_settings'] +servers_dict = settings['servers'] + + +class PipeRun: + """ + Orchestrator for a pipe run. + + Args: + project_directory (str): Path to the ARC project directory. + run_id (str): Unique identifier for this pipe run. + tasks (List[TaskSpec]): Task specifications to execute. + cluster_software (str): Cluster scheduler type. + max_workers (int): Maximum number of concurrent array workers. + max_attempts (int): Maximum retry attempts per task. + """ + + def __init__(self, + project_directory: str, + run_id: str, + tasks: List[TaskSpec], + cluster_software: str, + max_workers: int = 100, + max_attempts: int = 3, + ): + self.project_directory = project_directory + self.run_id = run_id + self.tasks = tasks + self.cluster_software = cluster_software + self.max_workers = max_workers + self.max_attempts = max_attempts + self.pipe_root = os.path.join(project_directory, 'runs', 'pipe_' + run_id) + self.status = PipeRunState.CREATED + self.created_at = time.time() + self.submitted_at = None + self.completed_at = None + self.scheduler_job_id = None + + def _save_run_metadata(self) -> None: + """Write run-level metadata to ``run.json`` under ``self.pipe_root``.""" + os.makedirs(self.pipe_root, exist_ok=True) + run_path = os.path.join(self.pipe_root, 'run.json') + # Derive homogeneous fields from tasks when all tasks agree. 
        # Run-level family/engine/level are recorded only when every task
        # agrees on the value; a heterogeneous list leaves them as None.
        task_family = None
        engine = None
        level = None
        if self.tasks:
            families = {t.task_family for t in self.tasks}
            if len(families) == 1:
                task_family = families.pop()
            engines = {t.engine for t in self.tasks}
            if len(engines) == 1:
                engine = engines.pop()
            # level may be an unhashable dict, so compare pairwise rather
            # than collapsing through a set as done for family/engine.
            levels = [t.level for t in self.tasks]
            if levels and all(l == levels[0] for l in levels):
                level = levels[0]
        data = {
            'run_id': self.run_id,
            'pipe_root': self.pipe_root,
            'status': self.status.value,
            'cluster_software': self.cluster_software,
            'max_workers': self.max_workers,
            'max_attempts': self.max_attempts,
            'task_family': task_family,
            'engine': engine,
            'level': level,
            'created_at': self.created_at,
            'submitted_at': self.submitted_at,
            'completed_at': self.completed_at,
            'scheduler_job_id': self.scheduler_job_id,
        }
        # Write to a temp file then os.replace() so readers never observe a
        # partially-written run.json (rename is atomic on POSIX).
        tmp_path = run_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, run_path)

    @classmethod
    def from_dir(cls, pipe_root: str) -> 'PipeRun':
        """
        Reconstruct a PipeRun from an existing run directory.

        Args:
            pipe_root: Path to the pipe run root directory.

        Returns:
            PipeRun: The reconstructed run object.
+ """ + run_path = os.path.join(pipe_root, 'run.json') + with open(run_path, 'r') as f: + data = json.load(f) + tasks = [] + tasks_dir = os.path.join(pipe_root, 'tasks') + if os.path.isdir(tasks_dir): + for task_id in sorted(os.listdir(tasks_dir)): + spec_path = os.path.join(tasks_dir, task_id, 'spec.json') + if os.path.isfile(spec_path): + with open(spec_path, 'r') as f: + tasks.append(TaskSpec.from_dict(json.load(f))) + project_directory = os.path.dirname(os.path.dirname(pipe_root)) + run = cls( + project_directory=project_directory, + run_id=data['run_id'], + tasks=tasks, + cluster_software=data['cluster_software'], + max_workers=data.get('max_workers', 100), + max_attempts=data.get('max_attempts', 3), + ) + run.pipe_root = pipe_root + run.status = PipeRunState(data['status']) + run.created_at = data.get('created_at', 0) + run.submitted_at = data.get('submitted_at') + run.completed_at = data.get('completed_at') + run.scheduler_job_id = data.get('scheduler_job_id') + return run + + def stage(self) -> None: + """ + Create the pipe_root directory tree and initialize all tasks on disk. + + Validates that all tasks are homogeneous in ``task_family``, ``engine``, + and ``level`` before staging. Mixed conformer runs are rejected early. 
+ """ + if len(self.tasks) > 1: + ref = self.tasks[0] + for t in self.tasks[1:]: + if t.task_family != ref.task_family: + raise ValueError(f'PipeRun tasks must be homogeneous in task_family: ' + f'{ref.task_family} vs {t.task_family}') + if t.engine != ref.engine: + raise ValueError(f'PipeRun tasks must be homogeneous in engine: ' + f'{ref.engine} vs {t.engine}') + if t.level != ref.level: + raise ValueError(f'PipeRun tasks must be homogeneous in level: ' + f'{ref.level} vs {t.level}') + if t.required_cores != ref.required_cores: + raise ValueError(f'PipeRun tasks must be homogeneous in required_cores: ' + f'{ref.required_cores} vs {t.required_cores}') + if t.required_memory_mb != ref.required_memory_mb: + raise ValueError(f'PipeRun tasks must be homogeneous in required_memory_mb: ' + f'{ref.required_memory_mb} vs {t.required_memory_mb}') + os.makedirs(os.path.join(self.pipe_root, 'tasks'), exist_ok=True) + for spec in self.tasks: + initialize_task(self.pipe_root, spec, max_attempts=self.max_attempts) + self.status = PipeRunState.STAGED + self._save_run_metadata() + + def _submission_resources(self): + """ + Derive resource settings from the homogeneous task list. + + Returns: + Tuple[int, int, int]: ``(cpus, memory_mb, array_size)`` + """ + cpus = self.tasks[0].required_cores if self.tasks else 1 + memory_mb = self.tasks[0].required_memory_mb if self.tasks else 4096 + array_size = min(self.max_workers, len(self.tasks)) if self.tasks else self.max_workers + return cpus, memory_mb, array_size + + def write_submit_script(self) -> str: + """ + Generate an array submission script for the configured cluster scheduler. + + Formats a template from ``arc/settings/submit.py`` (the ``pipe_submit`` + dict, keyed by cluster scheduler type) and writes it under + ``self.pipe_root``. Rerunning safely overwrites the file. + + Returns: + str: Absolute path to the generated submit script. 
+ """ + template_key = 'sge' if self.cluster_software == 'oge' else self.cluster_software + if template_key not in pipe_submit: + raise NotImplementedError( + f'No pipe submit template for cluster software: {self.cluster_software}. ' + f'Available templates: {list(pipe_submit.keys())}') + cpus, memory_mb, array_size = self._submission_resources() + content = pipe_submit[template_key].format( + name=f'pipe_{self.run_id}', + max_task_num=array_size, + pipe_root=self.pipe_root, + python_exe=sys.executable, + cpus=cpus, + memory=memory_mb, + ) + filename = 'submit.sub' if self.cluster_software == 'htcondor' else 'submit.sh' + submit_path = os.path.join(self.pipe_root, filename) + tmp_path = submit_path + '.tmp' + with open(tmp_path, 'w') as f: + f.write(content) + os.replace(tmp_path, submit_path) + # Make shell scripts executable (not HTCondor .sub files). + if self.cluster_software != 'htcondor': + st = os.stat(submit_path) + os.chmod(submit_path, st.st_mode | stat.S_IXUSR | stat.S_IXGRP) + return submit_path + + def submit_to_scheduler(self): + """ + Submit the generated array script to the cluster scheduler. + + Uses ``arc.job.local.submit_job`` with the cluster software mapped + to the canonical casing expected by ``submit_command`` in settings. + + Returns: + Tuple[str, str]: ``(job_status, job_id)`` — ``'submitted'`` on + success, ``'errored'`` on failure. + """ + import shutil as _shutil + from arc.imports import settings as _settings + submit_command = _settings['submit_command'] + # Map lowercase cluster_software to the casing used in settings.submit_command + cs_map = {'slurm': 'Slurm', 'pbs': 'PBS', 'sge': 'OGE', 'oge': 'OGE', 'htcondor': 'HTCondor'} + canonical_cs = cs_map.get(self.cluster_software.lower(), self.cluster_software) + if canonical_cs not in submit_command: + logger.warning(f'No submit command configured for {canonical_cs}. 
Cannot submit.') + return 'errored', None + cmd_path = submit_command[canonical_cs].split()[0] + if not os.path.isfile(cmd_path) and _shutil.which(os.path.basename(cmd_path)) is None: + logger.warning(f'Submit command {cmd_path} not found. Cannot submit pipe run.') + return 'errored', None + from arc.job.local import submit_job as local_submit_job + filename = 'submit.sub' if self.cluster_software == 'htcondor' else 'submit.sh' + job_status, job_id = local_submit_job( + path=self.pipe_root, + cluster_soft=canonical_cs, + submit_filename=filename, + ) + return job_status, job_id + + def reconcile(self) -> Dict[str, int]: + """ + Poll all tasks, detect orphans, schedule retries, and check for completion. + Does not regress an already-terminal run status. + + Returns: + Dict[str, int]: Counts of tasks in each state. + """ + if self.status in (PipeRunState.COMPLETED, PipeRunState.COMPLETED_PARTIAL, PipeRunState.FAILED): + return self._count_task_states() + + self.status = PipeRunState.RECONCILING + self._save_run_metadata() + tasks_dir = os.path.join(self.pipe_root, 'tasks') + if not os.path.isdir(tasks_dir): + return {} + + now = time.time() + counts: Dict[str, int] = {s.value: 0 for s in TaskState} + task_ids = sorted(os.listdir(tasks_dir)) + + for task_id in task_ids: + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(self.pipe_root, task_id) + except (FileNotFoundError, ValueError, KeyError): + continue + current = TaskState(state.status) + if current in (TaskState.CLAIMED, TaskState.RUNNING) \ + and state.lease_expires_at is not None \ + and now > state.lease_expires_at: + try: + update_task_state(self.pipe_root, task_id, + new_status=TaskState.ORPHANED, + claimed_by=None, claim_token=None, + claimed_at=None, lease_expires_at=None) + current = TaskState.ORPHANED + except (ValueError, TimeoutError) as e: + logger.debug(f'Could not mark task {task_id} as ORPHANED ' + f'(another process may be handling it): {e}') + 
counts[current.value] += 1 + + active_workers = counts[TaskState.CLAIMED.value] + counts[TaskState.RUNNING.value] + retryable = counts[TaskState.FAILED_RETRYABLE.value] + counts[TaskState.ORPHANED.value] + total = sum(counts.values()) + + if active_workers == 0 and retryable > 0: + for task_id in task_ids: + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(self.pipe_root, task_id) + except (FileNotFoundError, ValueError, KeyError): + continue + current = TaskState(state.status) + if current not in (TaskState.FAILED_RETRYABLE, TaskState.ORPHANED): + continue + try: + if state.attempt_index + 1 < state.max_attempts: + update_task_state(self.pipe_root, task_id, + new_status=TaskState.PENDING, + attempt_index=state.attempt_index + 1, + claimed_by=None, claim_token=None, + claimed_at=None, lease_expires_at=None, + started_at=None, ended_at=None, + failure_class=None, retry_disposition=None) + counts[current.value] -= 1 + counts[TaskState.PENDING.value] += 1 + else: + ended = state.ended_at or now + update_task_state(self.pipe_root, task_id, + new_status=TaskState.FAILED_TERMINAL, + ended_at=ended) + counts[current.value] -= 1 + counts[TaskState.FAILED_TERMINAL.value] += 1 + except (ValueError, TimeoutError) as e: + logger.debug(f'Could not promote task {task_id} to FAILED_TERMINAL ' + f'(lock contention or concurrent state change): {e}') + + # If retries were scheduled but no workers remain, flag for resubmission. + pending_after_retry = counts[TaskState.PENDING.value] + active_after_retry = counts[TaskState.CLAIMED.value] + counts[TaskState.RUNNING.value] + if pending_after_retry > 0 and active_after_retry == 0: + self._needs_resubmission = True + logger.info(f'Pipe run {self.run_id}: {pending_after_retry} retryable tasks reset ' + f'to PENDING but no workers remain. 
Resubmission needed.') + else: + self._needs_resubmission = False + + terminal = (counts[TaskState.COMPLETED.value] + + counts[TaskState.FAILED_TERMINAL.value] + + counts[TaskState.CANCELLED.value]) + + if total > 0 and terminal == total: + failed = counts[TaskState.FAILED_TERMINAL.value] + counts[TaskState.CANCELLED.value] + if failed > 0: + self.status = PipeRunState.COMPLETED_PARTIAL + else: + self.status = PipeRunState.COMPLETED + self.completed_at = time.time() + self._save_run_metadata() + + return counts + + @property + def needs_resubmission(self) -> bool: + """Whether the run has PENDING retried tasks but no active workers.""" + return getattr(self, '_needs_resubmission', False) + + def _count_task_states(self) -> Dict[str, int]: + """Read all task states and return counts without modifying anything.""" + counts: Dict[str, int] = {s.value: 0 for s in TaskState} + tasks_dir = os.path.join(self.pipe_root, 'tasks') + if not os.path.isdir(tasks_dir): + return counts + for task_id in sorted(os.listdir(tasks_dir)): + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(self.pipe_root, task_id) + counts[state.status] += 1 + except (FileNotFoundError, ValueError, KeyError): + continue + return counts + + +# =========================================================================== +# Ingestion helpers +# =========================================================================== + +def find_output_file(attempt_dir: str, engine: str, task_id: str = '') -> Optional[str]: + """ + Find the output file for a completed task. + + Prefers the ``canonical_output_path`` stored in ``result.json`` (written + by the worker) before falling back to a filesystem walk through the + ``calcs/`` tree. This keeps ingestion fast and consistent with the + worker's own output discovery. + + Returns: + Path to the output file, or ``None`` if not found. + """ + # 1. 
Prefer result.json canonical path (written by worker) + result_path = os.path.join(attempt_dir, 'result.json') + if os.path.isfile(result_path): + try: + with open(result_path) as f: + result_data = json.load(f) + canonical = result_data.get('canonical_output_path') + if canonical and os.path.isfile(canonical): + return canonical + except (json.JSONDecodeError, OSError): + pass # Fall through to filesystem walk. + + # 2. Fallback: walk calcs/ tree for engine-specific output filename + output_filenames = settings.get('output_filenames', {}) + target_name = output_filenames.get(engine, 'output.out') + calcs_dir = os.path.join(attempt_dir, 'calcs') + if not os.path.isdir(calcs_dir): + logger.warning(f'Task {task_id}: no calcs/ directory in {attempt_dir} ' + f'(engine={engine}, expected={target_name})') + return None + for root, dirs, files in os.walk(calcs_dir): + if target_name in files: + return os.path.join(root, target_name) + logger.warning(f'Task {task_id}: {target_name} not found under {calcs_dir} ' + f'(engine={engine})') + return None + + +def ingest_completed_task(pipe_run_id: str, pipe_root: str, spec: TaskSpec, + state: 'TaskStateRecord', species_dict: dict, + output: dict) -> None: + """ + Ingest a single completed task, dispatched by ``task_family``. + + Called from ``Scheduler.ingest_pipe_results()`` for each completed task. + Mutates ``species_dict`` and ``output`` in place. 
+ """ + label = spec.owner_key + if not label: + logger.warning(f'Pipe run {pipe_run_id}, task {spec.task_id}: ' + f'missing owner_key, skipping.') + return + + if spec.task_family in ('conf_opt', 'conf_sp'): + if label not in species_dict: + logger.warning(f'Pipe run {pipe_run_id}, task {spec.task_id}: ' + f'species "{label}" not in species_dict, skipping.') + return + meta = spec.ingestion_metadata or {} + conformer_index = meta.get('conformer_index') + if conformer_index is None: + logger.warning(f'Pipe run {pipe_run_id}, task {spec.task_id}: ' + f'missing conformer_index in ingestion_metadata, skipping.') + return + if spec.task_family == 'conf_opt': + _ingest_conf_opt(pipe_run_id, pipe_root, spec, state, species_dict, label, conformer_index) + else: + _ingest_conf_sp(pipe_run_id, pipe_root, spec, state, species_dict, label, conformer_index) + elif spec.task_family == 'ts_guess_batch_method': + _ingest_ts_guess_batch(pipe_run_id, pipe_root, spec, state, species_dict, label) + elif spec.task_family == 'ts_opt': + _ingest_ts_opt(pipe_run_id, pipe_root, spec, state, species_dict, label) + elif spec.task_family == 'species_sp': + _ingest_species_sp(pipe_run_id, pipe_root, spec, state, species_dict, label) + elif spec.task_family == 'species_freq': + _ingest_species_freq(pipe_run_id, pipe_root, spec, state, species_dict, label, output) + elif spec.task_family == 'irc': + _ingest_irc(pipe_run_id, pipe_root, spec, state, species_dict, label, output) + elif spec.task_family == 'rotor_scan_1d': + _ingest_rotor_scan_1d(pipe_run_id, pipe_root, spec, state, species_dict, label) + + +def _ingest_conf_opt(run_id, pipe_root, spec, state, species_dict, label, conformer_index): + """Ingest a completed conf_opt task: update geometry and opt-level energy.""" + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + species = species_dict[label] + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + if output_file is None: + 
return + xyz = parser.parse_geometry(log_file_path=output_file) + e_elect = parser.parse_e_elect(log_file_path=output_file) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'parsing failed for {attempt_dir}: {type(e).__name__}: {e}') + return + if conformer_index < len(species.conformers) and xyz is not None: + species.conformers[conformer_index] = xyz + if conformer_index < len(species.conformer_energies) and e_elect is not None: + species.conformer_energies[conformer_index] = e_elect + + +def _ingest_conf_sp(run_id, pipe_root, spec, state, species_dict, label, conformer_index): + """Ingest a completed conf_sp task: update energy only.""" + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + species = species_dict[label] + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + if output_file is None: + return + e_elect = parser.parse_e_elect(log_file_path=output_file) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'parsing failed for {attempt_dir}: {type(e).__name__}: {e}') + return + if conformer_index < len(species.conformer_energies) and e_elect is not None: + species.conformer_energies[conformer_index] = e_elect + + +def _ingest_ts_guess_batch(run_id, pipe_root, spec, state, species_dict, label): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'TS species "{label}" not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'output lookup failed: {type(e).__name__}: {e}') + return + ts_species = species_dict[label] + if output_file is not None and hasattr(ts_species, 'process_completed_tsg_queue_jobs'): + try: + 
ts_species.process_completed_tsg_queue_jobs(path=output_file) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'TSG processing failed: {type(e).__name__}: {e}') + + +def _ingest_ts_opt(run_id, pipe_root, spec, state, species_dict, label): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'TS species "{label}" not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + ts_species = species_dict[label] + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + if output_file is None: + return + xyz = parser.parse_geometry(log_file_path=output_file) + e_elect = parser.parse_e_elect(log_file_path=output_file) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'parsing failed for {attempt_dir}: {type(e).__name__}: {e}') + return + if xyz is not None: + ts_species.final_xyz = xyz + if e_elect is not None: + ts_species.e_elect = e_elect + + +def _ingest_species_sp(run_id, pipe_root, spec, state, species_dict, label): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'species "{label}" not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + species = species_dict[label] + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + if output_file is None: + return + e_elect = parser.parse_e_elect(log_file_path=output_file) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'parsing failed for {attempt_dir}: {type(e).__name__}: {e}') + return + if e_elect is not None: + species.e_elect = e_elect + + +def _ingest_species_freq(run_id, pipe_root, spec, state, species_dict, label, output): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'species "{label}" 
not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'output lookup failed: {type(e).__name__}: {e}') + return + if output_file is not None: + if label not in output: + output[label] = {'paths': {}} + elif 'paths' not in output[label]: + output[label]['paths'] = {} + output[label]['paths']['freq'] = output_file + + +def _ingest_irc(run_id, pipe_root, spec, state, species_dict, label, output): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'TS species "{label}" not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'output lookup failed: {type(e).__name__}: {e}') + return + if output_file is not None: + if label not in output: + output[label] = {'paths': {'irc': []}} + elif 'paths' not in output[label]: + output[label]['paths'] = {'irc': []} + irc_paths = output[label]['paths'].get('irc', []) + irc_paths.append(output_file) + output[label]['paths']['irc'] = irc_paths + + +def _ingest_rotor_scan_1d(run_id, pipe_root, spec, state, species_dict, label): + if label not in species_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'species "{label}" not in species_dict, skipping.') + return + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) + try: + output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) + except Exception as e: + logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' + f'output lookup failed: {type(e).__name__}: {e}') + return + if output_file is None: + return + meta = 
spec.ingestion_metadata or {} + rotor_index = meta.get('rotor_index') + if rotor_index is None: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'missing rotor_index in ingestion_metadata for species "{label}", skipping.') + return + species = species_dict[label] + if not hasattr(species, 'rotors_dict') or not isinstance(species.rotors_dict, dict): + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'species "{label}" has no valid rotors_dict, skipping rotor_index={rotor_index}.') + return + if rotor_index not in species.rotors_dict: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'rotor_index={rotor_index} not found in rotors_dict for species "{label}", skipping.') + return + species.rotors_dict[rotor_index]['scan_path'] = output_file + + +# =========================================================================== +# Routing helpers +# =========================================================================== + +def derive_cluster_software(ess_settings: dict, job_adapter: str) -> str: + """ + Heuristic: derive cluster software from the first server configured + for this engine in ess_settings. Mirrors how run_job() picks its server. + + Returns a lowercase identifier matching the ``pipe_submit`` template keys + (e.g., ``'slurm'``, ``'pbs'``, ``'sge'``, ``'htcondor'``). + Maps ``'oge'`` to ``'sge'`` for template compatibility. + """ + cs_alias = {'oge': 'sge'} + for server_name in ess_settings.get(job_adapter, []): + if server_name in servers_dict and 'cluster_soft' in servers_dict[server_name]: + raw = servers_dict[server_name]['cluster_soft'].lower() + return cs_alias.get(raw, raw) + return 'slurm' + + +def build_conformer_pipe_tasks(species, label: str, task_family: str, + level_dict: dict, job_adapter: str, + memory_mb: int, + conformer_indices: Optional[List[int]] = None, + ) -> List[TaskSpec]: + """ + Build TaskSpec objects for conformer pipe tasks (conf_opt or conf_sp). 
+ + Args: + conformer_indices: If given, build tasks only for these indices. + If ``None``, build tasks for all conformers. + """ + cores = default_job_settings.get('job_cpu_cores', 8) + species_dict_payload = species.as_dict() + indices = conformer_indices if conformer_indices is not None else list(range(len(species.conformers))) + tasks = [] + for i in indices: + tasks.append(TaskSpec( + task_id=f'{label}_{task_family}_{i}', + task_family=task_family, + owner_type='species', + owner_key=label, + input_fingerprint=f'{label}_{task_family}_{i}', + engine=job_adapter, + level=level_dict, + required_cores=cores, + required_memory_mb=memory_mb, + input_payload={ + 'species_dicts': [species_dict_payload], + 'xyz': species.conformers[i], + 'conformer': i, + }, + ingestion_metadata={'conformer_index': i}, + )) + return tasks + + +def build_species_leaf_task(species, label: str, task_family: str, + level_dict: dict, job_adapter: str, + memory_mb: int, + extra_ingestion: Optional[dict] = None) -> TaskSpec: + """Build a single TaskSpec for a species-side leaf job (sp, freq, irc).""" + cores = default_job_settings.get('job_cpu_cores', 8) + meta = extra_ingestion or {} + return TaskSpec( + task_id=f'{label}_{task_family}', + task_family=task_family, + owner_type='species', + owner_key=label, + input_fingerprint=f'{label}_{task_family}', + engine=job_adapter, + level=level_dict, + required_cores=cores, + required_memory_mb=memory_mb, + input_payload={'species_dicts': [species.as_dict()]}, + ingestion_metadata=meta, + ) + + +def build_tsg_tasks(ts_label: str, method: str, count: int, + rxn_dict: dict, memory_mb: int) -> List[TaskSpec]: + """ + Build TaskSpec objects for one TSG method batch. + + Contract: + - ``engine`` is set to ``method`` (the TSG method name, e.g. 'heuristics'), + which is a registered ARC adapter — not a computational engine like 'gaussian'. + - ``level`` is ``{'method': method}`` by convention for TSG tasks. 
+ - ``owner_key`` is the TS species label (not a reaction key), consistent + with the species-ownership model used throughout the pipe system. + - Each task represents one method-batch member for one TS species/method group. + """ + cores = default_job_settings.get('job_cpu_cores', 8) + tasks = [] + for i in range(count): + tasks.append(TaskSpec( + task_id=f'{ts_label}_tsg_{method}_{i}', + task_family='ts_guess_batch_method', + owner_type='species', + owner_key=ts_label, + input_fingerprint=f'{ts_label}_tsg_{method}_{i}', + engine=method, + level={'method': method}, + required_cores=cores, + required_memory_mb=memory_mb, + input_payload={'reactions_dicts': [rxn_dict]}, + ingestion_metadata={'tsg_index': i, 'method': method}, + )) + return tasks + + +def build_ts_opt_tasks(species, label: str, xyzs: List[dict], + level_dict: dict, job_adapter: str, + memory_mb: int) -> List[TaskSpec]: + """Build TaskSpec objects for TS optimization tasks.""" + cores = default_job_settings.get('job_cpu_cores', 8) + species_dict_payload = species.as_dict() + tasks = [] + for i, xyz in enumerate(xyzs): + tasks.append(TaskSpec( + task_id=f'{label}_ts_opt_{i}', + task_family='ts_opt', + owner_type='species', + owner_key=label, + input_fingerprint=f'{label}_ts_opt_{i}', + engine=job_adapter, + level=level_dict, + required_cores=cores, + required_memory_mb=memory_mb, + input_payload={ + 'species_dicts': [species_dict_payload], + 'xyz': xyz, + 'conformer': i, + }, + ingestion_metadata={'conformer_index': i}, + )) + return tasks + + +def build_rotor_scan_1d_tasks(species, label: str, rotor_indices: List[int], + level_dict: dict, job_adapter: str, + memory_mb: int) -> List[TaskSpec]: + """Build TaskSpec objects for 1D rotor scan tasks.""" + cores = default_job_settings.get('job_cpu_cores', 8) + species_dict_payload = species.as_dict() + tasks = [] + for ri in rotor_indices: + rotor = species.rotors_dict[ri] + torsions = rotor['torsion'] + if isinstance(torsions[0], int): + torsions = 
[torsions] + tasks.append(TaskSpec( + task_id=f'{label}_scan_r{ri}', + task_family='rotor_scan_1d', + owner_type='species', + owner_key=label, + input_fingerprint=f'{label}_scan_r{ri}', + engine=job_adapter, + level=level_dict, + required_cores=cores, + required_memory_mb=memory_mb, + input_payload={ + 'species_dicts': [species_dict_payload], + 'torsions': torsions, + 'rotor_index': ri, + }, + ingestion_metadata={'rotor_index': ri}, + )) + return tasks diff --git a/arc/job/pipe/pipe_run_test.py b/arc/job/pipe/pipe_run_test.py new file mode 100644 index 0000000000..4f93a1726d --- /dev/null +++ b/arc/job/pipe/pipe_run_test.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +This module contains unit tests for the arc.job.pipe_run module +""" + +import json +import os +import shutil +import tempfile +import time +import unittest + +from arc.job.adapters.mockter import MockAdapter +from arc.job.pipe.pipe_state import TaskState, PipeRunState, TaskSpec, read_task_state, update_task_state +from arc.job.pipe.pipe_run import PipeRun +from arc.level import Level +from arc.species import ARCSpecies + + +def _make_spec(task_id, label='H2O', smiles='O', task_family='conf_opt', + engine='mockter', level=None): + """Helper to create a TaskSpec for testing.""" + spc = ARCSpecies(label=label, smiles=smiles) + return TaskSpec( + task_id=task_id, + task_family=task_family, + owner_type='species', + owner_key=label, + input_fingerprint=f'{task_id}_fp', + engine=engine, + level=level or {'method': 'mock', 'basis': 'mock'}, + required_cores=1, + required_memory_mb=512, + input_payload={'species_dicts': [spc.as_dict()]}, + ingestion_metadata={'conformer_index': 0}, + ) + + +class TestAdapterPipeRejection(unittest.TestCase): + + def test_execute_pipe_raises_value_error(self): + job = MockAdapter( + execution_type='incore', job_type='sp', + level=Level(method='mock', basis='mock'), + project='test', + project_directory=os.path.join(tempfile.gettempdir(), 
'pipe_reject_test'), + species=[ARCSpecies(label='H2O', smiles='O')], + testing=True) + job.execution_type = 'pipe' + with self.assertRaises(ValueError): + job.execute() + + +class TestPipeRunStaging(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_run_stage_') + self.tasks = [_make_spec(f'task_{i}') for i in range(3)] + self.run = PipeRun( + project_directory=self.tmpdir, run_id='test_001', + tasks=self.tasks, cluster_software='slurm', max_attempts=3) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_stage_creates_directory_tree(self): + self.run.stage() + for task in self.tasks: + task_dir = os.path.join(self.run.pipe_root, 'tasks', task.task_id) + self.assertTrue(os.path.isfile(os.path.join(task_dir, 'spec.json'))) + self.assertTrue(os.path.isfile(os.path.join(task_dir, 'state.json'))) + + def test_stage_sets_status(self): + self.run.stage() + self.assertEqual(self.run.status, PipeRunState.STAGED) + + def test_run_json_written(self): + self.run.stage() + run_path = os.path.join(self.run.pipe_root, 'run.json') + self.assertTrue(os.path.isfile(run_path)) + with open(run_path) as f: + data = json.load(f) + self.assertEqual(data['run_id'], 'test_001') + self.assertEqual(data['status'], 'STAGED') + + def test_run_json_has_rich_metadata(self): + """run.json includes homogeneous task_family, engine, level, and timestamps.""" + self.run.stage() + with open(os.path.join(self.run.pipe_root, 'run.json')) as f: + data = json.load(f) + self.assertEqual(data['task_family'], 'conf_opt') + self.assertEqual(data['engine'], 'mockter') + self.assertEqual(data['level'], {'method': 'mock', 'basis': 'mock'}) + self.assertIsNotNone(data['created_at']) + self.assertIsNone(data['submitted_at']) + self.assertIsNone(data['scheduler_job_id']) + + +class TestPipeRunFromDir(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_run_fromdir_') + self.tasks = [_make_spec(f'task_{i}') for i 
in range(2)] + self.run = PipeRun( + project_directory=self.tmpdir, run_id='restore_test', + tasks=self.tasks, cluster_software='pbs', + max_workers=50, max_attempts=5) + self.run.stage() + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_from_dir_reconstructs(self): + restored = PipeRun.from_dir(self.run.pipe_root) + self.assertEqual(restored.run_id, 'restore_test') + self.assertEqual(restored.cluster_software, 'pbs') + self.assertEqual(restored.max_workers, 50) + self.assertEqual(restored.status, PipeRunState.STAGED) + self.assertEqual(len(restored.tasks), 2) + + def test_from_dir_rich_metadata(self): + restored = PipeRun.from_dir(self.run.pipe_root) + self.assertIsNotNone(restored.created_at) + self.assertIsNone(restored.scheduler_job_id) + + +class TestPipeRunWriteSubmitScript(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_submit_script_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def _make_run(self, cluster_software, max_workers=10, n_tasks=None): + n = n_tasks if n_tasks is not None else max_workers + tasks = [_make_spec(f't_{i}') for i in range(n)] + run = PipeRun(project_directory=self.tmpdir, run_id='sub_test', + tasks=tasks, cluster_software=cluster_software, + max_workers=max_workers) + run.stage() + return run + + def test_slurm_content(self): + run = self._make_run('slurm', max_workers=25, n_tasks=25) + path = run.write_submit_script() + self.assertTrue(os.path.isfile(path)) + with open(path) as f: + content = f.read() + self.assertIn('#!/bin/bash -l', content) + self.assertIn('#SBATCH --array=1-25', content) + self.assertIn('WORKER_ID=$SLURM_ARRAY_TASK_ID', content) + self.assertIn('-m arc.scripts.pipe_worker', content) + + def test_pbs_content(self): + run = self._make_run('pbs', max_workers=8, n_tasks=8) + path = run.write_submit_script() + with open(path) as f: + content = f.read() + self.assertIn('#PBS -t 1-8', content) + 
self.assertIn('WORKER_ID=$PBS_ARRAYID', content) + + def test_htcondor_content(self): + run = self._make_run('htcondor', max_workers=12, n_tasks=12) + path = run.write_submit_script() + self.assertEqual(os.path.basename(path), 'submit.sub') + with open(path) as f: + content = f.read() + self.assertIn('queue 12', content) + + def test_overwrite_is_safe(self): + run = self._make_run('slurm') + p1 = run.write_submit_script() + p2 = run.write_submit_script() + self.assertEqual(p1, p2) + + def test_unsupported_raises(self): + run = self._make_run('mystery') + with self.assertRaises(NotImplementedError): + run.write_submit_script() + + def test_shell_script_is_executable(self): + """Shell submit scripts (slurm/pbs/sge) have executable permissions.""" + import stat + run = self._make_run('slurm') + path = run.write_submit_script() + mode = os.stat(path).st_mode + self.assertTrue(mode & stat.S_IXUSR, 'slurm script should be user-executable') + + def test_htcondor_sub_not_executable(self): + """HTCondor .sub files should not have executable bit set.""" + import stat + run = self._make_run('htcondor') + path = run.write_submit_script() + mode = os.stat(path).st_mode + self.assertFalse(mode & stat.S_IXUSR, '.sub should not be executable') + + +class TestPipeRunReconcile(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_run_reconcile_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def _complete_task(self, pipe_root, task_id): + now = time.time() + update_task_state(pipe_root, task_id, new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe_root, task_id, new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe_root, task_id, new_status=TaskState.COMPLETED, ended_at=now) + + def test_orphan_retry_clears_claim_token(self): + """Retry via reconcile clears claim_token.""" + run = PipeRun(project_directory=self.tmpdir, 
run_id='orphan', + tasks=[_make_spec('t')], cluster_software='slurm') + run.stage() + now = time.time() + update_task_state(run.pipe_root, 't', new_status=TaskState.CLAIMED, + claimed_by='dead', claim_token='old_token', + claimed_at=now - 200, lease_expires_at=now - 10) + run.reconcile() + state = read_task_state(run.pipe_root, 't') + self.assertEqual(state.status, 'PENDING') + self.assertIsNone(state.claim_token) + + def test_all_completed(self): + tasks = [_make_spec(f'task_{i}') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='done', + tasks=tasks, cluster_software='slurm') + run.stage() + for t in tasks: + self._complete_task(run.pipe_root, t.task_id) + run.reconcile() + self.assertEqual(run.status, PipeRunState.COMPLETED) + self.assertIsNotNone(run.completed_at) + with open(os.path.join(run.pipe_root, 'run.json')) as f: + self.assertIsNotNone(json.load(f).get('completed_at')) + + def test_retryable_budget_exhausted(self): + run = PipeRun(project_directory=self.tmpdir, run_id='exhausted', + tasks=[_make_spec('t')], cluster_software='slurm', max_attempts=1) + run.stage() + now = time.time() + update_task_state(run.pipe_root, 't', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', claimed_at=now, lease_expires_at=now + 300) + update_task_state(run.pipe_root, 't', new_status=TaskState.RUNNING, started_at=now) + update_task_state(run.pipe_root, 't', new_status=TaskState.FAILED_RETRYABLE, + ended_at=now + 5, failure_class='timeout') + run.reconcile() + state = read_task_state(run.pipe_root, 't') + self.assertEqual(state.status, 'FAILED_TERMINAL') + + def test_terminal_run_not_regressed(self): + tasks = [_make_spec(f'task_{i}') for i in range(2)] + run = PipeRun(project_directory=self.tmpdir, run_id='terminal', + tasks=tasks, cluster_software='slurm') + run.stage() + for t in tasks: + self._complete_task(run.pipe_root, t.task_id) + run.reconcile() + self.assertEqual(run.status, PipeRunState.COMPLETED) + run.reconcile() + 
self.assertEqual(run.status, PipeRunState.COMPLETED) + + +class TestPipeRunHomogeneity(unittest.TestCase): + """Tests for PipeRun homogeneity validation.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_homo_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_mixed_families_rejected(self): + """Mixing conf_opt and conf_sp in one run is rejected.""" + tasks = [_make_spec('t1', task_family='conf_opt'), + _make_spec('t2', task_family='conf_sp')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_mixed_engines_rejected(self): + tasks = [_make_spec('t1', engine='mockter'), + _make_spec('t2', engine='gaussian')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed_eng', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_homogeneous_conf_sp_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='conf_sp') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='sp_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + with open(os.path.join(run.pipe_root, 'run.json')) as f: + data = json.load(f) + self.assertEqual(data['task_family'], 'conf_sp') + + def test_from_dir_reconstructs_conf_sp(self): + """from_dir reconstructs conf_sp tasks correctly.""" + tasks = [_make_spec(f't_{i}', task_family='conf_sp') for i in range(2)] + run = PipeRun(project_directory=self.tmpdir, run_id='sp_restore', + tasks=tasks, cluster_software='slurm') + run.stage() + restored = PipeRun.from_dir(run.pipe_root) + self.assertEqual(len(restored.tasks), 2) + self.assertEqual(restored.tasks[0].task_family, 'conf_sp') + + def test_mixed_ts_and_conformer_rejected(self): + """Mixing ts_opt and conf_opt in one run is rejected.""" + tasks = [_make_spec('t1', task_family='conf_opt'), + 
_make_spec('t2', task_family='ts_opt')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed_ts_conf', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_mixed_ts_families_rejected(self): + """Mixing ts_guess_batch_method and ts_opt in one run is rejected.""" + tasks = [_make_spec('t1', task_family='ts_guess_batch_method'), + _make_spec('t2', task_family='ts_opt')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed_ts', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_homogeneous_ts_opt_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='ts_opt') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='ts_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + with open(os.path.join(run.pipe_root, 'run.json')) as f: + self.assertEqual(json.load(f)['task_family'], 'ts_opt') + + def test_from_dir_reconstructs_ts_opt(self): + tasks = [_make_spec(f't_{i}', task_family='ts_opt') for i in range(2)] + run = PipeRun(project_directory=self.tmpdir, run_id='ts_restore', + tasks=tasks, cluster_software='slurm') + run.stage() + restored = PipeRun.from_dir(run.pipe_root) + self.assertEqual(len(restored.tasks), 2) + self.assertEqual(restored.tasks[0].task_family, 'ts_opt') + + def test_homogeneous_species_sp_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='species_sp') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='sp_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + + def test_homogeneous_species_freq_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='species_freq') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='freq_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + + 
def test_homogeneous_irc_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='irc') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='irc_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + + def test_mixed_sp_and_freq_rejected(self): + tasks = [_make_spec('t1', task_family='species_sp'), + _make_spec('t2', task_family='species_freq')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed_leaf', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_from_dir_reconstructs_species_sp(self): + tasks = [_make_spec(f't_{i}', task_family='species_sp') for i in range(2)] + run = PipeRun(project_directory=self.tmpdir, run_id='sp_restore', + tasks=tasks, cluster_software='slurm') + run.stage() + restored = PipeRun.from_dir(run.pipe_root) + self.assertEqual(len(restored.tasks), 2) + self.assertEqual(restored.tasks[0].task_family, 'species_sp') + + def test_homogeneous_rotor_scan_1d_accepted(self): + tasks = [_make_spec(f't_{i}', task_family='rotor_scan_1d') for i in range(3)] + run = PipeRun(project_directory=self.tmpdir, run_id='scan_ok', + tasks=tasks, cluster_software='slurm') + run.stage() + self.assertEqual(run.status, PipeRunState.STAGED) + + def test_mixed_scan_and_conformer_rejected(self): + tasks = [_make_spec('t1', task_family='rotor_scan_1d'), + _make_spec('t2', task_family='conf_opt')] + run = PipeRun(project_directory=self.tmpdir, run_id='mixed_scan', + tasks=tasks, cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + def test_from_dir_reconstructs_rotor_scan_1d(self): + tasks = [_make_spec(f't_{i}', task_family='rotor_scan_1d') for i in range(2)] + run = PipeRun(project_directory=self.tmpdir, run_id='scan_restore', + tasks=tasks, cluster_software='slurm') + run.stage() + restored = PipeRun.from_dir(run.pipe_root) + self.assertEqual(len(restored.tasks), 2) + 
# (mangled patch residue — tail of the previous test module, preserved verbatim:)
#         self.assertEqual(restored.tasks[0].task_family, 'rotor_scan_1d')
#
# if __name__ == '__main__':
#     unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
#
# --- patch metadata: diff --git a/arc/job/pipe/pipe_state.py (new file,
#     mode 100644, index 0000000000..0021504f29) ---

"""
A module for pipe-mode task state management.

Defines the state machines, data models, and filesystem I/O utilities for
orchestrating subjobs within a single SLURM/PBS/HTCondor array allocation.
All task metadata is persisted as JSON files under a structured directory
tree, with file-level locking for safe concurrent access from multiple
worker processes.

Directory layout for a task::

    <pipe_root>/
        tasks/
            <task_id>/
                spec.json            # immutable task specification
                state.json           # mutable state record (locked on update)
                state.json.lock      # lock file for state.json
                attempts/
                    <attempt_index>/
                        result.json  # worker-written result metadata
                        calcs/       # preserved adapter output tree
                        worker.log   # per-attempt log
"""

import fcntl
import json
import os
import time
import uuid
from enum import Enum
from typing import Dict, Optional, Tuple, Union


class TaskState(str, Enum):
    """States for an individual task within a pipe run."""
    PENDING = 'PENDING'
    CLAIMED = 'CLAIMED'
    RUNNING = 'RUNNING'
    COMPLETED = 'COMPLETED'
    FAILED_RETRYABLE = 'FAILED_RETRYABLE'
    FAILED_TERMINAL = 'FAILED_TERMINAL'
    ORPHANED = 'ORPHANED'
    CANCELLED = 'CANCELLED'


class PipeRunState(str, Enum):
    """States for the overall pipe run."""
    CREATED = 'CREATED'
    STAGED = 'STAGED'
    SUBMITTED = 'SUBMITTED'
    ACTIVE = 'ACTIVE'
    RECONCILING = 'RECONCILING'
    COMPLETED = 'COMPLETED'
    COMPLETED_PARTIAL = 'COMPLETED_PARTIAL'
    FAILED = 'FAILED'


# Task families currently supported by the pipe system.
# Only families listed here pass TaskSpec validation.
SUPPORTED_TASK_FAMILIES = (
    'conf_opt', 'conf_sp',
    'ts_guess_batch_method', 'ts_opt',
    'species_sp', 'species_freq', 'irc',
    'rotor_scan_1d',
)

# Owner types mapping to ARC object categories.
SUPPORTED_OWNER_TYPES = ('species', 'reaction')

# Mapping from task_family to the adapter-facing job_type.
# Kept explicit so that task_family is not blindly used as job_type.
TASK_FAMILY_TO_JOB_TYPE = {
    'conf_opt': 'conf_opt',
    'conf_sp': 'conf_sp',
    'ts_guess_batch_method': 'tsg',
    'ts_opt': 'opt',
    'species_sp': 'sp',
    'species_freq': 'freq',
    'irc': 'irc',
    'rotor_scan_1d': 'scan',
}


# Allowed transitions: maps each state to the set of states it may transition to.
TASK_TRANSITIONS: Dict[TaskState, Tuple[TaskState, ...]] = {
    TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED),
    TaskState.CLAIMED: (TaskState.RUNNING, TaskState.ORPHANED, TaskState.CANCELLED),
    TaskState.RUNNING: (TaskState.COMPLETED, TaskState.FAILED_RETRYABLE,
                        TaskState.FAILED_TERMINAL, TaskState.ORPHANED, TaskState.CANCELLED),
    TaskState.COMPLETED: (),
    TaskState.FAILED_RETRYABLE: (TaskState.PENDING, TaskState.FAILED_TERMINAL),
    TaskState.FAILED_TERMINAL: (),
    TaskState.ORPHANED: (TaskState.PENDING, TaskState.FAILED_TERMINAL),
    TaskState.CANCELLED: (),
}

PIPE_RUN_TRANSITIONS: Dict[PipeRunState, Tuple[PipeRunState, ...]] = {
    PipeRunState.CREATED: (PipeRunState.STAGED, PipeRunState.FAILED),
    PipeRunState.STAGED: (PipeRunState.SUBMITTED, PipeRunState.FAILED),
    PipeRunState.SUBMITTED: (PipeRunState.ACTIVE, PipeRunState.FAILED),
    PipeRunState.ACTIVE: (PipeRunState.RECONCILING, PipeRunState.FAILED),
    PipeRunState.RECONCILING: (PipeRunState.COMPLETED, PipeRunState.COMPLETED_PARTIAL, PipeRunState.FAILED),
    PipeRunState.COMPLETED: (),
    PipeRunState.COMPLETED_PARTIAL: (),
    PipeRunState.FAILED: (),
}


def check_valid_transition(current_state: Union[TaskState, PipeRunState],
                           new_state: Union[TaskState, PipeRunState],
                           ) -> None:
    """
    Validate that a state transition is allowed.

    Args:
        current_state: The current state.
        new_state: The proposed new state.

    Raises:
        ValueError: If the transition is not allowed.
        TypeError: If the two states are not of the same enum type.
    """
    if type(current_state) is not type(new_state):
        raise TypeError(f'Cannot transition between different state types: '
                        f'{type(current_state).__name__} -> {type(new_state).__name__}')
    if isinstance(current_state, TaskState):
        allowed = TASK_TRANSITIONS
    elif isinstance(current_state, PipeRunState):
        allowed = PIPE_RUN_TRANSITIONS
    else:
        raise TypeError(f'Unsupported state type: {type(current_state).__name__}')
    if new_state not in allowed[current_state]:
        raise ValueError(f'Invalid state transition: {current_state.value} -> {new_state.value}')


def _validate_task_spec(spec: 'TaskSpec') -> None:
    """
    Validate required fields on a TaskSpec.

    Raises:
        ValueError: If any required field is missing or invalid.
    """
    if not spec.task_family:
        raise ValueError('TaskSpec.task_family is required')
    if spec.task_family not in SUPPORTED_TASK_FAMILIES:
        raise ValueError(f'TaskSpec.task_family must be one of {SUPPORTED_TASK_FAMILIES}, '
                         f'got {spec.task_family!r}')
    if not spec.owner_type:
        raise ValueError('TaskSpec.owner_type is required')
    if spec.owner_type not in SUPPORTED_OWNER_TYPES:
        raise ValueError(f'TaskSpec.owner_type must be one of {SUPPORTED_OWNER_TYPES}, '
                         f'got {spec.owner_type!r}')
    if not spec.owner_key:
        raise ValueError('TaskSpec.owner_key is required')
    if spec.level is None:
        raise ValueError('TaskSpec.level is required')
    if spec.input_payload is None:
        raise ValueError('TaskSpec.input_payload is required')
    if spec.ingestion_metadata is None:
        raise ValueError('TaskSpec.ingestion_metadata is required')


class TaskSpec:
    """
    Immutable specification for a single pipe task.

    Written once to ``spec.json`` and never modified.

    Args:
        task_id (str): Unique identifier for this task.
        task_family (str): Pipe task family (e.g. ``'conf_opt'``, ``'conf_sp'``).
        owner_type (str): Owner kind — ``'species'`` or ``'reaction'``.
        owner_key (str): Stable key identifying the owning ARC object.
        input_fingerprint (str): Hash or fingerprint of the input for deduplication.
        engine (str): Computational engine (e.g. ``'gaussian'``, ``'orca'``).
        level (dict): Level-of-theory payload (``Level.as_dict()`` output).
        required_cores (int): Number of CPU cores required.
        required_memory_mb (int): Memory requirement in MB.
        input_payload (dict): Task-family-specific execution inputs.
        ingestion_metadata (dict): Task-family-specific data for reattaching results.
        args (dict, optional): Legacy/extra arguments.
    """

    def __init__(self,
                 task_id: str,
                 task_family: str,
                 owner_type: str,
                 owner_key: str,
                 input_fingerprint: str,
                 engine: str,
                 level: dict,
                 required_cores: int,
                 required_memory_mb: int,
                 input_payload: dict,
                 ingestion_metadata: dict,
                 args: Optional[dict] = None,
                 ):
        self.task_id = task_id
        self.task_family = task_family
        self.owner_type = owner_type
        self.owner_key = owner_key
        self.input_fingerprint = input_fingerprint
        self.engine = engine
        self.level = level
        self.required_cores = required_cores
        self.required_memory_mb = required_memory_mb
        self.input_payload = input_payload
        self.ingestion_metadata = ingestion_metadata
        self.args = args or {}
        _validate_task_spec(self)

    def as_dict(self) -> dict:
        """Return a JSON-serializable dictionary."""
        return {
            'task_id': self.task_id,
            'task_family': self.task_family,
            'owner_type': self.owner_type,
            'owner_key': self.owner_key,
            'input_fingerprint': self.input_fingerprint,
            'engine': self.engine,
            'level': self.level,
            'required_cores': self.required_cores,
            'required_memory_mb': self.required_memory_mb,
            'input_payload': self.input_payload,
            'ingestion_metadata': self.ingestion_metadata,
            'args': self.args,
        }

    @classmethod
    def from_dict(cls, d: dict) -> 'TaskSpec':
        """
        Reconstruct a TaskSpec from a dictionary.

        Bypasses validation so that specs already persisted on disk can be
        read back even if the supported-families list has changed.

        Contract:
            - **Producers** (``build_*_tasks`` helpers) must construct valid specs
              through ``__init__``, which enforces validation.
            - **Deserializers** (this method) read persisted specs leniently so
              that evolving family definitions don't break restart.
            - **Execution/routing** paths (worker dispatch, ingestion) must still
              fail safely if a task_family is unsupported at runtime.
        """
        obj = object.__new__(cls)
        obj.task_id = d['task_id']
        obj.task_family = d['task_family']
        obj.owner_type = d['owner_type']
        obj.owner_key = d['owner_key']
        obj.input_fingerprint = d['input_fingerprint']
        obj.engine = d['engine']
        obj.level = d['level']
        obj.required_cores = d['required_cores']
        obj.required_memory_mb = d['required_memory_mb']
        obj.input_payload = d['input_payload']
        obj.ingestion_metadata = d['ingestion_metadata']
        obj.args = d.get('args', {})
        return obj


class TaskStateRecord:
    """
    Mutable state record for a single pipe task.

    Persisted in ``state.json`` and updated under a file lock.

    Args:
        status (str): Current task state (a TaskState value).
        attempt_index (int): Current attempt number (0-indexed).
        max_attempts (int): Maximum allowed attempts before terminal failure.
        claimed_by (str, optional): Worker identifier that claimed this task.
        claim_token (str, optional): Unique token proving claim ownership.
        claimed_at (float, optional): Timestamp (epoch seconds) when claimed.
        lease_expires_at (float, optional): Timestamp when the lease expires.
        started_at (float, optional): Timestamp when execution started.
        ended_at (float, optional): Timestamp when execution ended.
        failure_class (str, optional): Classification of the failure (e.g. 'oom', 'timeout', 'ess_error').
        retry_disposition (str, optional): How the retry was decided (e.g. 'auto', 'manual').
    """

    def __init__(self,
                 status: str = TaskState.PENDING.value,
                 attempt_index: int = 0,
                 max_attempts: int = 3,
                 claimed_by: Optional[str] = None,
                 claim_token: Optional[str] = None,
                 claimed_at: Optional[float] = None,
                 lease_expires_at: Optional[float] = None,
                 started_at: Optional[float] = None,
                 ended_at: Optional[float] = None,
                 failure_class: Optional[str] = None,
                 retry_disposition: Optional[str] = None,
                 ):
        self.status = status
        self.attempt_index = attempt_index
        self.max_attempts = max_attempts
        self.claimed_by = claimed_by
        self.claim_token = claim_token
        self.claimed_at = claimed_at
        self.lease_expires_at = lease_expires_at
        self.started_at = started_at
        self.ended_at = ended_at
        self.failure_class = failure_class
        self.retry_disposition = retry_disposition

    def as_dict(self) -> dict:
        """Return a JSON-serializable dictionary."""
        return {
            'status': self.status,
            'attempt_index': self.attempt_index,
            'max_attempts': self.max_attempts,
            'claimed_by': self.claimed_by,
            'claim_token': self.claim_token,
            'claimed_at': self.claimed_at,
            'lease_expires_at': self.lease_expires_at,
            'started_at': self.started_at,
            'ended_at': self.ended_at,
            'failure_class': self.failure_class,
            'retry_disposition': self.retry_disposition,
        }

    @classmethod
    def from_dict(cls, d: dict) -> 'TaskStateRecord':
        """Reconstruct a TaskStateRecord from a dictionary."""
        return cls(
            status=d['status'],
            attempt_index=d['attempt_index'],
            max_attempts=d['max_attempts'],
            claimed_by=d.get('claimed_by'),
            claim_token=d.get('claim_token'),
            claimed_at=d.get('claimed_at'),
            lease_expires_at=d.get('lease_expires_at'),
            started_at=d.get('started_at'),
            ended_at=d.get('ended_at'),
            failure_class=d.get('failure_class'),
            retry_disposition=d.get('retry_disposition'),
        )


# Names of the fields update_task_state() may modify via keyword arguments.
# Hoisted to module level so each update does not build a throwaway record.
_MUTABLE_STATE_FIELDS = frozenset(TaskStateRecord().__dict__) - {'status'}


def generate_claim_token() -> str:
    """Generate a unique claim token for ownership verification."""
    return uuid.uuid4().hex[:16]


# ---------------------------------------------------------------------------
# Directory & I/O Utilities
# ---------------------------------------------------------------------------

def get_task_dir(pipe_root: str, task_id: str) -> str:
    """
    Get the directory path for a task.

    Args:
        pipe_root (str): Root directory of the pipe run.
        task_id (str): The task identifier.

    Returns:
        str: Absolute path to the task directory.
    """
    return os.path.join(pipe_root, 'tasks', task_id)


def get_task_attempt_dir(pipe_root: str, task_id: str, attempt_index: int) -> str:
    """
    Get the working directory for a specific attempt of a task.

    Args:
        pipe_root (str): Root directory of the pipe run.
        task_id (str): The task identifier.
        attempt_index (int): The 0-indexed attempt number.

    Returns:
        str: Absolute path to the attempt directory.
    """
    return os.path.join(pipe_root, 'tasks', task_id, 'attempts', str(attempt_index))


def _dump_json_atomic(path: str, payload: dict) -> None:
    """Write JSON atomically: dump to a sibling temp file, then rename over ``path``."""
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, path)


def initialize_task(pipe_root: str, spec: TaskSpec, max_attempts: int = 3,
                    overwrite: bool = False) -> str:
    """
    Create the directory structure and initial files for a new task.

    Args:
        pipe_root: Root directory of the pipe run.
        spec: The task specification.
        max_attempts: Maximum retry attempts.
        overwrite: If False, raise FileExistsError if the task already exists.

    Returns:
        str: Path to the created task directory.
    """
    task_dir = get_task_dir(pipe_root, spec.task_id)
    spec_path = os.path.join(task_dir, 'spec.json')
    state_path = os.path.join(task_dir, 'state.json')
    if not overwrite and (os.path.isfile(spec_path) or os.path.isfile(state_path)):
        raise FileExistsError(f'Task {spec.task_id} already initialized at {task_dir}')
    os.makedirs(os.path.join(task_dir, 'attempts'), exist_ok=True)
    # Atomic writes so a concurrent reader never observes a partial JSON file,
    # consistent with write_result_json() and update_task_state().
    _dump_json_atomic(spec_path, spec.as_dict())
    state = TaskStateRecord(max_attempts=max_attempts)
    _dump_json_atomic(state_path, state.as_dict())
    return task_dir


def read_task_spec(pipe_root: str, task_id: str) -> TaskSpec:
    """
    Read the immutable task specification from disk.

    Args:
        pipe_root (str): Root directory of the pipe run.
        task_id (str): The task identifier.

    Returns:
        TaskSpec: The deserialized task specification.
    """
    spec_path = os.path.join(get_task_dir(pipe_root, task_id), 'spec.json')
    with open(spec_path, 'r') as f:
        return TaskSpec.from_dict(json.load(f))


def read_task_state(pipe_root: str, task_id: str) -> TaskStateRecord:
    """
    Read the current task state from disk.

    Args:
        pipe_root (str): Root directory of the pipe run.
        task_id (str): The task identifier.

    Returns:
        TaskStateRecord: The deserialized task state.
    """
    state_path = os.path.join(get_task_dir(pipe_root, task_id), 'state.json')
    with open(state_path, 'r') as f:
        return TaskStateRecord.from_dict(json.load(f))


def write_result_json(attempt_dir: str, result: dict) -> str:
    """Write a ``result.json`` file in the attempt directory. Returns the path."""
    result_path = os.path.join(attempt_dir, 'result.json')
    _dump_json_atomic(result_path, result)
    return result_path


def _validate_state_invariants(state: TaskStateRecord) -> None:
    """Validate lightweight invariants on a TaskStateRecord before persisting."""
    if state.attempt_index < 0:
        raise ValueError(f'attempt_index must be >= 0, got {state.attempt_index}')
    if state.max_attempts < 1:
        raise ValueError(f'max_attempts must be >= 1, got {state.max_attempts}')
    status = TaskState(state.status)
    if status == TaskState.CLAIMED:
        if state.claimed_by is None:
            raise ValueError('Transition to CLAIMED requires claimed_by')
        if state.claim_token is None:
            raise ValueError('Transition to CLAIMED requires claim_token')
        if state.claimed_at is None:
            raise ValueError('Transition to CLAIMED requires claimed_at')
        if state.lease_expires_at is None:
            raise ValueError('Transition to CLAIMED requires lease_expires_at')
    if status == TaskState.RUNNING:
        if state.started_at is None:
            raise ValueError('Transition to RUNNING requires started_at')
    if status in (TaskState.COMPLETED, TaskState.FAILED_TERMINAL, TaskState.CANCELLED):
        if state.ended_at is None:
            raise ValueError(f'Transition to {status.value} requires ended_at')
    if state.lease_expires_at is not None and state.claimed_at is not None:
        if state.lease_expires_at < state.claimed_at:
            raise ValueError(f'lease_expires_at ({state.lease_expires_at}) '
                             f'must be >= claimed_at ({state.claimed_at})')


def update_task_state(pipe_root: str,
                      task_id: str,
                      new_status: Optional[TaskState] = None,
                      lock_timeout: float = 30.0,
                      **fields,
                      ) -> TaskStateRecord:
    """
    Atomically update a task's state record under a file lock.

    Acquires an exclusive lock on ``state.json.lock``, reads the current state,
    validates any status transition and field invariants, applies updates, and
    writes the result atomically (write to temp file, then rename).

    Args:
        pipe_root (str): Root directory of the pipe run.
        task_id (str): The task identifier.
        new_status (TaskState, optional): If provided, transition to this status
            (validated against allowed transitions).
        lock_timeout (float): Maximum seconds to wait for the lock.
        **fields: Additional fields to update on the TaskStateRecord
            (e.g., ``claimed_by='worker-3'``, ``lease_expires_at=1234567890.0``).

    Returns:
        TaskStateRecord: The updated state record.

    Raises:
        ValueError: If the state transition or field invariants are invalid.
        TimeoutError: If the lock cannot be acquired within ``lock_timeout``.
    """
    task_dir = get_task_dir(pipe_root, task_id)
    state_path = os.path.join(task_dir, 'state.json')
    lock_path = state_path + '.lock'
    lock_fd = open(lock_path, 'w')
    locked = False
    try:
        _acquire_lock(lock_fd, lock_timeout)
        locked = True
        with open(state_path, 'r') as f:
            state = TaskStateRecord.from_dict(json.load(f))
        if new_status is not None:
            current = TaskState(state.status)
            check_valid_transition(current, new_status)
            state.status = new_status.value
        for key, value in fields.items():
            if key not in _MUTABLE_STATE_FIELDS:
                raise ValueError(f'Unknown TaskStateRecord field: {key}')
            setattr(state, key, value)
        _validate_state_invariants(state)
        _dump_json_atomic(state_path, state.as_dict())
        return state
    finally:
        # Only release a lock that was actually acquired; _acquire_lock may
        # have timed out before the flock succeeded.
        if locked:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        lock_fd.close()


def _acquire_lock(lock_fd, timeout: float) -> None:
    """
    Acquire an exclusive file lock with a timeout.

    Args:
        lock_fd: Open file descriptor for the lock file.
        timeout (float): Maximum seconds to wait.

    Raises:
        TimeoutError: If the lock is not acquired within the timeout.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return
        # Only the lock being held elsewhere (EAGAIN/EWOULDBLOCK -> BlockingIOError)
        # warrants a retry; any other OSError (e.g. EBADF) is a real failure and
        # must propagate instead of spinning until the timeout.
        except BlockingIOError:
            if time.monotonic() >= deadline:
                raise TimeoutError(f'Could not acquire lock within {timeout}s')
            time.sleep(0.10)


# --- file boundary (patch): diff --git a/arc/job/pipe/pipe_state_test.py
#     (new file, index 0000000000..ff89e0aec7). Its header follows so that the
#     test classes completed further down remain attached to their imports. ---

# This module contains unit tests for the arc.job.pipe_state module.

import shutil
import tempfile
import threading
import unittest

# from arc.job.pipe.pipe_state import (
#     TaskState, PipeRunState, TASK_TRANSITIONS, SUPPORTED_TASK_FAMILIES,
#     TASK_FAMILY_TO_JOB_TYPE, check_valid_transition, TaskSpec,
#     TaskStateRecord, generate_claim_token, initialize_task,
#     read_task_state, update_task_state, write_result_json,
# )  # (import list preserved from the patch; names are defined above)


def _make_spec(task_id='t1', task_family='conf_opt', **overrides):
    """Build a valid TaskSpec for tests, with per-test field overrides."""
    defaults = dict(
        task_id=task_id,
        task_family=task_family,
        owner_type='species',
        owner_key='H2O',
        input_fingerprint='fp',
        engine='gaussian',
        level={'method': 'b3lyp', 'basis': '6-31g'},
        required_cores=4,
        required_memory_mb=2048,
        input_payload={'species_dicts': [{'label': 'H2O'}]},
        ingestion_metadata={'conformer_index': 0},
    )
    defaults.update(overrides)
    return TaskSpec(**defaults)


class TestTaskTransitions(unittest.TestCase):

    def test_all_valid_task_transitions(self):
        for src, targets in TASK_TRANSITIONS.items():
            if targets:  # terminal states have empty tuples
                for tgt in targets:
                    check_valid_transition(src, tgt)

    def test_no_self_transitions(self):
        for state in list(TaskState):
            with self.assertRaises(ValueError):
                check_valid_transition(state, state)
    # (class continues past this span with test_cross_type_raises)
test_cross_type_raises(self): + with self.assertRaises(TypeError): + check_valid_transition(TaskState.PENDING, PipeRunState.CREATED) + + +class TestTaskSpec(unittest.TestCase): + + def test_conf_opt_roundtrip(self): + spec = _make_spec(task_family='conf_opt') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'conf_opt') + self.assertEqual(spec2.owner_key, 'H2O') + + def test_conf_sp_roundtrip(self): + spec = _make_spec(task_family='conf_sp') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'conf_sp') + + def test_ts_guess_batch_method_roundtrip(self): + spec = _make_spec(task_family='ts_guess_batch_method') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'ts_guess_batch_method') + self.assertEqual(spec2.owner_type, 'species') + + def test_ts_opt_roundtrip(self): + spec = _make_spec(task_family='ts_opt') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'ts_opt') + + def test_species_sp_roundtrip(self): + spec = _make_spec(task_family='species_sp') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'species_sp') + + def test_species_freq_roundtrip(self): + spec = _make_spec(task_family='species_freq') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'species_freq') + + def test_irc_roundtrip(self): + spec = _make_spec(task_family='irc') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'irc') + + def test_rotor_scan_1d_roundtrip(self): + spec = _make_spec(task_family='rotor_scan_1d') + d = spec.as_dict() + spec2 = TaskSpec.from_dict(json.loads(json.dumps(d))) + self.assertEqual(spec2.task_family, 'rotor_scan_1d') + + 
def test_supported_families(self): + for fam in ('conf_opt', 'conf_sp', 'ts_guess_batch_method', 'ts_opt', + 'species_sp', 'species_freq', 'irc', 'rotor_scan_1d'): + self.assertIn(fam, SUPPORTED_TASK_FAMILIES) + + def test_family_to_job_type_mapping(self): + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['conf_opt'], 'conf_opt') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['conf_sp'], 'conf_sp') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['ts_guess_batch_method'], 'tsg') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['ts_opt'], 'opt') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['species_sp'], 'sp') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['species_freq'], 'freq') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['irc'], 'irc') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['rotor_scan_1d'], 'scan') + + def test_validation_unsupported_family(self): + with self.assertRaises(ValueError): + _make_spec(task_family='scan') + + def test_validation_missing_task_family(self): + with self.assertRaises(ValueError): + _make_spec(task_family='') + + def test_validation_bad_owner_type(self): + with self.assertRaises(ValueError): + _make_spec(owner_type='molecule') + + def test_validation_missing_owner_key(self): + with self.assertRaises(ValueError): + _make_spec(owner_key='') + + def test_validation_missing_level(self): + with self.assertRaises(ValueError): + _make_spec(level=None) + + def test_validation_missing_input_payload(self): + with self.assertRaises(ValueError): + _make_spec(input_payload=None) + + def test_validation_missing_ingestion_metadata(self): + with self.assertRaises(ValueError): + _make_spec(ingestion_metadata=None) + + +class TestTaskStateRecord(unittest.TestCase): + + def test_claim_token_roundtrip(self): + rec = TaskStateRecord(claim_token='abc123') + d = rec.as_dict() + rec2 = TaskStateRecord.from_dict(d) + self.assertEqual(rec2.claim_token, 'abc123') + + +class TestGenerateClaimToken(unittest.TestCase): + + def test_tokens_are_unique(self): + tokens = {generate_claim_token() 
for _ in range(100)} + self.assertEqual(len(tokens), 100) + + +class TestInitializeTask(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_test_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_creates_spec_and_state(self): + spec = _make_spec(task_id='t1') + task_dir = initialize_task(self.tmpdir, spec) + self.assertTrue(os.path.isfile(os.path.join(task_dir, 'spec.json'))) + self.assertTrue(os.path.isfile(os.path.join(task_dir, 'state.json'))) + + def test_duplicate_raises(self): + spec = _make_spec(task_id='dup') + initialize_task(self.tmpdir, spec) + with self.assertRaises(FileExistsError): + initialize_task(self.tmpdir, spec) + + def test_overwrite_allowed(self): + spec = _make_spec(task_id='dup') + initialize_task(self.tmpdir, spec, max_attempts=3) + initialize_task(self.tmpdir, spec, max_attempts=5, overwrite=True) + state = read_task_state(self.tmpdir, 'dup') + self.assertEqual(state.max_attempts, 5) + + +class TestWriteResultJson(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_result_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_writes_and_reads(self): + result = {'task_id': 't1', 'status': 'COMPLETED'} + path = write_result_json(self.tmpdir, result) + with open(path) as f: + self.assertEqual(json.load(f)['task_id'], 't1') + + +class TestUpdateTaskState(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_test_') + initialize_task(self.tmpdir, _make_spec(task_id='t')) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_full_lifecycle(self): + now = time.time() + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + update_task_state(self.tmpdir, 't', new_status=TaskState.RUNNING, started_at=now) + update_task_state(self.tmpdir, 't', 
new_status=TaskState.COMPLETED, ended_at=now + 10) + self.assertEqual(read_task_state(self.tmpdir, 't').status, 'COMPLETED') + + def test_claimed_missing_fields(self): + with self.assertRaises(ValueError): + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_at=time.time(), lease_expires_at=time.time() + 300) + + def test_claimed_missing_claim_token(self): + now = time.time() + with self.assertRaises(ValueError): + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_by='w', claimed_at=now, lease_expires_at=now + 300) + + def test_running_missing_started_at(self): + now = time.time() + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + with self.assertRaises(ValueError): + update_task_state(self.tmpdir, 't', new_status=TaskState.RUNNING) + + def test_completed_missing_ended_at(self): + now = time.time() + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + update_task_state(self.tmpdir, 't', new_status=TaskState.RUNNING, started_at=now) + with self.assertRaises(ValueError): + update_task_state(self.tmpdir, 't', new_status=TaskState.COMPLETED) + + def test_concurrent_claims(self): + results, errors = [], [] + def claim(wid): + try: + update_task_state(self.tmpdir, 't', new_status=TaskState.CLAIMED, + claimed_by=f'w-{wid}', claim_token=generate_claim_token(), + claimed_at=time.time(), lease_expires_at=time.time() + 300) + results.append(wid) + except ValueError: + errors.append(wid) + threads = [threading.Thread(target=claim, args=(i,)) for i in range(5)] + for t in threads: + t.start() + for t in threads: + t.join() + self.assertEqual(len(results), 1) + self.assertEqual(len(errors), 4) + + +if __name__ == '__main__': + unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) From 
# --- patch metadata: commit cab5b28b91320680ebef904417424ec5d596fc48,
#     Alon Grinberg Dana, Fri 3 Apr 2026 — "Updated the pipe submission script"
#     (arc/settings/submit.py: the legacy server-keyed, HDF5-based 'local'
#     template was removed and replaced by the scheduler-keyed templates below) ---

# Submission scripts for pipe_worker array jobs, keyed by cluster scheduler type.
# These are server-independent templates. PipeRun.write_submit_script() formats
# them with: name, max_task_num, pipe_root, python_exe, cpus, memory.
# Legacy note: this dict was previously keyed by server name and used for the
# old HDF5-based pipe.py design. It is now keyed by cluster scheduler type.
pipe_submit = {
    # SLURM: one array element per worker; %a expands to the array task id.
    'slurm': """#!/bin/bash -l
#SBATCH -J {name}
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --mem={memory}
#SBATCH --array=1-{max_task_num}
#SBATCH -o {pipe_root}/out_%a.txt
#SBATCH -e {pipe_root}/err_%a.txt

WORKER_ID=$SLURM_ARRAY_TASK_ID

{python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID
""",
    # PBS (Torque-style arrays via -t / $PBS_ARRAYID).
    'pbs': """#!/bin/bash -l
#PBS -N {name}
#PBS -l ncpus={cpus}
#PBS -l mem={memory}mb
#PBS -t 1-{max_task_num}
#PBS -o {pipe_root}/out_$PBS_ARRAYID.txt
#PBS -e {pipe_root}/err_$PBS_ARRAYID.txt

WORKER_ID=$PBS_ARRAYID

{python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID
""",
    # SGE / OGE array jobs via -t / $SGE_TASK_ID.
    'sge': """#!/bin/bash -l
#$ -N {name}
#$ -pe smp {cpus}
#$ -l h_vmem={memory}M
#$ -t 1-{max_task_num}
#$ -o {pipe_root}/out_$SGE_TASK_ID.txt
#$ -e {pipe_root}/err_$SGE_TASK_ID.txt

WORKER_ID=$SGE_TASK_ID

{python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID
""",
    # HTCondor: $(Process) is 0-based, unlike the 1-based schedulers above.
    'htcondor': """executable = {python_exe}
arguments = -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $(Process)
request_cpus = {cpus}
request_memory = {memory}
output = {pipe_root}/out_$(Process).txt
error = {pipe_root}/err_$(Process).txt
log = {pipe_root}/condor.log
queue {max_task_num}
""",
}

# --- patch metadata: commit f8d102bb16b552424aa51ccb53bc30886f4cbfd5,
#     "Added pipe_settings" (arc/settings/settings.py hunk; surrounding context:
#     'job_max_server_node_memory_allocation': 0.95 closes the preceding dict) ---

# Pipe mode settings: distributed HPC execution via job arrays.
# These can be overridden in ~/.arc/settings.py.
pipe_settings = {
    'enabled': True,            # Set to False to disable pipe mode entirely.
    'min_tasks': 10,            # Minimum batch size to trigger pipe mode.
    'max_workers': 100,         # Upper bound on array worker slots per PipeRun.
    'max_attempts': 3,          # Retry budget per task before terminal failure.
    'lease_duration_s': 86400,  # Worker lease duration in seconds (default 24h).
}

# Criteria for identification of imaginary frequencies for transition states.
# An imaginary frequency is valid if it is between the following range (in cm-1): LOWEST_MAJOR_TS_FREQ, HIGHEST_MAJOR_TS_FREQ = 75.0, 10000.0 From 24ac9dbed7152be7c8d64a4116aacc5bde664cc7 Mon Sep 17 00:00:00 2001 From: Alon Grinberg Dana Date: Fri, 3 Apr 2026 15:23:56 +0300 Subject: [PATCH 13/60] Added the pipe_worker script --- arc/scripts/__init__.py | 2 +- arc/scripts/pipe_worker.py | 353 ++++++++++++++++++++++++++++++++ arc/scripts/pipe_worker_test.py | 334 ++++++++++++++++++++++++++++++ 3 files changed, 688 insertions(+), 1 deletion(-) create mode 100644 arc/scripts/pipe_worker.py create mode 100644 arc/scripts/pipe_worker_test.py diff --git a/arc/scripts/__init__.py b/arc/scripts/__init__.py index 641b47e6cf..6ccc8ac496 100644 --- a/arc/scripts/__init__.py +++ b/arc/scripts/__init__.py @@ -1 +1 @@ -import common +from arc.scripts import common diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py new file mode 100644 index 0000000000..2ece334c6b --- /dev/null +++ b/arc/scripts/pipe_worker.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +""" +Pipe-mode worker script. + +A lightweight consumer that runs inside a single slot of a SLURM/PBS/OGE/HTCondor +job array. It scans the task directory for claimable work, executes tasks +using an ARC job adapter in ``incore`` mode, and records the outcome. +The worker loops until no more PENDING tasks are available. 
+ +Usage:: + + python -m arc.scripts.pipe_worker --pipe_root /path/to/pipe_run --worker_id 7 +""" + +import argparse +import logging +import os +import shutil +import tempfile +import time +from typing import Optional + +from arc.imports import settings +from arc.job.factory import job_factory +from arc.job.pipe.pipe_state import ( + TASK_FAMILY_TO_JOB_TYPE, + TaskState, + TaskSpec, + TaskStateRecord, + generate_claim_token, + get_task_attempt_dir, + read_task_spec, + read_task_state, + update_task_state, + write_result_json, +) +from arc.level import Level +from arc.reaction import ARCReaction +from arc.species import ARCSpecies + +pipe_settings, output_filenames = settings['pipe_settings'], settings.get('output_filenames', {}) + + +logger = logging.getLogger('pipe_worker') + + +def setup_logging(log_path: str) -> None: + """Configure logging. Safe to call multiple times.""" + os.makedirs(os.path.dirname(log_path), exist_ok=True) + for h in list(logger.handlers): + h.close() + logger.removeHandler(h) + handler = logging.FileHandler(log_path) + handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) + logger.addHandler(handler) + stderr_handler = logging.StreamHandler() + stderr_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) + logger.addHandler(stderr_handler) + logger.setLevel(logging.INFO) + + +def claim_task(pipe_root: str, worker_id: str): + """ + Scan for a PENDING task and attempt to claim it. + Returns ``(task_id, TaskStateRecord, claim_token)`` on success, + or ``(None, None, None)`` if no tasks are available. 
+ """ + tasks_dir = os.path.join(pipe_root, 'tasks') + if not os.path.isdir(tasks_dir): + return None, None, None + for task_id in sorted(os.listdir(tasks_dir)): + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(pipe_root, task_id) + current_status = TaskState(state.status) + except (FileNotFoundError, ValueError, KeyError): + continue # Skip tasks with unreadable or corrupted state. + if current_status != TaskState.PENDING: + continue + try: + now = time.time() + token = generate_claim_token() + updated = update_task_state(pipe_root, task_id, + new_status=TaskState.CLAIMED, + claimed_by=worker_id, + claim_token=token, + claimed_at=now, + lease_expires_at=now + pipe_settings.get('lease_duration_s', 86400)) + logger.info(f'Claimed task {task_id}') + return task_id, updated, token + except (ValueError, TimeoutError): + continue + return None, None, None + + +def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, + worker_id: str, claim_token: str) -> None: + """ + Execute a claimed task: transition to RUNNING, dispatch by task_family, + copy outputs, write result.json, and mark COMPLETED or FAILED. 
+ """ + attempt_dir = get_task_attempt_dir(pipe_root, task_id, state.attempt_index) + os.makedirs(attempt_dir, exist_ok=True) + setup_logging(os.path.join(attempt_dir, 'worker.log')) + + started_at = time.time() + try: + update_task_state(pipe_root, task_id, new_status=TaskState.RUNNING, started_at=started_at) + except (ValueError, TimeoutError) as e: + logger.warning(f'Task {task_id}: could not transition to RUNNING ({e}), skipping.') + return + + spec = read_task_spec(pipe_root, task_id) + scratch_dir = tempfile.mkdtemp(prefix=f'pipe_{task_id}_') + result = _make_result_template(task_id, state.attempt_index, started_at) + try: + _dispatch_execution(spec, scratch_dir) + _copy_outputs(scratch_dir, attempt_dir) + ended_at = time.time() + result['ended_at'] = ended_at + result['status'] = 'COMPLETED' + result['canonical_output_path'] = _find_canonical_output(attempt_dir, spec.engine) + write_result_json(attempt_dir, result) + if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): + return + try: + update_task_state(pipe_root, task_id, new_status=TaskState.COMPLETED, ended_at=ended_at) + except (ValueError, TimeoutError) as e: + logger.warning(f'Task {task_id}: could not mark COMPLETED ({e}). 
' + f'Task may have been orphaned concurrently.') + return + logger.info(f'Task {task_id} completed successfully') + except Exception as e: + failure_class = type(e).__name__ + ended_at = time.time() + logger.error(f'Task {task_id} failed: {failure_class}: {e}') + _copy_outputs(scratch_dir, attempt_dir) + result['ended_at'] = ended_at + result['status'] = 'FAILED' + result['failure_class'] = failure_class + write_result_json(attempt_dir, result) + if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): + return + try: + current_state = read_task_state(pipe_root, task_id) + target = TaskState.FAILED_RETRYABLE if current_state.attempt_index + 1 < current_state.max_attempts \ + else TaskState.FAILED_TERMINAL + update_task_state(pipe_root, task_id, new_status=target, + ended_at=ended_at, failure_class=failure_class) + except (ValueError, TimeoutError) as e: + logger.warning(f'Task {task_id}: could not mark failed ({e}). ' + f'Task may have been orphaned concurrently.') + finally: + shutil.rmtree(scratch_dir, ignore_errors=True) + + +def _make_result_template(task_id: str, attempt_index: int, started_at: float) -> dict: + return { + 'task_id': task_id, + 'attempt_index': attempt_index, + 'started_at': started_at, + 'ended_at': None, + 'status': None, + 'canonical_output_path': None, + 'exit_code': None, + 'failure_class': None, + 'parser_summary': None, + 'result_fields': {}, + } + + +# --------------------------------------------------------------------------- +# Task-family execution dispatch +# --------------------------------------------------------------------------- + +def _get_family_extra_kwargs(spec: TaskSpec) -> dict: + """ + Extract family-specific kwargs needed by the adapter beyond the base job_type. + + The adapter-facing job_type comes from TASK_FAMILY_TO_JOB_TYPE (the central + mapping in pipe_state.py). This helper supplies only the extra parameters + that certain families need (e.g. irc_direction, torsions, rotor_index). 
+ """ + kwargs = {} + payload = spec.input_payload or {} + meta = spec.ingestion_metadata or {} + + if spec.task_family == 'irc': + irc_direction = meta.get('irc_direction') + if irc_direction: + kwargs['irc_direction'] = irc_direction + elif spec.task_family == 'rotor_scan_1d': + torsions = payload.get('torsions') + rotor_index = payload.get('rotor_index') + if torsions is not None: + kwargs['torsions'] = torsions + if rotor_index is not None: + kwargs['rotor_index'] = rotor_index + + return kwargs + + +def _dispatch_execution(spec: TaskSpec, scratch_dir: str) -> None: + """ + Dispatch execution by task_family. + + The adapter-facing job_type is derived from the central + ``TASK_FAMILY_TO_JOB_TYPE`` mapping in ``pipe_state.py``. + Family-specific extra kwargs (e.g. irc_direction, torsions) + are extracted by ``_get_family_extra_kwargs``. + """ + job_type = TASK_FAMILY_TO_JOB_TYPE.get(spec.task_family) + if job_type is None: + raise ValueError(f'Unsupported task_family for execution: {spec.task_family}') + extra = _get_family_extra_kwargs(spec) + _run_adapter(spec, scratch_dir, job_type=job_type, **extra) + + +def _run_adapter(spec: TaskSpec, scratch_dir: str, job_type: str, **extra_kwargs) -> None: + """ + Reconstruct ARC objects and run the adapter incore with the given job_type. + + Args: + spec: The task specification. + scratch_dir: Temporary working directory for the adapter. + job_type: The adapter-facing job type (e.g. 'sp', 'freq', 'irc'). + **extra_kwargs: Additional keyword arguments passed to job_factory + (e.g. ``irc_direction`` for IRC jobs). 
+ """ + species_list = None + reactions_list = None + payload = spec.input_payload or {} + species_dicts = payload.get('species_dicts') + reactions_dicts = payload.get('reactions_dicts') + if species_dicts: + species_list = [ARCSpecies(species_dict=_fix_int_keys(d)) for d in species_dicts] + if reactions_dicts: + reactions_list = [ARCReaction(reaction_dict=_fix_int_keys(d)) for d in reactions_dicts] + level_info = spec.level + if not level_info: + raise ValueError(f'Task {spec.task_id}: missing level information') + level = Level(repr=level_info) + # Pass per-task xyz and conformer/tsg index from input_payload so + # each task operates on its specific geometry, not the species default. + xyz = payload.get('xyz') + conformer = payload.get('conformer') + tsg = payload.get('tsg') + job = job_factory( + job_adapter=spec.engine, + execution_type='incore', + project='pipe_run', + project_directory=scratch_dir, + job_type=job_type, + level=level, + species=species_list, + reactions=reactions_list, + xyz=xyz, + conformer=conformer, + tsg=tsg, + testing=False, + **extra_kwargs, + ) + job.execute() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _verify_ownership(pipe_root: str, task_id: str, + worker_id: str, claim_token: str) -> bool: + """ + Verify this worker still owns the task. + + Checks claimed_by, claim_token, AND that the current status is still + RUNNING or CLAIMED (not ORPHANED by the coordinator due to lease expiry). 
+ """ + try: + current = read_task_state(pipe_root, task_id) + except (FileNotFoundError, ValueError, KeyError): + logger.warning(f'Task {task_id}: could not read state for ownership check') + return False + if current.claimed_by != worker_id or current.claim_token != claim_token: + logger.warning(f'Task {task_id}: ownership lost ' + f'(claimed_by={current.claimed_by}, token={current.claim_token}, ' + f'expected={worker_id}/{claim_token}). Not writing terminal state.') + return False + current_status = TaskState(current.status) + if current_status not in (TaskState.RUNNING, TaskState.CLAIMED): + logger.warning(f'Task {task_id}: status is {current_status.value} (expected RUNNING/CLAIMED). ' + f'Task may have been orphaned. Not writing terminal state.') + return False + return True + + +def _find_canonical_output(attempt_dir: str, engine: str) -> Optional[str]: + """Try to find the canonical output file path within the attempt calcs tree.""" + target = output_filenames.get(engine, 'output.out') + calcs_dir = os.path.join(attempt_dir, 'calcs') + if os.path.isdir(calcs_dir): + for root, dirs, files in os.walk(calcs_dir): + if target in files: + return os.path.join(root, target) + return None + + +def _fix_int_keys(obj): + """Recursively convert string dict keys that represent integers back to int.""" + if isinstance(obj, dict): + new = {} + for k, v in obj.items(): + try: + k = int(k) + except (ValueError, TypeError): + pass # Key is not numeric; keep it as a string. + new[k] = _fix_int_keys(v) + return new + elif isinstance(obj, list): + return [_fix_int_keys(x) for x in obj] + return obj + + +def _copy_outputs(src_dir: str, dst_dir: str) -> None: + calcs_dir = os.path.join(src_dir, 'calcs') + if not os.path.isdir(calcs_dir): + return + shutil.copytree(calcs_dir, os.path.join(dst_dir, 'calcs'), dirs_exist_ok=True) + + +def main(argv=None): + """Entry point. 
Loops claiming and executing PENDING tasks until none remain.""" + parser = argparse.ArgumentParser(description='Pipe-mode worker: claim and execute tasks.') + parser.add_argument('--pipe_root', required=True, help='Root directory of the pipe run.') + parser.add_argument('--worker_id', required=True, help='Worker identifier.') + args = parser.parse_args(argv) + + tasks_completed = 0 + while True: + task_id, state, token = claim_task(args.pipe_root, args.worker_id) + if task_id is None: + break + run_task(args.pipe_root, task_id, state, args.worker_id, token) + tasks_completed += 1 + + if tasks_completed == 0: + print('No claimable tasks found. Exiting.') + else: + print(f'Worker {args.worker_id} completed {tasks_completed} task(s). No more work remaining.') + + +if __name__ == '__main__': + main() diff --git a/arc/scripts/pipe_worker_test.py b/arc/scripts/pipe_worker_test.py new file mode 100644 index 0000000000..1f77fdc4bb --- /dev/null +++ b/arc/scripts/pipe_worker_test.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +This module contains unit tests for the arc.scripts.pipe_worker module +""" + +import json +import os +import shutil +import tempfile +import time +import unittest + +from arc.job.pipe.pipe_state import ( + TaskState, + TaskSpec, + generate_claim_token, + get_task_attempt_dir, + initialize_task, + read_task_state, + update_task_state, +) +from arc.scripts.pipe_worker import claim_task, run_task, main, logger as worker_logger +from arc.species import ARCSpecies + + +def _make_h2o_spec(task_id='sp_h2o', task_family='conf_opt'): + """Helper to create a TaskSpec for H2O using the mockter adapter.""" + spc = ARCSpecies(label='H2O', smiles='O') + return TaskSpec( + task_id=task_id, + task_family=task_family, + owner_type='species', + owner_key='H2O', + input_fingerprint=f'{task_id}_fp', + engine='mockter', + level={'method': 'mock', 'basis': 'mock'}, + required_cores=1, + required_memory_mb=512, + input_payload={'species_dicts': 
[spc.as_dict()]}, + ingestion_metadata={'conformer_index': 0}, + ) + + +class TestClaimTask(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_claim_test_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_claims_pending_task(self): + initialize_task(self.tmpdir, _make_h2o_spec('task_a')) + task_id, state, token = claim_task(self.tmpdir, 'worker-1') + self.assertEqual(task_id, 'task_a') + self.assertEqual(state.status, 'CLAIMED') + self.assertEqual(state.claimed_by, 'worker-1') + self.assertIsNotNone(token) + self.assertEqual(state.claim_token, token) + + def test_skips_completed_and_running(self): + initialize_task(self.tmpdir, _make_h2o_spec('task_01')) + now = time.time() + update_task_state(self.tmpdir, 'task_01', new_status=TaskState.CLAIMED, + claimed_by='w0', claim_token='t', claimed_at=now, lease_expires_at=now + 300) + update_task_state(self.tmpdir, 'task_01', new_status=TaskState.RUNNING, started_at=now) + update_task_state(self.tmpdir, 'task_01', new_status=TaskState.COMPLETED, ended_at=now) + + initialize_task(self.tmpdir, _make_h2o_spec('task_02')) + update_task_state(self.tmpdir, 'task_02', new_status=TaskState.CLAIMED, + claimed_by='w0', claim_token='t', claimed_at=now, lease_expires_at=now + 300) + update_task_state(self.tmpdir, 'task_02', new_status=TaskState.RUNNING, started_at=now) + + initialize_task(self.tmpdir, _make_h2o_spec('task_03')) + task_id, state, token = claim_task(self.tmpdir, 'worker-5') + self.assertEqual(task_id, 'task_03') + + def test_returns_none_when_no_tasks(self): + task_id, state, token = claim_task(self.tmpdir, 'worker-1') + self.assertIsNone(task_id) + self.assertIsNone(token) + + def test_ignores_orphaned_tasks(self): + initialize_task(self.tmpdir, _make_h2o_spec('task_orphan')) + now = time.time() + update_task_state(self.tmpdir, 'task_orphan', new_status=TaskState.CLAIMED, + claimed_by='dead', claim_token='t', claimed_at=now, lease_expires_at=now 
+ 300) + update_task_state(self.tmpdir, 'task_orphan', new_status=TaskState.ORPHANED) + task_id, state, token = claim_task(self.tmpdir, 'worker-rescue') + self.assertIsNone(task_id) + + +class TestRunTask(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_run_test_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def _claim(self, task_id, worker_id='test-worker'): + now = time.time() + token = generate_claim_token() + state = update_task_state( + self.tmpdir, task_id, new_status=TaskState.CLAIMED, + claimed_by=worker_id, claim_token=token, + claimed_at=now, lease_expires_at=now + 86400) + return state, token + + def test_successful_execution(self): + spec = _make_h2o_spec('sp_h2o') + initialize_task(self.tmpdir, spec) + state, token = self._claim('sp_h2o') + run_task(self.tmpdir, 'sp_h2o', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'sp_h2o') + self.assertEqual(final.status, 'COMPLETED') + + def test_result_json_written_on_success(self): + spec = _make_h2o_spec('sp_result') + initialize_task(self.tmpdir, spec) + state, token = self._claim('sp_result') + run_task(self.tmpdir, 'sp_result', state, 'test-worker', token) + attempt_dir = get_task_attempt_dir(self.tmpdir, 'sp_result', 0) + result_path = os.path.join(attempt_dir, 'result.json') + self.assertTrue(os.path.isfile(result_path)) + with open(result_path) as f: + result = json.load(f) + self.assertEqual(result['task_id'], 'sp_result') + self.assertEqual(result['status'], 'COMPLETED') + self.assertIsNotNone(result['started_at']) + self.assertIsNotNone(result['ended_at']) + for key in ('canonical_output_path', 'exit_code', 'failure_class', + 'parser_summary', 'result_fields'): + self.assertIn(key, result) + + def test_result_json_written_on_failure(self): + """A failing task still produces result.json with status=FAILED.""" + # Create a valid spec, then corrupt the task_family on disk to trigger failure. 
+ spec = _make_h2o_spec('bad_job') + initialize_task(self.tmpdir, spec) + spec_path = os.path.join(self.tmpdir, 'tasks', 'bad_job', 'spec.json') + with open(spec_path) as f: + data = json.load(f) + data['task_family'] = 'nonexistent_type' + with open(spec_path, 'w') as f: + json.dump(data, f) + state, token = self._claim('bad_job') + run_task(self.tmpdir, 'bad_job', state, 'test-worker', token) + attempt_dir = get_task_attempt_dir(self.tmpdir, 'bad_job', 0) + result_path = os.path.join(attempt_dir, 'result.json') + self.assertTrue(os.path.isfile(result_path)) + with open(result_path) as f: + result = json.load(f) + self.assertEqual(result['status'], 'FAILED') + self.assertIsNotNone(result['failure_class']) + + def test_output_preservation(self): + spec = _make_h2o_spec('sp_h2o_out') + initialize_task(self.tmpdir, spec) + state, token = self._claim('sp_h2o_out') + run_task(self.tmpdir, 'sp_h2o_out', state, 'test-worker', token) + attempt_dir = get_task_attempt_dir(self.tmpdir, 'sp_h2o_out', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs') + self.assertTrue(os.path.isdir(calcs_dir)) + + def test_ownership_with_token(self): + """If claim_token changes, worker does not overwrite terminal state.""" + spec = _make_h2o_spec('sp_stolen') + initialize_task(self.tmpdir, spec) + now = time.time() + token_a = generate_claim_token() + update_task_state(self.tmpdir, 'sp_stolen', new_status=TaskState.CLAIMED, + claimed_by='worker-A', claim_token=token_a, + claimed_at=now, lease_expires_at=now + 86400) + # Simulate reassignment + update_task_state(self.tmpdir, 'sp_stolen', new_status=TaskState.ORPHANED) + update_task_state(self.tmpdir, 'sp_stolen', new_status=TaskState.PENDING, + attempt_index=1, claimed_by=None, claim_token=None, + claimed_at=None, lease_expires_at=None, + started_at=None, ended_at=None, + failure_class=None, retry_disposition=None) + token_b = generate_claim_token() + update_task_state(self.tmpdir, 'sp_stolen', new_status=TaskState.CLAIMED, + 
claimed_by='worker-B', claim_token=token_b, + claimed_at=now + 1, lease_expires_at=now + 86401) + from arc.scripts.pipe_worker import _verify_ownership + self.assertFalse(_verify_ownership(self.tmpdir, 'sp_stolen', 'worker-A', token_a)) + self.assertTrue(_verify_ownership(self.tmpdir, 'sp_stolen', 'worker-B', token_b)) + + def test_scratch_cleanup(self): + spec = _make_h2o_spec('sp_clean') + initialize_task(self.tmpdir, spec) + state, token = self._claim('sp_clean') + run_task(self.tmpdir, 'sp_clean', state, 'test-worker', token) + import glob + leftover = glob.glob(os.path.join(tempfile.gettempdir(), 'pipe_sp_clean_*')) + self.assertEqual(len(leftover), 0) + + def test_conf_sp_dispatch(self): + """conf_sp task family dispatches correctly and produces result.json.""" + spec = _make_h2o_spec('conf_sp_task', task_family='conf_sp') + initialize_task(self.tmpdir, spec) + state, token = self._claim('conf_sp_task') + run_task(self.tmpdir, 'conf_sp_task', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'conf_sp_task') + self.assertEqual(final.status, 'COMPLETED') + attempt_dir = get_task_attempt_dir(self.tmpdir, 'conf_sp_task', 0) + result_path = os.path.join(attempt_dir, 'result.json') + self.assertTrue(os.path.isfile(result_path)) + with open(result_path) as f: + result = json.load(f) + self.assertEqual(result['status'], 'COMPLETED') + + def test_conf_opt_dispatch(self): + """conf_opt task family dispatches correctly.""" + spec = _make_h2o_spec('conf_opt_task', task_family='conf_opt') + initialize_task(self.tmpdir, spec) + state, token = self._claim('conf_opt_task') + run_task(self.tmpdir, 'conf_opt_task', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'conf_opt_task') + self.assertEqual(final.status, 'COMPLETED') + + def test_ts_opt_dispatch(self): + """ts_opt task family dispatches via opt job_type and produces result.json.""" + spec = _make_h2o_spec('ts_opt_task', task_family='ts_opt') + initialize_task(self.tmpdir, spec) + 
state, token = self._claim('ts_opt_task') + run_task(self.tmpdir, 'ts_opt_task', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'ts_opt_task') + self.assertEqual(final.status, 'COMPLETED') + attempt_dir = get_task_attempt_dir(self.tmpdir, 'ts_opt_task', 0) + self.assertTrue(os.path.isfile(os.path.join(attempt_dir, 'result.json'))) + + def test_ts_guess_batch_dispatch(self): + """ts_guess_batch_method dispatches via tsg job_type. May fail at adapter + level (mockter doesn't natively support tsg without reactions), but the + dispatch path itself should route correctly and write result.json.""" + spec = _make_h2o_spec('tsg_task', task_family='ts_guess_batch_method') + initialize_task(self.tmpdir, spec) + state, token = self._claim('tsg_task') + run_task(self.tmpdir, 'tsg_task', state, 'test-worker', token) + # The task should at least have written result.json (even on failure) + attempt_dir = get_task_attempt_dir(self.tmpdir, 'tsg_task', 0) + self.assertTrue(os.path.isfile(os.path.join(attempt_dir, 'result.json'))) + final = read_task_state(self.tmpdir, 'tsg_task') + # Either COMPLETED (if mockter handled it) or FAILED_* (if adapter rejected tsg) + self.assertIn(final.status, ('COMPLETED', 'FAILED_RETRYABLE', 'FAILED_TERMINAL')) + + def test_species_sp_dispatch(self): + """species_sp task family dispatches via sp job_type.""" + spec = _make_h2o_spec('sp_task', task_family='species_sp') + initialize_task(self.tmpdir, spec) + state, token = self._claim('sp_task') + run_task(self.tmpdir, 'sp_task', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'sp_task') + self.assertEqual(final.status, 'COMPLETED') + attempt_dir = get_task_attempt_dir(self.tmpdir, 'sp_task', 0) + self.assertTrue(os.path.isfile(os.path.join(attempt_dir, 'result.json'))) + + def test_species_freq_dispatch(self): + """species_freq task family dispatches via freq job_type.""" + spec = _make_h2o_spec('freq_task', task_family='species_freq') + 
initialize_task(self.tmpdir, spec) + state, token = self._claim('freq_task') + run_task(self.tmpdir, 'freq_task', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'freq_task') + self.assertEqual(final.status, 'COMPLETED') + + def test_irc_dispatch(self): + """irc task family dispatches via irc job_type.""" + spec = _make_h2o_spec('irc_task', task_family='irc') + initialize_task(self.tmpdir, spec) + state, token = self._claim('irc_task') + run_task(self.tmpdir, 'irc_task', state, 'test-worker', token) + # IRC may fail at adapter level (mockter may not handle irc natively), + # but the dispatch route should work and result.json should be written. + attempt_dir = get_task_attempt_dir(self.tmpdir, 'irc_task', 0) + self.assertTrue(os.path.isfile(os.path.join(attempt_dir, 'result.json'))) + + def test_rotor_scan_1d_dispatch(self): + """rotor_scan_1d task family dispatches via scan job_type and writes result.json.""" + spec = _make_h2o_spec('scan_task', task_family='rotor_scan_1d') + initialize_task(self.tmpdir, spec) + state, token = self._claim('scan_task') + run_task(self.tmpdir, 'scan_task', state, 'test-worker', token) + attempt_dir = get_task_attempt_dir(self.tmpdir, 'scan_task', 0) + self.assertTrue(os.path.isfile(os.path.join(attempt_dir, 'result.json'))) + + def test_unsupported_family_fails(self): + """An unsupported task_family causes FAILED_RETRYABLE.""" + spec = _make_h2o_spec('bad_family') + initialize_task(self.tmpdir, spec) + spec_path = os.path.join(self.tmpdir, 'tasks', 'bad_family', 'spec.json') + with open(spec_path) as f: + data = json.load(f) + data['task_family'] = 'unsupported_scan' + with open(spec_path, 'w') as f: + json.dump(data, f) + state, token = self._claim('bad_family') + run_task(self.tmpdir, 'bad_family', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'bad_family') + self.assertIn(final.status, ('FAILED_RETRYABLE', 'FAILED_TERMINAL')) + + +class TestWorkerLoop(unittest.TestCase): + + def 
setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_loop_test_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_main_processes_multiple_tasks(self): + for i in range(3): + initialize_task(self.tmpdir, _make_h2o_spec(f'task_{i}')) + main(['--pipe_root', self.tmpdir, '--worker_id', 'worker-loop']) + for i in range(3): + state = read_task_state(self.tmpdir, f'task_{i}') + self.assertEqual(state.status, 'COMPLETED') + + def test_main_no_tasks(self): + main(['--pipe_root', self.tmpdir, '--worker_id', 'worker-1']) + + def test_no_duplicate_log_handlers(self): + for i in range(3): + initialize_task(self.tmpdir, _make_h2o_spec(f'task_log_{i}')) + main(['--pipe_root', self.tmpdir, '--worker_id', 'worker-log']) + self.assertLessEqual(len(worker_logger.handlers), 2) + + +if __name__ == '__main__': + unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) From 9e7cab6ccc16a3e5c97b0a2dc2679313436c29cc Mon Sep 17 00:00:00 2001 From: Alon Grinberg Dana Date: Fri, 3 Apr 2026 15:24:34 +0300 Subject: [PATCH 14/60] Removed the previous pipe implementation --- arc/job/adapter.py | 435 +----------------------------- arc/job/adapter_test.py | 128 +-------- arc/job/adapters/common.py | 2 - arc/job/adapters/gaussian_test.py | 20 -- arc/job/adapters/psi_4.py | 1 - 5 files changed, 11 insertions(+), 575 deletions(-) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index 8c5d6a9cff..de8c747718 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -21,11 +21,10 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np -import pandas as pd from arc.common import ARC_PATH, get_logger, read_yaml_file, save_yaml_file, torsions_to_scans, convert_to_hours from arc.exceptions import JobError -from arc.imports import local_arc_path, pipe_submit, settings, submit_scripts +from arc.imports import local_arc_path, settings, submit_scripts from arc.job.local import (change_mode, check_job_status, delete_job, @@ -44,9 
+43,9 @@ logger = get_logger() -default_job_settings, servers, submit_filenames, t_max_format, input_filenames, output_filenames, workers_coeff = \ +default_job_settings, servers, submit_filenames, t_max_format, input_filenames, output_filenames = \ settings['default_job_settings'], settings['servers'], settings['submit_filenames'], settings['t_max_format'], \ - settings['input_filenames'], settings['output_filenames'], settings['workers_coeff'] + settings['input_filenames'], settings['output_filenames'] constraint_type_dict = {2: 'B', 3: 'A', 4: 'D'} @@ -135,108 +134,6 @@ class JobExecutionTypeEnum(str, Enum): pipe = 'pipe' -class DataPoint(object): - """ - A class for representing a data point dictionary (a single job) per species for the HDF5 file. - - Args: - job_types (List[str]): The job types to be executed in sequence. - label (str): The species label. - level (dict): The level of theory, a Level.dict() representation. - xyz_1 (dict): The cartesian coordinates to consider. - args (dict, str, optional): Methods (including troubleshooting) to be used in input files. - bath_gas (str, optional): A bath gas. Currently only used in OneDMin to calculate L-J parameters. - charge (int): The species (or TS) charge. - constraints (List[Tuple[List[int], float]], optional): Optimization constraint. - cpu_cores (int, optional): The total number of cpu cores requested for a job. - dihedrals (List[float], optional): The dihedral angels corresponding to self.torsions. - fine (bool, optional): Whether to use fine geometry optimization parameters. Default: ``False``. - irc_direction (str, optional): The direction of the IRC job (`forward` or `reverse`). - multiplicity (int): The species (or TS) multiplicity. - torsions (List[List[int]], optional): The 0-indexed atom indices of the torsion(s). - xyz_2 (dict, optional): Additional cartesian coordinates to consider in double-ended TS search methods. 
- """ - - def __init__(self, - job_types: List[str], - label: str, - level: dict, - xyz_1: dict, - args: Optional[Union[dict, str]] = None, - bath_gas: Optional[str] = None, - charge: int = 0, - constraints: Optional[List[Tuple[List[int], float]]] = None, - cpu_cores: Optional[str] = None, - dihedrals: Optional[List[float]] = None, - fine: bool = False, - irc_direction: Optional[str] = None, - multiplicity: int = 1, - torsions: Optional[List[List[int]]] = None, - xyz_2: Optional[dict] = None, - ): - self.job_types = job_types - self.label = label - self.level = level - self.xyz_1 = xyz_1 - - self.args = args - self.bath_gas = bath_gas - self.charge = charge - self.constraints = constraints - self.cpu_cores = cpu_cores - self.dihedrals = dihedrals - self.fine = fine - self.irc_direction = irc_direction - self.multiplicity = multiplicity - self.torsions = torsions - self.xyz_2 = xyz_2 - - self.status = 0 - - # initialize outputs - self.electronic_energy = None - self.error = None - self.frequencies = None - self.xyz_out = None - - def as_dict(self): - """ - A dictionary representation of the object, not storing default or trivial data. - - Returns: dict - The dictionary representation. 
- """ - result = {'job_types': self.job_types, - 'label': self.label, - 'level': self.level, - 'xyz_1': self.xyz_1, - 'status': self.status, - 'electronic_energy': self.electronic_energy, - 'error': self.error, - 'frequencies': self.frequencies, - 'xyz_out': self.xyz_out, - } - if self.args is not None: - result['args'] = self.args - if self.bath_gas is not None: - result['bath_gas'] = self.bath_gas - if self.charge != 0: - result['charge'] = self.charge - if self.constraints is not None: - result['constraints'] = self.constraints - if self.cpu_cores is not None: - result['cpu_cores'] = self.cpu_cores - if self.fine: - result['fine'] = self.fine - if self.irc_direction is not None: - result['irc_direction'] = self.irc_direction - if self.multiplicity != 1: - result['multiplicity'] = self.multiplicity - if self.xyz_2 is not None: - result['xyz_2'] = self.xyz_2 - return result - - class JobAdapter(ABC): """ An abstract class for job adapters. @@ -324,10 +221,9 @@ def execute(self): elif execution_type == JobExecutionTypeEnum.queue: self.execute_queue() elif execution_type == JobExecutionTypeEnum.pipe: - # Todo: - # - Check that the HDF5 file is available, else raise an error. - # - Submit ARC workers with the HDF5 file. - self.execute_queue() # This is temporary until pipe is fully functional. + raise ValueError('Pipe execution is handled at the Scheduler level. ' + 'JobAdapters inside a pipe must be executed by the worker ' + "with execution_type='incore'.") if not self.restarted: self._write_initiated_job_to_csv_file() @@ -367,120 +263,6 @@ def set_job_shell_file_to_upload(self) -> dict: change_mode(mode='+x', file_name=file_name, path=self.local_path) return self.get_file_property_dictionary(file_name=file_name, make_x=True) - def determine_job_array_parameters(self): - """ - Determine the number of processes to use in a job array - and whether to iterate by conformers, species, reactions, or scan constraints. - - Explaining "workers" vs. 
"processes": - A pipe job may have, e.g., 1000 individual processes to compute. - ARC will allocate, e.g., 8 workers, to simultaneously get processes (one by one) from the HDF5 bank - and execute them. On average, each worker in this example executes 125 jobs. - """ - if self.execution_type == 'incore' or self.run_multi_species: - return None - if len(self.job_types) > 1: - self.iterate_by.append('job_types') - - for job_type in self.job_types: - if self.species is not None: - if len(self.species) > 1: - self.iterate_by.append('species') - if job_type == 'conf_opt': - if self.species is not None and sum(len(species.conformers) for species in self.species) > 10: - self.iterate_by.append('conf_opt') - self.number_of_processes += sum([len(species.conformers) for species in self.species]) - for species in self.species: - if job_type in ['sp', 'opt', 'freq', 'optfreq', 'composite', 'ornitals', 'onedmin', 'irc']: - self.number_of_processes += 1 - # elif job_type == 'scan' and rotor_dict['directed_scan_type'] != 'ess': # Todo: implement directed scans - elif job_type == 'scan' and len(species.rotors_dict.keys()) > 1000: # Todo: Modify when pipe is implemented - self.iterate_by.append('scan') - scan_points_per_dimension = 360.0 / self.scan_res - for rotor_dict in species.rotors_dict.values(): - if rotor_dict['directed_scan_type'] == 'ess': - self.number_of_processes += 1 - elif 'cont_opt' in rotor_dict['directed_scan_type']: - # A single calculation per species for a continuous scan, either diagonal or not. 
- self.number_of_processes += 1 - elif 'brute_force' in rotor_dict['directed_scan_type']: - if 'diagonal' in rotor_dict['directed_scan_type']: - self.number_of_processes += scan_points_per_dimension - else: - self.number_of_processes += \ - sum([scan_points_per_dimension ** len(rotor_dict['scan'])]) - - elif self.reactions is not None: - if len(self.reactions) > 1: - self.iterate_by.append('reactions') - self.number_of_processes += len(self.reactions) - - if self.number_of_processes > self.incore_capacity: - self.execution_type = 'pipe' - self._determine_workers() - self.write_hdf5() - - def _determine_workers(self): - """ - Determine the number of workers to use in a pipe job. - """ - if self.workers is None: - if self.number_of_processes <= workers_coeff['max_one']: - self.workers = 1 - elif self.number_of_processes <= workers_coeff['max_two']: - self.workers = 2 - else: - self.workers = min(round(workers_coeff['A'] * self.number_of_processes ** workers_coeff['b']), - workers_coeff['cap']) - - def write_hdf5(self): - """ - Write the HDF5 data file for job arrays. - Each data point is a dictionary representation of the DataPoint class. - Note: Each data point will always run "incore". A job array is created once the pipe is submitted to the queue - (rather than running the pipe "incore", taking no advantage of the server's potential for parallelization). 
- """ - if self.iterate_by: - data = dict() - if 'reactions' in self.iterate_by: - for reaction in self.reactions: - data[reaction.index] = list() - data[reaction.index].append(DataPoint(charge=reaction.charge, - job_types=[self.job_type], - label=reaction.label, - level=self.level.as_dict(), - multiplicity=reaction.multiplicity, - xyz_1=reaction.get_reactants_xyz(), - xyz_2=reaction.get_products_xyz(), - constraints=self.constraints, - ).as_dict()) - else: - for species in self.species: - data[species.label] = list() - if 'conf_opt' in self.iterate_by: - for conformer in species.conformers: - data[species.label].append(DataPoint(charge=species.charge, - job_types=['opt'], - label=species.label, - level=self.level.as_dict(), - multiplicity=species.multiplicity, - xyz_1=conformer, - ).as_dict()) - elif 'scan' in self.iterate_by: - data[species.label].extend(self.generate_scan_points(species=species)) - elif 'species' in self.iterate_by: - data[species.label].append(DataPoint(charge=species.charge, - job_types=[self.job_type], - label=species.label, - level=self.level.as_dict(), - multiplicity=species.multiplicity, - xyz_1=species.get_xyz(), - constraints=self.constraints, - ).as_dict()) - - df = pd.json_normalize(data) - df.to_hdf(os.path.join(self.local_path, 'data.hdf5'), key='df', mode='w') - def write_submit_script(self) -> None: """ Write a submit script to execute the job. 
@@ -500,8 +282,7 @@ def write_submit_script(self) -> None: if default_queue and default_queue not in self.attempted_queues: self.attempted_queues.append(default_queue) - submit_script = submit_scripts[self.server][self.job_adapter] if self.workers is None \ - else pipe_submit[self.server] + submit_script = submit_scripts[self.server][self.job_adapter] queue = self.queue if self.queue is not None else default_queue @@ -527,14 +308,9 @@ def write_submit_script(self) -> None: try: submit_script = submit_script.format(**format_params) except KeyError: - if self.workers is None: - submit_scripts_for_printing = {server: [software for software in values.keys()] - for server, values in submit_scripts.items()} - pipe = '' - else: - submit_scripts_for_printing = {server for server, values in pipe_submit.keys()} - pipe = ' pipe' - logger.error(f'Could not find{pipe} submit script for server {self.server} and software {self.job_adapter}.' + submit_scripts_for_printing = {server: [software for software in values.keys()] + for server, values in submit_scripts.items()} + logger.error(f'Could not find submit script for server {self.server} and software {self.job_adapter}.' f'\nMake sure your submit scripts (under arc/job/submit.py) are updated with the servers ' f'and software defined in arc/settings.py\n' f'Alternatively, It is possible that you defined parameters in curly braces (e.g., {{PARAM}}) ' @@ -1172,197 +948,6 @@ def get_file_property_dictionary(self, 'make_x': make_x, } - def generate_scan_points(self, - species: 'ARCSpecies', - cont_only: bool = False, - ) -> List[DataPoint]: - """ - Generate all coordinates in advance for "brute force" (non-continuous) directed scans, - or the *next* coordinates for a continuous scan. - - Directed scan types could be one of the following: ``'brute_force_sp'``, ``'brute_force_opt'``, - ``'brute_force_sp_diagonal'``, ``'brute_force_opt_diagonal'``, ``'cont_opt'``, or ``'cont_opt_diagonal'``. 
- The differentiation between ``'sp'`` and ``'opt'`` is done in at the Job level. - - Args: - species (ARCSpecies): The species to consider. - cont_only (bool, optional): Whether to only return the next point in continuous scans. - - Raises: - ValueError: If the species directed scan type has an unexpected value. - - Returns: List[DataPoint] - Entries are DataPoint instances. - """ - data_list = list() - - if divmod(360, self.scan_res)[1]: - raise ValueError(f'Got an illegal scan resolution of {self.scan_res}.') - - for rotor_dict in species.rotors_dict.values(): - directed_scan_type = rotor_dict['directed_scan_type'] - if cont_only and 'cont' not in directed_scan_type: - # Visiting this method again for a cont scan should not re-trigger all other scans. - continue - - torsions = rotor_dict['torsion'] - if isinstance(torsions[0], int): - torsions = [torsions] - xyz = species.get_xyz(generate=True) - - if not ('cont' in directed_scan_type or 'brute' in directed_scan_type or 'ess' in directed_scan_type): - raise ValueError(f'directed_scan_type must be either continuous or brute force, got: {directed_scan_type}') - - if directed_scan_type == 'ess' and not rotor_dict['scan_path'] and rotor_dict['success'] is None: - # Allow the ESS to control the scan. - data_list.append(DataPoint(job_types=['scan'], - label=species.label, - level=self.level, - xyz_1=species.get_xyz(generate=True), - args=self.args, - charge=species.charge, - constraints=self.constraints, - cpu_cores=self.cpu_cores, - multiplicity=species.multiplicity, - torsions=torsions, - )) - - elif 'brute' in directed_scan_type: - # Spawn jobs all at once. 
- dihedrals = dict() - for torsion in torsions: - original_dihedral = calculate_dihedral_angle(coords=xyz['coords'], torsion=torsion, index=0) - dihedrals[tuple(torsion)] = [round(original_dihedral + i * self.scan_res - if original_dihedral + i * self.scan_res <= 180.0 - else original_dihedral + i * self.scan_res - 360.0, 2) - for i in range(int(360 / self.scan_res) + 1)] - modified_xyz = xyz.copy() - if 'diagonal' not in directed_scan_type: - # Increment dihedrals one by one (results in an ND scan). - all_dihedral_combinations = list(itertools.product(*[dihedrals[tuple(torsion)] for torsion in torsions])) - for dihedral_tuple in all_dihedral_combinations: - for torsion, dihedral in zip(torsions, dihedral_tuple): - species.set_dihedral(scan=torsions_to_scans(torsion), - deg_abs=dihedral, - count=False, - xyz=modified_xyz) - modified_xyz = species.initial_xyz - rotor_dict['number_of_running_jobs'] += 1 - data_list.append(DataPoint(job_types=['opt'] if 'opt' in directed_scan_type else ['sp'], - label=species.label, - level=self.level, - xyz_1=modified_xyz.copy(), - args=self.args, - charge=species.charge, - constraints=self.constraints, - cpu_cores=self.cpu_cores, - multiplicity=species.multiplicity, - )) - else: - # Increment all dihedrals at once (results in a 1D scan along simultaneously-changing dimensions). 
- for i in range(len(dihedrals[tuple(torsions[0])])): - for torsion in torsions: - dihedral = dihedrals[tuple(torsion)][i] - species.set_dihedral(scan=torsions_to_scans(torsion), - deg_abs=dihedral, - count=False, - xyz=modified_xyz) - modified_xyz = species.initial_xyz - directed_dihedrals = [dihedrals[tuple(torsion)][i] for torsion in torsions] - rotor_dict['number_of_running_jobs'] += 1 - data_list.append(DataPoint(job_types=['opt'] if 'opt' in directed_scan_type else ['sp'], - label=species.label, - level=self.level, - xyz_1=modified_xyz.copy(), - args=self.args, - charge=species.charge, - constraints=self.constraints, - cpu_cores=self.cpu_cores, - dihedrals=directed_dihedrals, - multiplicity=species.multiplicity, - )) - - elif 'cont' in directed_scan_type: - # Set up the next DataPoint only. - if not len(rotor_dict['cont_indices']): - rotor_dict['cont_indices'] = [0] * len(torsions) - if not len(rotor_dict['original_dihedrals']): - rotor_dict['original_dihedrals'] = \ - [f'{calculate_dihedral_angle(coords=xyz["coords"], torsion=scan, index=1):.2f}' - for scan in rotor_dict['scan']] # Store the dihedrals as strings for the YAML restart file. - torsions = rotor_dict['torsion'] - max_num = 360 / self.scan_res + 1 # Dihedral angles per scan - original_dihedrals = list() - for dihedral in rotor_dict['original_dihedrals']: - f_dihedral = float(dihedral) - original_dihedrals.append(f_dihedral if f_dihedral < 180.0 else f_dihedral - 360.0) - if not any(rotor_dict['cont_indices']): - # This is the first call to the cont_opt directed rotor, spawn the first job w/o changing dihedrals. 
- data_list.append(DataPoint(job_types=['opt'], - label=species.label, - level=self.level, - xyz_1=species.final_xyz, - args=self.args, - charge=species.charge, - constraints=self.constraints, - cpu_cores=self.cpu_cores, - dihedrals=original_dihedrals, - multiplicity=species.multiplicity, - )) - rotor_dict['cont_indices'][0] += 1 - continue - else: - # This is NOT the first call for this cont_opt directed rotor. - # Check whether this rotor is done. - if rotor_dict['cont_indices'][-1] == max_num - 1: # 0-indexed - # No more counters to increment, all done! - logger.info(f"Completed all jobs for the continuous directed rotor scan for species " - f"{species.label} between pivots {rotor_dict['pivots']}") - continue - - modified_xyz = xyz.copy() - dihedrals = list() - for index, (original_dihedral, torsion_) in enumerate(zip(original_dihedrals, torsions)): - dihedral = original_dihedral + rotor_dict['cont_indices'][index] * self.scan_res - # Change the original dihedral so we won't end up with two calcs for 180.0, but none for -180.0 - # (it only matters for plotting, the geometry is of course the same). - dihedral = dihedral if dihedral <= 180.0 else dihedral - 360.0 - dihedrals.append(dihedral) - # Only change the dihedrals in the xyz if this torsion corresponds to the current index, - # or if this is a cont diagonal scan. - # Species.set_dihedral() uses .final_xyz or the given xyz to modify the .initial_xyz - # attribute to the desired dihedral. - species.set_dihedral(scan=torsions_to_scans(torsion_), - deg_abs=dihedral, - count=False, - xyz=modified_xyz) - modified_xyz = species.initial_xyz - data_list.append(DataPoint(job_types=['opt'], - label=species.label, - level=self.level, - xyz_1=modified_xyz, - args=self.args, - charge=species.charge, - constraints=self.constraints, - cpu_cores=self.cpu_cores, - dihedrals=dihedrals, - multiplicity=species.multiplicity, - )) - - if 'diagonal' in directed_scan_type: - # Increment ALL counters for a diagonal scan. 
- rotor_dict['cont_indices'] = [rotor_dict['cont_indices'][0] + 1] * len(torsions) - else: - # Increment the counter sequentially for a non-diagonal scan. - for index in range(len(torsions)): - if rotor_dict['cont_indices'][index] < max_num - 1: - rotor_dict['cont_indices'][index] += 1 - break - elif rotor_dict['cont_indices'][index] == max_num - 1 and index < len(torsions) - 1: - rotor_dict['cont_indices'][index] = 0 - - return data_list - def troubleshoot_server(self): """ Troubleshoot server errors. diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 083b23288b..9657f9a62a 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -15,11 +15,9 @@ import unittest from unittest.mock import patch -import pandas as pd - from arc.common import ARC_TESTING_PATH from arc.imports import settings -from arc.job.adapter import DataPoint, JobAdapter, JobEnum, JobTypeEnum, JobExecutionTypeEnum +from arc.job.adapter import JobAdapter, JobEnum, JobTypeEnum, JobExecutionTypeEnum from arc.job.adapters.gaussian import GaussianAdapter from arc.level import Level from arc.species import ARCSpecies @@ -79,39 +77,6 @@ def test_job_execution_type_enum(self): JobExecutionTypeEnum('wrong') -class TestDataPoint(unittest.TestCase): - """ - Contains unit tests for the DataPoint class. 
- """ - - def test_as_dict(self): - """Test the dictionary representation of a DataPoint instance""" - xyz_1 = {'symbols': ('C', 'H', 'H', 'H', 'H'), - 'isotopes': (12, 1, 1, 1, 1), - 'coords': ((0.0, 0.0, 0.0), - (0.6300326, 0.6300326, 0.6300326), - (-0.6300326, -0.6300326, 0.6300326), - (-0.6300326, 0.6300326, -0.6300326), - (0.6300326, -0.6300326, -0.6300326))} - data_point = DataPoint(charge=0, - job_types=['opt'], - label='spc1', - level={'method': 'cbs-qb3'}, - multiplicity=1, - xyz_1=xyz_1, - ) - expected_dict = {'job_types': ['opt'], - 'label': 'spc1', - 'level': {'method': 'cbs-qb3'}, - 'xyz_1': xyz_1, - 'status': 0, - 'electronic_energy': None, - 'error': None, - 'frequencies': None, - 'xyz_out': None} - self.assertEqual(data_point.as_dict(), expected_dict) - - class TestJobAdapter(unittest.TestCase): """ Contains unit tests for the JobAdapter class. @@ -230,97 +195,6 @@ def setUpClass(cls): attempted_queues=['short_queue'] ) - def test_determine_job_array_parameters(self): - """Test determining job array parameters""" - self.assertEqual(self.job_1.iterate_by, ['species', 'conf_opt']) - self.assertEqual(self.job_1.number_of_processes, 3 * 6) - self.assertEqual(self.job_1.workers, 4) - - def test_determine_workers(self): - """Test determining the number of workers""" - self.job_2.number_of_processes, self.job_2.workers = 1, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 1) - - self.job_2.number_of_processes, self.job_2.workers = 2, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 1) - - self.job_2.number_of_processes, self.job_2.workers = 3, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 1) - - self.job_2.number_of_processes, self.job_2.workers = 4, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 2) - - self.job_2.number_of_processes, self.job_2.workers = 5, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 2) 
- - self.job_2.number_of_processes, self.job_2.workers = 9, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 2) - - self.job_2.number_of_processes, self.job_2.workers = 10, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 4) - - self.job_2.number_of_processes, self.job_2.workers = 100, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 6) - - self.job_2.number_of_processes, self.job_2.workers = 1000, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 11) - - self.job_2.number_of_processes, self.job_2.workers = 1e4, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 20) - - self.job_2.number_of_processes, self.job_2.workers = 1e5, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 36) - - self.job_2.number_of_processes, self.job_2.workers = 1e6, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 63) - - self.job_2.number_of_processes, self.job_2.workers = 1e7, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 100) - - self.job_2.number_of_processes, self.job_2.workers = 1e8, None - self.job_2._determine_workers() - self.assertEqual(self.job_2.workers, 100) - - def test_write_hdf5(self): - """Test writing the HDF5 file""" - with pd.HDFStore(os.path.join(self.job_1.local_path, 'data.hdf5')) as store: - data = store['df'].to_dict() - self.assertEqual([key for key in data.keys()], ['spc1', 'spc2', 'spc3']) - - def test_write_hdf5_for_directed_scans(self): - """Test writing the HDF5 file for directed scans""" - with pd.HDFStore(os.path.join(self.job_1.local_path, 'data.hdf5')) as store: - data = store['df'].to_dict() - self.assertEqual([key for key in data.keys()], ['spc1', 'spc2', 'spc3']) - - def test_write_array_submit_script(self): - """Test writing an array submit script""" - self.job_1.write_submit_script() - with 
open(os.path.join(self.job_1.local_path, submit_filenames[servers[self.job_1.server]['cluster_soft']]), - 'r') as f: - lines = f.readlines() - array, hdf5 = False, False - for line in lines: - if '#SBATCH --array=1-4' in line: - array = True - if 'job/scripts/pipe.py' in line and 'data.hdf5' in line: - hdf5 = True - self.assertTrue(array) - self.assertTrue(hdf5) - def test_write_queue_submit_script(self): """Test writing a queue submit script""" self.job_4.number_of_processes, self.job_4.workers = 1, None diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index 7ad8495713..0c585fa169 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -242,8 +242,6 @@ def _initialize_adapter(obj: 'JobAdapter', obj.set_file_paths() obj.set_cpu_and_mem() - if obj.execution_type != 'incore' and obj.job_adapter in obj.ess_settings.keys(): - obj.determine_job_array_parameters() # Set scan_res if required by trsh if obj.args and 'trsh' in obj.args.keys() and 'scan_res' in obj.args['trsh'].keys(): diff --git a/arc/job/adapters/gaussian_test.py b/arc/job/adapters/gaussian_test.py index b04f10d1c1..c81e8669b9 100644 --- a/arc/job/adapters/gaussian_test.py +++ b/arc/job/adapters/gaussian_test.py @@ -750,26 +750,6 @@ def test_set_files(self): self.assertEqual(self.job_3.files_to_upload, job_3_files_to_upload) self.assertEqual(self.job_3.files_to_download, job_3_files_to_download) - def test_set_files_for_pipe(self): - """Test setting files for a pipe job""" - job_2_files_to_upload = [{'file_name': 'submit.sub', - 'local': os.path.join(self.job_2.local_path, 'submit.sub'), - 'remote': os.path.join(self.job_2.remote_path, 'submit.sub'), - 'source': 'path', - 'make_x': False}, - {'file_name': 'data.hdf5', - 'local': os.path.join(self.job_2.local_path, 'data.hdf5'), - 'remote': os.path.join(self.job_2.remote_path, 'data.hdf5'), - 'source': 'path', - 'make_x': False}] - job_2_files_to_download = [{'file_name': 'data.hdf5', - 'local': 
os.path.join(self.job_2.local_path, 'data.hdf5'), - 'remote': os.path.join(self.job_2.remote_path, 'data.hdf5'), - 'source': 'path', - 'make_x': False}] - self.assertEqual(self.job_2.files_to_upload, job_2_files_to_upload) - self.assertEqual(self.job_2.files_to_download, job_2_files_to_download) - def test_gaussian_def2tzvp(self): """Test a Gaussian job using def2-tzvp""" self.assertEqual(self.job_9.level.basis.lower(), 'def2tzvp') diff --git a/arc/job/adapters/psi_4.py b/arc/job/adapters/psi_4.py index f409a9329d..38260fd57d 100644 --- a/arc/job/adapters/psi_4.py +++ b/arc/job/adapters/psi_4.py @@ -275,7 +275,6 @@ def __init__(self, self.iterate_by = list() self.number_of_processes = 0 self.incore_capacity = 5 - self.determine_job_array_parameters() # Writes the local HDF5 file if needed. self.files_to_upload = list() self.files_to_download = list() From 96c70ce7bf68e82f3f1980ad6915ebf825fd0579 Mon Sep 17 00:00:00 2001 From: Alon Grinberg Dana Date: Fri, 3 Apr 2026 15:24:57 +0300 Subject: [PATCH 15/60] Implement pipe into Scheduler --- arc/scheduler.py | 198 ++++-- arc/scheduler_pipe_test.py | 1368 ++++++++++++++++++++++++++++++++++++ 2 files changed, 1520 insertions(+), 46 deletions(-) create mode 100644 arc/scheduler_pipe_test.py diff --git a/arc/scheduler.py b/arc/scheduler.py index 21f5a2a7a7..d309df4a7d 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -29,7 +29,6 @@ torsions_to_scans, ) from arc.exceptions import (InputError, - SanitizationError, SchedulerError, SpeciesError, TrshError, @@ -38,6 +37,8 @@ from arc.job.adapters.common import all_families_ts_adapters, default_incore_adapters, ts_adapters_by_rmg_family from arc.job.factory import job_factory from arc.job.local import check_running_jobs_ids +from arc.job.pipe.pipe_coordinator import PipeCoordinator +from arc.job.pipe.pipe_planner import PipePlanner from arc.job.ssh import SSHClient from arc.job.trsh import (scan_quality_check, trsh_conformer_isomorphism, @@ -505,11 +506,84 @@ def 
__init__(self, if species.is_ts: # This is a TS loaded from a YAML file species.ts_conf_spawned = True + # Pipe mode: coordinator manages run lifecycle, planner handles family routing + self.pipe_coordinator = PipeCoordinator(self) + self.pipe_planner = PipePlanner(self, self.pipe_coordinator) + # Backward-compatible alias to coordinator-owned state. + # ``active_pipes`` is owned and mutated by ``PipeCoordinator``; this alias + # exists so that scheduler-level loop conditions (``while ... or self.active_pipes``) + # and logging can reference it directly without going through the coordinator. + self.active_pipes = self.pipe_coordinator.active_pipes + # Deferred pipe batching accumulators — flushed once per main-loop iteration. + self._pending_pipe_sp: set = set() # species labels + self._pending_pipe_freq: set = set() # species labels + self._pending_pipe_irc: set = set() # (label, direction) tuples + self._pending_pipe_conf_sp: dict = dict() # {label: set of conformer indices} + self.save_restart = True self.timer = True if not self.testing: self.schedule_jobs() + def flush_pending_pipe_batches(self) -> None: + """ + Attempt to submit accumulated deferred pipe batches for SP, freq, IRC, and conf_sp. + + For each family: + 1. Snapshot and clear the pending set. + 2. Ask the planner for the handled subset. + 3. Fall back to per-job submission for the unhandled remainder. + + Called once per main-loop iteration, after all newly-ready work has been + discovered and before the loop sleeps. 
+ """ + self._flush_pending_pipe_sp() + self._flush_pending_pipe_freq() + self._flush_pending_pipe_irc() + self._flush_pending_pipe_conf_sp() + + def _flush_pending_pipe_sp(self) -> None: + """Flush pending species SP jobs through planner or fallback.""" + if not self._pending_pipe_sp: + return + pending = set(self._pending_pipe_sp) + self._pending_pipe_sp.clear() + piped = self.pipe_planner.try_pipe_species_sp(sorted(pending)) + for label in sorted(pending - piped): + self.run_sp_job(label) + + def _flush_pending_pipe_freq(self) -> None: + """Flush pending species freq jobs through planner or fallback.""" + if not self._pending_pipe_freq: + return + pending = set(self._pending_pipe_freq) + self._pending_pipe_freq.clear() + piped = self.pipe_planner.try_pipe_species_freq(sorted(pending)) + for label in sorted(pending - piped): + self.run_freq_job(label) + + def _flush_pending_pipe_irc(self) -> None: + """Flush pending IRC jobs through planner or fallback.""" + if not self._pending_pipe_irc: + return + pending = set(self._pending_pipe_irc) + self._pending_pipe_irc.clear() + piped = self.pipe_planner.try_pipe_irc(sorted(pending)) + for label, direction in sorted(pending - piped): + self.run_irc_job(label=label, irc_direction=direction) + + def _flush_pending_pipe_conf_sp(self) -> None: + """Flush pending conformer SP jobs through planner or fallback.""" + if not self._pending_pipe_conf_sp: + return + pending = dict(self._pending_pipe_conf_sp) + self._pending_pipe_conf_sp.clear() + for label in sorted(pending): + conformer_indices = pending[label] + piped = self.pipe_planner.try_pipe_conf_sp(label, sorted(conformer_indices)) + for i in sorted(conformer_indices - piped): + self.run_sp_job(label=label, level=self.conformer_sp_level, conformer=i) + def schedule_jobs(self): """ The main job scheduling block @@ -526,7 +600,9 @@ def schedule_jobs(self): self.run_opt_job(species.label, fine=self.fine_only) self.run_conformer_jobs() self.spawn_ts_jobs() # If all 
reactants/products are already known (Arkane yml or restart), spawn TS searches. - while self.running_jobs != {}: + while self.running_jobs != {} or self.active_pipes \ + or self._pending_pipe_sp or self._pending_pipe_freq \ + or self._pending_pipe_irc or self._pending_pipe_conf_sp: self.timer = True for label in self.unique_species_labels: if label in self.output and self.output[label]['convergence'] is False: @@ -551,9 +627,8 @@ def schedule_jobs(self): if successful_server_termination: troubleshooting_conformer = self.parse_conformer(job=job, label=label, i=i) if 'conf_opt' in job_name and self.job_types['conf_sp'] and not troubleshooting_conformer: - self.run_sp_job(label=label, - level=self.conformer_sp_level, - conformer=i) + # Accumulate for deferred pipe batching of conf_sp. + self._pending_pipe_conf_sp.setdefault(label, set()).add(i) if troubleshooting_conformer: break # Just terminated a conformer job. @@ -732,12 +807,23 @@ def schedule_jobs(self): # Delete the label only if it represents an empty entry. del self.running_jobs[label] - if self.timer and len(job_list): + # Poll active pipe runs (per-run failures are handled inside poll_pipes). + if self.active_pipes: + self.pipe_coordinator.poll_pipes() + + # Flush deferred pipe batches (SP, freq, IRC, conf_sp) after all + # newly-ready work has been discovered and before the loop sleeps. + self.flush_pending_pipe_batches() + + should_sleep = self.timer and (self.running_jobs or self.active_pipes) + if should_sleep: time.sleep(30) # wait 30 sec before bugging the servers again. 
t = time.time() - self.report_time - if t > 3600 and self.running_jobs: + if t > 3600 and (self.running_jobs or self.active_pipes): self.report_time = time.time() logger.info(f'Currently running jobs:\n{pprint.pformat(self.running_jobs)}') + if self.active_pipes: + logger.info(f'Active pipe runs: {list(self.active_pipes.keys())}') # Generate a TS report: self.generate_final_ts_guess_report() @@ -1160,15 +1246,22 @@ def run_ts_conformer_jobs(self, label: str): ) successful_tsgs = [tsg for tsg in self.species_dict[label].ts_guesses if tsg.success] if len(successful_tsgs) > 1: - self.job_dict[label]['conf_opt'] = dict() + xyzs = [tsg.initial_xyz for tsg in successful_tsgs] + piped_indices = self.pipe_planner.try_pipe_ts_opt(label, xyzs, self.ts_guess_level) + if not piped_indices: + self.job_dict[label]['conf_opt'] = dict() for i, tsg in enumerate(successful_tsgs): + tsg.conformer_index = i # Store the conformer index to match them later. + if i in piped_indices: + continue + if 'conf_opt' not in self.job_dict[label]: + self.job_dict[label]['conf_opt'] = dict() self.run_job(label=label, xyz=tsg.initial_xyz, level_of_theory=self.ts_guess_level, job_type='conf_opt', conformer=i, ) - tsg.conformer_index = i # Store the conformer index in the TSGuess object to match them later. elif len(successful_tsgs) == 1: if 'opt' not in self.job_dict[label].keys() and 'composite' not in self.job_dict[label].keys(): # proceed only if opt (/composite) not already spawned @@ -1356,6 +1449,7 @@ def run_scan_jobs(self, label: str): label (str): The species label. """ if self.job_types['rotors'] and isinstance(self.species_dict[label].rotors_dict, dict): + ess_rotor_indices = [] # Collected for potential pipe batching below. 
for i, rotor in self.species_dict[label].rotors_dict.items(): if rotor['scan_path'] and os.path.isfile(rotor['scan_path']): continue @@ -1412,29 +1506,37 @@ def run_scan_jobs(self, label: str): else: self.spawn_directed_scan_jobs(label, rotor_index=i) else: - # This is a "normal" scan (not directed). - # Check that this job isn't already running on the server (from a restarted project). - if 'scan' not in self.job_dict[label].keys(): - # We're spawning the first scan job for this species. - self.job_dict[label]['scan'] = dict() - # Check that this job isn't already running on the server (from a restarted project). - for scan_job in self.job_dict[label]['scan'].values(): - if torsions == scan_job.torsions and scan_job.job_name in self.running_jobs[label]: - break - else: - if self.species_dict[label].multi_species: - if self.output_multi_spc[self.species_dict[label].multi_species].get('scan', False): - return - self.output_multi_spc[self.species_dict[label].multi_species]['scan'] = True - label = [species.label for species in self.species_list + # This is a "normal" ESS scan (not directed). Collect for potential pipe batching. + ess_rotor_indices.append(i) + + # Attempt to batch ESS scans through pipe mode; fall back per-rotor for the rest. 
+ piped_rotors = self.pipe_planner.try_pipe_rotor_scans_1d(label, ess_rotor_indices) \ + if ess_rotor_indices else set() + for i in ess_rotor_indices: + if i in piped_rotors: + continue + rotor = self.species_dict[label].rotors_dict[i] + torsions = rotor['torsion'] + if 'scan' not in self.job_dict[label].keys(): + self.job_dict[label]['scan'] = dict() + for scan_job in self.job_dict[label]['scan'].values(): + if torsions == scan_job.torsions and scan_job.job_name in self.running_jobs[label]: + break + else: + job_label = label + if self.species_dict[label].multi_species: + if self.output_multi_spc[self.species_dict[label].multi_species].get('scan', False): + return + self.output_multi_spc[self.species_dict[label].multi_species]['scan'] = True + job_label = [species.label for species in self.species_list if species.multi_species == self.species_dict[label].multi_species] - self.run_job(label=label, - xyz=self.species_dict[label].get_xyz(generate=False), - level_of_theory=self.scan_level, - job_type='scan', - torsions=torsions, - rotor_index=i, - ) + self.run_job(label=job_label, + xyz=self.species_dict[label].get_xyz(generate=False), + level_of_theory=self.scan_level, + job_type='scan', + torsions=torsions, + rotor_index=i, + ) def run_irc_job(self, label, irc_direction='forward'): """ @@ -1503,24 +1605,22 @@ def spawn_post_opt_jobs(self, self.run_opt_job(label, fine=self.fine_only) return None - # Spawn IRC if requested and if relevant. + # Enqueue IRC if requested and if relevant (deferred for pipe batching). if label in self.output.keys() and self.job_types['irc'] and self.species_dict[label].is_ts: - self.run_irc_job(label=label, irc_direction='forward') - self.run_irc_job(label=label, irc_direction='reverse') + self._pending_pipe_irc.add((label, 'forward')) + self._pending_pipe_irc.add((label, 'reverse')) - # Spawn freq (or check it if this is a composite job) for polyatomic molecules. + # Enqueue freq (deferred for pipe batching), or check it if composite. 
if label in self.output.keys() and self.species_dict[label].number_of_atoms > 1 \ and self.species_dict[label].irc_label is None: if 'freq' not in job_name and self.job_types['freq']: - # This is either an opt or a composite job (not an optfreq job), spawn freq. - self.run_freq_job(label) + self._pending_pipe_freq.add(label) if 'optfreq' in job_name: - # This is an 'optfreq' job type, don't spawn freq (but do check it). self.check_freq_job(label=label, job=self.job_dict[label]['optfreq'][job_name]) - # Spawn sp after an opt (non-composite) job. + # Enqueue sp after an opt (non-composite) job (deferred for pipe batching). if not composite and self.job_types['sp'] and self.species_dict[label].irc_label is None: - self.run_sp_job(label) + self._pending_pipe_sp.add(label) # Perceive the Molecule from xyz. # Useful for TS species where xyz might not be given at the outset to perceive a .mol attribute. @@ -1862,14 +1962,20 @@ def process_conformers(self, label): if self.species_dict[label].initial_xyz is None and self.species_dict[label].final_xyz is None \ and not self.testing: if len(self.species_dict[label].conformers) > 1: - self.job_dict[label]['conf_opt'] = dict() + piped_conformers = self.pipe_planner.try_pipe_conformers(label) + if not piped_conformers: + self.job_dict[label]['conf_opt'] = dict() for i, xyz in enumerate(self.species_dict[label].conformers): + if i in piped_conformers: + continue + if 'conf_opt' not in self.job_dict[label]: + self.job_dict[label]['conf_opt'] = dict() self.run_job(label=label, - xyz=xyz, - job_type='conf_opt', - level_of_theory=self.conformer_opt_level, - conformer=i, - ) + xyz=xyz, + job_type='conf_opt', + level_of_theory=self.conformer_opt_level, + conformer=i, + ) elif len(self.species_dict[label].conformers) == 1: logger.info(f'Only one conformer is available for species {label}, using it as initial xyz.') self.species_dict[label].initial_xyz = self.species_dict[label].conformers[0] diff --git a/arc/scheduler_pipe_test.py 
b/arc/scheduler_pipe_test.py new file mode 100644 index 0000000000..d38ca55420 --- /dev/null +++ b/arc/scheduler_pipe_test.py @@ -0,0 +1,1368 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +This module contains unit tests for the pipe-mode methods of the arc.scheduler module +""" + +import os +import shutil +import tempfile +import time +import unittest +from unittest.mock import patch + +from arc.imports import settings +from arc.job.pipe.pipe_state import ( + PipeRunState, + TaskState, + TaskSpec, + get_task_attempt_dir, + update_task_state, +) +from arc.job.pipe.pipe_run import PipeRun +from arc.level import Level +from arc.scheduler import Scheduler +from arc.species.species import ARCSpecies + + +default_levels_of_theory = settings['default_levels_of_theory'] + + +def _make_task_spec(task_id, engine='mockter', task_family='conf_opt', + cores=4, mem=2048, species_label='H2O', conformer_index=0, + level=None): + """Helper to create a TaskSpec for testing.""" + spc = ARCSpecies(label=species_label, smiles='O') + return TaskSpec( + task_id=task_id, + task_family=task_family, + owner_type='species', + owner_key=species_label, + input_fingerprint=f'{task_id}_fp', + engine=engine, + level=level or {'method': 'mock', 'basis': 'mock'}, + required_cores=cores, + required_memory_mb=mem, + input_payload={'species_dicts': [spc.as_dict()]}, + ingestion_metadata={'conformer_index': conformer_index}, + ) + + +def _make_scheduler(project_directory): + """Create a minimal Scheduler for testing pipe methods.""" + ess_settings = {'gaussian': ['server1'], 'molpro': ['server2', 'server1'], 'qchem': ['server1']} + spc = ARCSpecies(label='H2O', smiles='O') + spc.conformers = [None] * 5 + spc.conformer_energies = [None] * 5 + return Scheduler( + project='pipe_test', + ess_settings=ess_settings, + species_list=[spc], + project_directory=project_directory, + conformer_opt_level=Level(repr=default_levels_of_theory['conformer']), + 
opt_level=Level(repr=default_levels_of_theory['opt']), + freq_level=Level(repr=default_levels_of_theory['freq']), + sp_level=Level(repr=default_levels_of_theory['sp']), + scan_level=Level(repr=default_levels_of_theory['scan']), + ts_guess_level=Level(repr=default_levels_of_theory['ts_guesses']), + testing=True, + job_types={'conf_opt': True, 'opt': True, 'fine': False, 'freq': True, + 'sp': True, 'rotors': False, 'orbitals': False, 'lennard_jones': False}, + orbitals_level=default_levels_of_theory['orbitals'], + ) + + +def _complete_task(pipe_root, task_id): + """Drive a task through the full lifecycle to COMPLETED.""" + now = time.time() + update_task_state(pipe_root, task_id, new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe_root, task_id, new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe_root, task_id, new_status=TaskState.COMPLETED, ended_at=now) + + +_pipe_patches = [] + + +def setUpModule(): + """Enable pipe mode for all tests in this module.""" + global _pipe_patches + pipe_vals = {'enabled': True, 'min_tasks': 10, 'max_workers': 100, + 'max_attempts': 3, 'lease_duration_s': 86400} + for target in ('arc.job.pipe.pipe_coordinator.pipe_settings', + 'arc.job.pipe.pipe_planner.pipe_settings'): + p = patch.dict(target, pipe_vals) + p.start() + _pipe_patches.append(p) + + +def tearDownModule(): + """Restore pipe settings.""" + global _pipe_patches + for p in _pipe_patches: + p.stop() + _pipe_patches.clear() + + +class TestShouldUsePipe(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_sched_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_returns_true_for_homogeneous_batch(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(15)] + self.assertTrue(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + def 
test_returns_false_for_heterogeneous_memory(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(15)] + tasks[7] = _make_task_spec('task_7', mem=9999) + self.assertFalse(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + def test_returns_false_for_heterogeneous_engine(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(15)] + tasks[0] = _make_task_spec('task_0', engine='gaussian') + self.assertFalse(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + def test_returns_false_for_heterogeneous_level(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(15)] + tasks[3] = _make_task_spec('task_3', level={'method': 'b3lyp', 'basis': 'sto-3g'}) + self.assertFalse(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + def test_returns_false_below_threshold(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(5)] + self.assertFalse(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + def test_returns_true_at_exact_threshold(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(10)] + self.assertTrue(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + +class TestSubmitPipeRun(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_submit_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_submit_returns_pipe_run(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_001', tasks) + self.assertIsInstance(pipe, PipeRun) + self.assertEqual(pipe.status, PipeRunState.STAGED) + self.assertIn('run_001', self.sched.active_pipes) + self.assertIs(self.sched.active_pipes['run_001'], pipe) + + def test_submit_uses_explicit_cluster_software(self): + tasks = [_make_task_spec('task_0')] + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_pbs', tasks, cluster_software='pbs') + self.assertEqual(pipe.cluster_software, 'pbs') + + def 
test_submit_default_cluster_software(self): + tasks = [_make_task_spec('task_0')] + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_default', tasks) + self.assertEqual(pipe.cluster_software, 'slurm') + + +class TestPollPipes(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_poll_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_poll_removes_completed_pipe(self): + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_poll', [_make_task_spec('task_poll')]) + _complete_task(pipe.pipe_root, 'task_poll') + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('run_poll', self.sched.active_pipes) + + def test_poll_keeps_active_pipe(self): + self.sched.pipe_coordinator.submit_pipe_run('run_active', [_make_task_spec('task_active')]) + self.sched.pipe_coordinator.poll_pipes() + self.assertIn('run_active', self.sched.active_pipes) + + def test_poll_removes_failed_pipe(self): + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_fail', [_make_task_spec('task_f')]) + pipe.status = PipeRunState.FAILED + pipe._save_run_metadata() + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('run_fail', self.sched.active_pipes) + + def test_poll_logs_counts(self): + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_log', [_make_task_spec('task_log')]) + _complete_task(pipe.pipe_root, 'task_log') + with patch('arc.job.pipe.pipe_coordinator.logger') as mock_logger: + self.sched.pipe_coordinator.poll_pipes() + info_calls = [str(c) for c in mock_logger.info.call_args_list] + self.assertTrue(any('run_log' in c for c in info_calls)) + + def test_poll_logs_exception_with_traceback(self): + """A reconcile exception is logged with traceback, run stays on first failure.""" + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_err', [_make_task_spec('task_err')]) + with patch.object(pipe, 'reconcile', side_effect=RuntimeError('disk 
full')): + with patch('arc.job.pipe.pipe_coordinator.logger') as mock_logger: + self.sched.pipe_coordinator.poll_pipes() + error_calls = [str(c) for c in mock_logger.error.call_args_list] + self.assertTrue(any('run_err' in c and 'reconciliation failed' in c for c in error_calls)) + # Run should still be in active_pipes after first failure + self.assertIn('run_err', self.sched.active_pipes) + self.assertEqual(self.sched.pipe_coordinator._pipe_poll_failures.get('run_err'), 1) + + def test_poll_removes_after_repeated_failures(self): + """After 3 consecutive failures, the broken run is removed from active_pipes.""" + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_stuck', [_make_task_spec('task_stuck')]) + with patch.object(pipe, 'reconcile', side_effect=RuntimeError('corrupt state')): + for _ in range(3): + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('run_stuck', self.sched.active_pipes) + self.assertNotIn('run_stuck', self.sched.pipe_coordinator._pipe_poll_failures) + + def test_poll_resets_failure_count_on_success(self): + """Successful reconciliation resets the failure counter.""" + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_flaky', [_make_task_spec('task_flaky')]) + # Fail once + with patch.object(pipe, 'reconcile', side_effect=RuntimeError('transient')): + self.sched.pipe_coordinator.poll_pipes() + self.assertEqual(self.sched.pipe_coordinator._pipe_poll_failures.get('run_flaky'), 1) + # Succeed — counter should reset + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('run_flaky', self.sched.pipe_coordinator._pipe_poll_failures) + + +class TestScheduleJobsLoopCondition(unittest.TestCase): + """Test that the main loop does not exit while active_pipes remain.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_loop_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_loop_continues_for_active_pipes(self): + """Verify 
the loop condition includes active_pipes.""" + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_loop', [_make_task_spec('task_loop')]) + _complete_task(pipe.pipe_root, 'task_loop') + # Clear running_jobs so only active_pipes keeps the loop alive + self.sched.running_jobs = {} + self.assertIn('run_loop', self.sched.active_pipes) + # Simulate one iteration: poll_pipes should complete and remove it + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('run_loop', self.sched.active_pipes) + + def test_poll_pipes_invoked_in_loop(self): + """Verify poll_pipes is invoked when the loop runs with only active pipes.""" + pipe = self.sched.pipe_coordinator.submit_pipe_run('run_int', [_make_task_spec('task_int')]) + _complete_task(pipe.pipe_root, 'task_int') + self.sched.running_jobs = {} + # Patch poll_pipes to track calls, then run one iteration manually. + # The loop condition is: while self.running_jobs != {} or self.active_pipes + # Since we can't safely run schedule_jobs (too many side effects), we + # verify that: (a) the condition is true, and (b) poll_pipes works. + self.assertTrue(self.sched.running_jobs == {} and bool(self.sched.active_pipes)) + with patch.object(self.sched.pipe_coordinator, 'poll_pipes', + wraps=self.sched.pipe_coordinator.poll_pipes) as mock_poll: + self.sched.pipe_coordinator.poll_pipes() + mock_poll.assert_called_once() + # After polling, the completed pipe should be gone. 
+ self.assertNotIn('run_int', self.sched.active_pipes) + + +class TestRegisterPipeRunFromDir(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_register_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_register_from_dir(self): + tasks = [_make_task_spec(f'task_{i}') for i in range(2)] + original = self.sched.pipe_coordinator.submit_pipe_run('run_restart', tasks, cluster_software='pbs') + pipe_root = original.pipe_root + del self.sched.active_pipes['run_restart'] + restored = self.sched.pipe_coordinator.register_pipe_run_from_dir(pipe_root) + self.assertIn('run_restart', self.sched.active_pipes) + self.assertEqual(restored.run_id, 'run_restart') + self.assertEqual(restored.cluster_software, 'pbs') + + +class TestTryPipeConformers(unittest.TestCase): + """Tests for the _try_pipe_conformers method.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_conf_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_pipes_when_enough_conformers(self): + """When >=10 conformers, pipe mode should be used.""" + species = self.sched.species_dict['H2O'] + species.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(12)] + species.conformer_energies = [None] * 12 + # Mock deduce_job_adapter to return a queue-eligible adapter + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_conformers('H2O') + self.assertTrue(result) + self.assertEqual(len(self.sched.active_pipes), 1) + run_id = list(self.sched.active_pipes.keys())[0] + self.assertIn('H2O', run_id) + pipe = self.sched.active_pipes[run_id] + self.assertEqual(len(pipe.tasks), 12) + # Verify task metadata uses the new explicit schema + spec = pipe.tasks[0] + 
self.assertEqual(spec.owner_key, 'H2O') + self.assertEqual(spec.task_family, 'conf_opt') + self.assertEqual(spec.ingestion_metadata['conformer_index'], 0) + self.assertIsNotNone(spec.level) + + def test_no_pipe_when_few_conformers(self): + """When <10 conformers, pipe mode should not be used.""" + species = self.sched.species_dict['H2O'] + species.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(5)] + species.conformer_energies = [None] * 5 + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_conformers('H2O') + self.assertFalse(result) + self.assertEqual(len(self.sched.active_pipes), 0) + + def test_no_pipe_for_incore_adapter(self): + """Incore adapters should not use pipe mode.""" + species = self.sched.species_dict['H2O'] + species.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(15)] + species.conformer_energies = [None] * 15 + with patch.object(self.sched, 'deduce_job_adapter', return_value='torchani'): + result = self.sched.pipe_planner.try_pipe_conformers('H2O') + self.assertFalse(result) + + +class TestIngestPipeResults(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_ingest_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def _make_pipe_with_completed_task(self, task_id='task_ingest', **spec_kwargs): + task = _make_task_spec(task_id, **spec_kwargs) + pipe = PipeRun(project_directory=self.tmpdir, run_id=f'{task_id}_run', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, task_id) + attempt_dir = get_task_attempt_dir(pipe.pipe_root, task_id, 0) + return pipe, attempt_dir + + def _place_output_file(self, attempt_dir): + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'conf_opt_a1') + 
os.makedirs(calcs_dir, exist_ok=True) + path = os.path.join(calcs_dir, 'output.yml') + with open(path, 'w') as f: + f.write('dummy') + return path + + def test_ingest_updates_species_conformer(self): + pipe, attempt_dir = self._make_pipe_with_completed_task( + species_label='H2O', conformer_index=2) + self._place_output_file(attempt_dir) + mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), + 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} + with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + species = self.sched.species_dict['H2O'] + self.assertEqual(species.conformers[2], mock_xyz) + self.assertAlmostEqual(species.conformer_energies[2], -75.5) + + def test_ingest_terminal_failure_logs_error(self): + task = _make_task_spec('task_fail', species_label='H2O', conformer_index=0) + pipe = PipeRun(project_directory=self.tmpdir, run_id='fail_test', + tasks=[task], cluster_software='slurm') + pipe.stage() + now = time.time() + update_task_state(pipe.pipe_root, 'task_fail', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe.pipe_root, 'task_fail', new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe.pipe_root, 'task_fail', new_status=TaskState.FAILED_TERMINAL, + ended_at=now, failure_class='oom') + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + self.assertIsNone(self.sched.species_dict['H2O'].conformers[0]) + + def test_ingest_cancelled_task_logged(self): + task = _make_task_spec('task_cancel', species_label='H2O', conformer_index=0) + pipe = PipeRun(project_directory=self.tmpdir, run_id='cancel_test', + tasks=[task], cluster_software='slurm') + pipe.stage() + now = time.time() + update_task_state(pipe.pipe_root, 'task_cancel', new_status=TaskState.CANCELLED, 
ended_at=now) + with patch('arc.job.pipe.pipe_coordinator.logger') as mock_logger: + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + warning_calls = [str(c) for c in mock_logger.warning.call_args_list] + self.assertTrue(any('cancelled' in c.lower() for c in warning_calls)) + + def test_ingest_skips_unknown_species(self): + pipe, _ = self._make_pipe_with_completed_task( + task_id='task_unknown', species_label='NONEXISTENT', conformer_index=0) + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + + def test_ingest_missing_conformer_index(self): + """conf_opt task with empty ingestion_metadata is skipped with warning.""" + task = _make_task_spec('task_no_idx', species_label='H2O') + # Override ingestion_metadata to remove conformer_index + task.ingestion_metadata = {} + pipe = PipeRun(project_directory=self.tmpdir, run_id='noidx_test', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'task_no_idx') + with patch('arc.job.pipe.pipe_run.logger') as mock_logger: + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + warning_calls = [str(c) for c in mock_logger.warning.call_args_list] + self.assertTrue(any('conformer_index' in c for c in warning_calls)) + + def test_ingest_continues_on_missing_output(self): + task_ok = _make_task_spec('task_ok', species_label='H2O', conformer_index=1) + task_bad = _make_task_spec('task_bad', species_label='H2O', conformer_index=2) + pipe = PipeRun(project_directory=self.tmpdir, run_id='partial_test', + tasks=[task_bad, task_ok], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'task_ok') + _complete_task(pipe.pipe_root, 'task_bad') + attempt_dir_ok = get_task_attempt_dir(pipe.pipe_root, 'task_ok', 0) + self._place_output_file(attempt_dir_ok) + mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), + 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} + with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), 
\ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + species = self.sched.species_dict['H2O'] + self.assertEqual(species.conformers[1], mock_xyz) + self.assertIsNone(species.conformers[2]) + + def test_ingest_continues_on_parser_exception(self): + task_ok = _make_task_spec('task_ok2', species_label='H2O', conformer_index=0) + task_bad = _make_task_spec('task_err', species_label='H2O', conformer_index=3) + pipe = PipeRun(project_directory=self.tmpdir, run_id='parse_err_test', + tasks=[task_bad, task_ok], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'task_ok2') + _complete_task(pipe.pipe_root, 'task_err') + attempt_ok = get_task_attempt_dir(pipe.pipe_root, 'task_ok2', 0) + attempt_err = get_task_attempt_dir(pipe.pipe_root, 'task_err', 0) + self._place_output_file(attempt_ok) + self._place_output_file(attempt_err) + mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), + 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} + + def mock_parse_geometry(log_file_path): + if 'task_err' in log_file_path: + raise RuntimeError('simulated parser crash') + return mock_xyz + + with patch('arc.job.pipe.pipe_run.parser.parse_geometry', side_effect=mock_parse_geometry), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-10.0): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + species = self.sched.species_dict['H2O'] + self.assertEqual(species.conformers[0], mock_xyz) + self.assertIsNone(species.conformers[3]) + + +class TestConfSpIngestion(unittest.TestCase): + """Tests for conf_sp pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_confsp_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_conf_sp_ingestion_updates_energy(self): + """conf_sp ingestion updates conformer energy but not geometry.""" + 
task = _make_task_spec('sp_task', task_family='conf_sp', + species_label='H2O', conformer_index=1) + pipe = PipeRun(project_directory=self.tmpdir, run_id='sp_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'sp_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'sp_task', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'conf_sp_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + species = self.sched.species_dict['H2O'] + species.conformers[1] = {'symbols': ('O',), 'coords': ((0, 0, 0),)} # pre-existing geometry + + with patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-99.9): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + + # Energy updated + self.assertAlmostEqual(species.conformer_energies[1], -99.9) + # Geometry preserved (conf_sp doesn't touch it) + self.assertEqual(species.conformers[1], {'symbols': ('O',), 'coords': ((0, 0, 0),)}) + + def test_conf_opt_and_conf_sp_not_mixed(self): + """conf_opt and conf_sp tasks cannot be in the same PipeRun.""" + t1 = _make_task_spec('t1', task_family='conf_opt') + t2 = _make_task_spec('t2', task_family='conf_sp') + run = PipeRun(project_directory=self.tmpdir, run_id='mixed', + tasks=[t1, t2], cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + +class TestTryPipeConfSp(unittest.TestCase): + """Tests for _try_pipe_conf_sp.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_confsp_route_') + self.sched = _make_scheduler(self.tmpdir) + # Give the scheduler a conf_sp level + from arc.level import Level as Lvl + self.sched.conformer_sp_level = Lvl(method='wb97xd', basis='def2-tzvp') + self.sched.conformer_opt_level = Lvl(method='b97d3', basis='6-31+g(d,p)') + self.sched.job_types['conf_sp'] = True + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def 
test_conf_sp_pipes_when_enough(self): + species = self.sched.species_dict['H2O'] + species.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(12)] + species.conformer_energies = [None] * 12 + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_conf_sp('H2O', list(range(len(self.sched.species_dict['H2O'].conformers)))) + self.assertTrue(result) + run_id = list(self.sched.active_pipes.keys())[0] + self.assertIn('conf_sp', run_id) + pipe = self.sched.active_pipes[run_id] + self.assertEqual(pipe.tasks[0].task_family, 'conf_sp') + + def test_conf_sp_no_pipe_below_threshold(self): + species = self.sched.species_dict['H2O'] + species.conformers = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(5)] + species.conformer_energies = [None] * 5 + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_conf_sp('H2O', list(range(len(self.sched.species_dict['H2O'].conformers)))) + self.assertFalse(result) + + def test_conf_sp_not_triggered_when_disabled(self): + self.sched.job_types['conf_sp'] = False + species = self.sched.species_dict['H2O'] + species.conformers = [None] * 15 + species.conformer_energies = [None] * 15 + result = self.sched.pipe_planner.try_pipe_conf_sp('H2O', list(range(len(self.sched.species_dict['H2O'].conformers)))) + self.assertFalse(result) + + +class TestTsIngestion(unittest.TestCase): + """Tests for TS pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_ts_ingest_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_ts_opt_ingestion_updates_species(self): + """ts_opt ingestion sets final_xyz and e_elect on the TS species.""" + ts_label = 'H2O' # reusing existing species as TS proxy + task = 
_make_task_spec('ts_opt_task', task_family='ts_opt', + species_label=ts_label, conformer_index=0) + pipe = PipeRun(project_directory=self.tmpdir, run_id='ts_opt_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'ts_opt_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'ts_opt_task', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', ts_label, 'opt_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), + 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} + with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-50.0): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + species = self.sched.species_dict[ts_label] + self.assertEqual(species.final_xyz, mock_xyz) + self.assertAlmostEqual(species.e_elect, -50.0) + + def test_ts_guess_batch_ingestion_calls_process(self): + """ts_guess_batch_method ingestion calls process_completed_tsg_queue_jobs.""" + ts_label = 'H2O' + task = _make_task_spec('tsg_task', task_family='ts_guess_batch_method', + species_label=ts_label, conformer_index=0) + pipe = PipeRun(project_directory=self.tmpdir, run_id='tsg_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'tsg_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'tsg_task', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', ts_label, 'tsg_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + species = self.sched.species_dict[ts_label] + with patch.object(species, 'process_completed_tsg_queue_jobs') as mock_process: + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + mock_process.assert_called_once() + + def 
test_ts_not_mixed_with_conformer(self): + """ts_opt and conf_opt cannot be in the same PipeRun.""" + t1 = _make_task_spec('t1', task_family='conf_opt') + t2 = _make_task_spec('t2', task_family='ts_opt') + run = PipeRun(project_directory=self.tmpdir, run_id='mixed', + tasks=[t1, t2], cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + +class TestTryPipeTsOpt(unittest.TestCase): + """Tests for _try_pipe_ts_opt.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_tsopt_route_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_ts_opt_pipes_when_enough(self): + """When >= 10 TS opt xyzs, pipe mode is used.""" + xyzs = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(12)] + level = Level(method='wb97xd', basis='def2-tzvp') + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_ts_opt('H2O', xyzs, level) + self.assertTrue(result) + run_id = list(self.sched.active_pipes.keys())[0] + self.assertIn('ts_opt', run_id) + pipe = self.sched.active_pipes[run_id] + self.assertEqual(pipe.tasks[0].task_family, 'ts_opt') + self.assertEqual(pipe.tasks[0].owner_type, 'species') + + def test_ts_opt_no_pipe_below_threshold(self): + xyzs = [{'symbols': ('O',), 'isotopes': (16,), + 'coords': ((0.0, 0.0, float(i)),)} + for i in range(5)] + level = Level(method='wb97xd', basis='def2-tzvp') + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_ts_opt('H2O', xyzs, level) + self.assertFalse(result) + + +class TestConfOptIngestionSemantics(unittest.TestCase): + """Verify conf_opt ingestion updates both geometry and energy (ARC-consistent).""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_confopt_sem_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): 
+ shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_conf_opt_updates_both_geometry_and_energy(self): + """conf_opt ingestion must update both conformers[i] and conformer_energies[i].""" + task = _make_task_spec('conf_opt_sem', species_label='H2O', conformer_index=1) + pipe = PipeRun(project_directory=self.tmpdir, run_id='sem_test', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'conf_opt_sem') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'conf_opt_sem', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'conf_opt_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), + 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} + with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + species = self.sched.species_dict['H2O'] + # Both geometry and energy must be updated (ARC uses opt-level energy for ranking) + self.assertEqual(species.conformers[1], mock_xyz) + self.assertAlmostEqual(species.conformer_energies[1], -75.5) + + +class TestSpeciesSpIngestion(unittest.TestCase): + """Tests for species_sp pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_sp_ingest_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_species_sp_sets_e_elect(self): + task = _make_task_spec('sp_task', task_family='species_sp', species_label='H2O') + pipe = PipeRun(project_directory=self.tmpdir, run_id='sp_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'sp_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'sp_task', 0) + calcs_dir = 
os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'sp_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + with patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-76.1): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + self.assertAlmostEqual(self.sched.species_dict['H2O'].e_elect, -76.1) + + +class TestSpeciesFreqIngestion(unittest.TestCase): + """Tests for species_freq pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_freq_ingest_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_species_freq_stores_output_path(self): + task = _make_task_spec('freq_task', task_family='species_freq', species_label='H2O') + pipe = PipeRun(project_directory=self.tmpdir, run_id='freq_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'freq_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'freq_task', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'freq_a1') + os.makedirs(calcs_dir, exist_ok=True) + output_path = os.path.join(calcs_dir, 'output.yml') + with open(output_path, 'w') as f: + f.write('dummy') + + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + self.assertEqual(self.sched.output['H2O']['paths']['freq'], output_path) + + +class TestIrcIngestion(unittest.TestCase): + """Tests for IRC pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_irc_ingest_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_irc_stores_output_path(self): + task = _make_task_spec('irc_task', task_family='irc', species_label='H2O') + pipe = PipeRun(project_directory=self.tmpdir, run_id='irc_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 
'irc_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'irc_task', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'irc_a1') + os.makedirs(calcs_dir, exist_ok=True) + output_path = os.path.join(calcs_dir, 'output.yml') + with open(output_path, 'w') as f: + f.write('dummy') + + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + self.assertIn(output_path, self.sched.output['H2O']['paths']['irc']) + + +class TestTryPipeSpeciesSp(unittest.TestCase): + """Tests for _try_pipe_species_sp.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_sp_route_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_sp_pipes_when_enough(self): + labels = [f'spc_{i}' for i in range(12)] + for lbl in labels: + spc = ARCSpecies(label=lbl, smiles='O') + self.sched.species_dict[lbl] = spc + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_species_sp(labels) + self.assertTrue(result) + run_id = list(self.sched.active_pipes.keys())[0] + pipe = self.sched.active_pipes[run_id] + self.assertEqual(pipe.tasks[0].task_family, 'species_sp') + self.assertEqual(pipe.tasks[0].owner_type, 'species') + + def test_sp_no_pipe_below_threshold(self): + labels = [f'spc_{i}' for i in range(5)] + for lbl in labels: + self.sched.species_dict[lbl] = ARCSpecies(label=lbl, smiles='O') + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_species_sp(labels) + self.assertFalse(result) + + +class TestTryPipeIrc(unittest.TestCase): + """Tests for _try_pipe_irc.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_irc_route_') + self.sched = _make_scheduler(self.tmpdir) + self.sched.irc_level = Level(method='wb97xd', basis='def2-tzvp') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def 
test_irc_pipes_when_enough(self): + labels_and_dirs = [(f'ts_spc_{i}', 'forward') for i in range(12)] + for lbl, _ in labels_and_dirs: + self.sched.species_dict[lbl] = ARCSpecies(label=lbl, smiles='O', is_ts=True) + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_irc(labels_and_dirs) + self.assertTrue(result) + pipe = list(self.sched.active_pipes.values())[0] + self.assertEqual(pipe.tasks[0].task_family, 'irc') + self.assertEqual(pipe.tasks[0].ingestion_metadata['irc_direction'], 'forward') + + def test_irc_no_pipe_below_threshold(self): + labels_and_dirs = [(f'ts_spc_{i}', 'forward') for i in range(3)] + for lbl, _ in labels_and_dirs: + self.sched.species_dict[lbl] = ARCSpecies(label=lbl, smiles='O', is_ts=True) + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_irc(labels_and_dirs) + self.assertFalse(result) + + +class TestRotorScan1dIngestion(unittest.TestCase): + """Tests for rotor_scan_1d pipe ingestion.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_scan_ingest_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_scan_ingestion_stores_scan_path(self): + """rotor_scan_1d ingestion sets rotors_dict[rotor_index]['scan_path'].""" + species = self.sched.species_dict['H2O'] + species.rotors_dict = {0: {'scan_path': '', 'success': None, 'torsion': [0, 1, 2, 3]}} + + task = _make_task_spec('scan_task', task_family='rotor_scan_1d', species_label='H2O') + # Override ingestion_metadata to include rotor_index + task.ingestion_metadata = {'rotor_index': 0} + pipe = PipeRun(project_directory=self.tmpdir, run_id='scan_ingest', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'scan_task') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'scan_task', 0) + calcs_dir = 
os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'scan_a1') + os.makedirs(calcs_dir, exist_ok=True) + output_path = os.path.join(calcs_dir, 'output.yml') + with open(output_path, 'w') as f: + f.write('dummy') + + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + self.assertEqual(species.rotors_dict[0]['scan_path'], output_path) + + def test_scan_ingestion_missing_rotor_slot(self): + """Ingestion skips safely when the rotor slot doesn't exist.""" + species = self.sched.species_dict['H2O'] + species.rotors_dict = {} # no rotor 0 + + task = _make_task_spec('scan_bad', task_family='rotor_scan_1d', species_label='H2O') + task.ingestion_metadata = {'rotor_index': 0} + pipe = PipeRun(project_directory=self.tmpdir, run_id='scan_bad', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'scan_bad') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'scan_bad', 0) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'scan_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + with patch('arc.job.pipe.pipe_run.logger') as mock_logger: + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + warning_calls = [str(c) for c in mock_logger.warning.call_args_list] + self.assertTrue(any('rotor_index=0' in c and 'not found' in c for c in warning_calls)) + + def test_scan_ingestion_no_rotors_dict(self): + """Ingestion skips safely when species has no rotors_dict.""" + species = self.sched.species_dict['H2O'] + if hasattr(species, 'rotors_dict'): + del species.rotors_dict + + task = _make_task_spec('scan_nodict', task_family='rotor_scan_1d', species_label='H2O') + task.ingestion_metadata = {'rotor_index': 0} + pipe = PipeRun(project_directory=self.tmpdir, run_id='scan_nodict', + tasks=[task], cluster_software='slurm') + pipe.stage() + _complete_task(pipe.pipe_root, 'scan_nodict') + attempt_dir = get_task_attempt_dir(pipe.pipe_root, 'scan_nodict', 0) 
+ calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'H2O', 'scan_a1') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('dummy') + + with patch('arc.job.pipe.pipe_run.logger') as mock_logger: + self.sched.pipe_coordinator.ingest_pipe_results(pipe) + warning_calls = [str(c) for c in mock_logger.warning.call_args_list] + self.assertTrue(any('no valid rotors_dict' in c for c in warning_calls)) + + +class TestTryPipeRotorScans1d(unittest.TestCase): + """Tests for _try_pipe_rotor_scans_1d.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_scan_route_') + self.sched = _make_scheduler(self.tmpdir) + self.sched.scan_level = Level(method='wb97xd', basis='def2-tzvp') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_scans_pipe_when_enough(self): + species = self.sched.species_dict['H2O'] + species.rotors_dict = {i: {'torsion': [0, 1, 2, 3], 'success': None} + for i in range(12)} + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_rotor_scans_1d('H2O', list(range(12))) + self.assertTrue(result) + pipe = list(self.sched.active_pipes.values())[0] + self.assertEqual(pipe.tasks[0].task_family, 'rotor_scan_1d') + self.assertEqual(pipe.tasks[0].owner_type, 'species') + self.assertEqual(pipe.tasks[0].owner_key, 'H2O') + self.assertIn('torsions', pipe.tasks[0].input_payload) + self.assertEqual(pipe.tasks[0].ingestion_metadata['rotor_index'], 0) + + def test_scans_no_pipe_below_threshold(self): + species = self.sched.species_dict['H2O'] + species.rotors_dict = {i: {'torsion': [0, 1, 2, 3], 'success': None} + for i in range(5)} + with patch.object(self.sched, 'deduce_job_adapter', return_value='gaussian'): + result = self.sched.pipe_planner.try_pipe_rotor_scans_1d('H2O', list(range(5))) + self.assertFalse(result) + + def test_scan_not_mixed_with_other_families(self): + """rotor_scan_1d and 
conf_opt cannot be in the same PipeRun.""" + t1 = _make_task_spec('t1', task_family='rotor_scan_1d') + t2 = _make_task_spec('t2', task_family='conf_opt') + run = PipeRun(project_directory=self.tmpdir, run_id='mixed', + tasks=[t1, t2], cluster_software='slurm') + with self.assertRaises(ValueError): + run.stage() + + +class TestResubmissionLifecycle(unittest.TestCase): + """Tests for #1: resubmission sets SUBMITTED status and clears flag.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_resub_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_resubmission_sets_submitted_status(self): + """After successful resubmission, pipe status should be SUBMITTED.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + pipe = self.sched.pipe_coordinator.submit_pipe_run('resub_test', tasks) + # Simulate needs_resubmission condition + pipe._needs_resubmission = True + pipe.status = PipeRunState.RECONCILING + # Mock submit_to_scheduler to succeed + with patch.object(pipe, 'submit_to_scheduler', return_value=('submitted', '12345')): + self.sched.pipe_coordinator.poll_pipes() + self.assertEqual(pipe.status, PipeRunState.SUBMITTED) + self.assertEqual(pipe.scheduler_job_id, '12345') + self.assertFalse(pipe._needs_resubmission) + + def test_resubmission_clears_flag_on_failure(self): + """After failed resubmission, flag should still be cleared to avoid infinite loops.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + pipe = self.sched.pipe_coordinator.submit_pipe_run('resub_fail', tasks) + pipe._needs_resubmission = True + pipe.status = PipeRunState.RECONCILING + with patch.object(pipe, 'submit_to_scheduler', return_value=('errored', None)): + self.sched.pipe_coordinator.poll_pipes() + self.assertFalse(pipe._needs_resubmission) + + +class TestShouldUsePipeOwnerType(unittest.TestCase): + """Tests for #4: owner_type homogeneity check.""" + + def setUp(self): + 
self.tmpdir = tempfile.mkdtemp(prefix='pipe_owner_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_rejects_mixed_owner_types(self): + """Batches with mixed owner_type should be rejected.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(15)] + # Override one task's owner_type + mixed = _make_task_spec('task_mixed') + mixed_dict = mixed.as_dict() + mixed_dict['owner_type'] = 'reaction' + mixed_task = TaskSpec.from_dict(mixed_dict) + # Manually set owner_type since from_dict bypasses validation + mixed_task.owner_type = 'reaction' + tasks[7] = mixed_task + self.assertFalse(self.sched.pipe_coordinator.should_use_pipe(tasks)) + + +class TestWorkerUsesMapping(unittest.TestCase): + """Tests for #3: worker uses TASK_FAMILY_TO_JOB_TYPE mapping.""" + + def test_dispatch_uses_central_mapping(self): + """Verify worker dispatch derives job_type from TASK_FAMILY_TO_JOB_TYPE.""" + from arc.scripts.pipe_worker import _dispatch_execution, _get_family_extra_kwargs + from arc.job.pipe.pipe_state import TASK_FAMILY_TO_JOB_TYPE + # ts_guess_batch_method -> 'tsg' (non-identity mapping) + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['ts_guess_batch_method'], 'tsg') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['ts_opt'], 'opt') + self.assertEqual(TASK_FAMILY_TO_JOB_TYPE['species_sp'], 'sp') + + def test_extra_kwargs_for_irc(self): + """IRC family should extract irc_direction from ingestion_metadata.""" + from arc.scripts.pipe_worker import _get_family_extra_kwargs + spec = _make_task_spec('irc_task', task_family='irc') + spec_dict = spec.as_dict() + spec_dict['task_family'] = 'irc' + spec_dict['ingestion_metadata'] = {'irc_direction': 'forward'} + irc_spec = TaskSpec.from_dict(spec_dict) + irc_spec.task_family = 'irc' + irc_spec.ingestion_metadata = {'irc_direction': 'forward'} + kwargs = _get_family_extra_kwargs(irc_spec) + self.assertEqual(kwargs, {'irc_direction': 'forward'}) + + +class 
TestFindOutputFileResultJson(unittest.TestCase): + """Tests for #6: find_output_file prefers result.json canonical path.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_output_test_') + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_prefers_result_json_canonical_path(self): + """find_output_file should use canonical_output_path from result.json.""" + from arc.job.pipe.pipe_run import find_output_file + attempt_dir = os.path.join(self.tmpdir, 'attempt_0') + os.makedirs(attempt_dir) + # Create a canonical output file + canonical_path = os.path.join(attempt_dir, 'my_output.out') + with open(canonical_path, 'w') as f: + f.write('output data') + # Write result.json pointing to it + import json + result = {'canonical_output_path': canonical_path} + with open(os.path.join(attempt_dir, 'result.json'), 'w') as f: + json.dump(result, f) + found = find_output_file(attempt_dir, 'gaussian', 'test_task') + self.assertEqual(found, canonical_path) + + def test_falls_back_to_walk_without_result_json(self): + """Without result.json, should fall back to filesystem walk.""" + from arc.job.pipe.pipe_run import find_output_file + attempt_dir = os.path.join(self.tmpdir, 'attempt_1') + calcs_dir = os.path.join(attempt_dir, 'calcs', 'subdir') + os.makedirs(calcs_dir) + out_file = os.path.join(calcs_dir, 'output.out') + with open(out_file, 'w') as f: + f.write('output data') + found = find_output_file(attempt_dir, 'some_engine', 'test_task') + self.assertEqual(found, out_file) + + def test_result_json_wins_over_walk(self): + """When both result.json and calcs/ contain valid files, result.json wins.""" + from arc.job.pipe.pipe_run import find_output_file + import json + attempt_dir = os.path.join(self.tmpdir, 'attempt_2') + # Create the canonical file pointed to by result.json + canonical_path = os.path.join(attempt_dir, 'canonical_output.log') + os.makedirs(attempt_dir) + with open(canonical_path, 'w') as f: + f.write('canonical 
output') + # Also create a file the walk would find (engine=gaussian -> input.log) + calcs_dir = os.path.join(attempt_dir, 'calcs', 'Species', 'spc') + os.makedirs(calcs_dir) + walk_path = os.path.join(calcs_dir, 'input.log') + with open(walk_path, 'w') as f: + f.write('walk output') + # Write result.json pointing to canonical + with open(os.path.join(attempt_dir, 'result.json'), 'w') as f: + json.dump({'canonical_output_path': canonical_path}, f) + found = find_output_file(attempt_dir, 'gaussian', 'test_task') + self.assertEqual(found, canonical_path) + self.assertNotEqual(found, walk_path) + + +class TestFreqIrcIngestionSafety(unittest.TestCase): + """Tests for #7: freq/irc ingestion initializes output structure if missing.""" + + def test_freq_ingestion_creates_output_entry(self): + """Freq ingestion should create output[label] if missing.""" + from arc.job.pipe.pipe_run import _ingest_species_freq + from arc.job.pipe.pipe_state import get_task_attempt_dir, initialize_task, TaskStateRecord + tmpdir = tempfile.mkdtemp(prefix='pipe_freq_test_') + try: + spec = _make_task_spec('freq_task', task_family='species_freq') + pipe_root = tmpdir + initialize_task(pipe_root, spec, max_attempts=3) + state = TaskStateRecord(status='completed', attempt_index=0, max_attempts=3, ended_at=time.time()) + species_dict = {'H2O': True} # species exists + output = {} # output entry MISSING + # Create a fake output file for find_output_file to find + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, 0) + os.makedirs(attempt_dir, exist_ok=True) + calcs_dir = os.path.join(attempt_dir, 'calcs') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('freq output') + _ingest_species_freq('run1', pipe_root, spec, state, species_dict, 'H2O', output) + self.assertIn('H2O', output) + self.assertIn('freq', output['H2O']['paths']) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + def 
test_irc_ingestion_creates_output_entry(self): + """IRC ingestion should create output[label] if missing.""" + from arc.job.pipe.pipe_run import _ingest_irc + from arc.job.pipe.pipe_state import get_task_attempt_dir, initialize_task, TaskStateRecord + tmpdir = tempfile.mkdtemp(prefix='pipe_irc_test_') + try: + spec = _make_task_spec('irc_task', task_family='irc') + pipe_root = tmpdir + initialize_task(pipe_root, spec, max_attempts=3) + state = TaskStateRecord(status='completed', attempt_index=0, max_attempts=3, ended_at=time.time()) + species_dict = {'TS_H2O': True} + output = {} # output entry MISSING + attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, 0) + os.makedirs(attempt_dir, exist_ok=True) + calcs_dir = os.path.join(attempt_dir, 'calcs') + os.makedirs(calcs_dir, exist_ok=True) + with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: + f.write('irc output') + _ingest_irc('run1', pipe_root, spec, state, species_dict, 'TS_H2O', output) + self.assertIn('TS_H2O', output) + self.assertIn('irc', output['TS_H2O']['paths']) + self.assertEqual(len(output['TS_H2O']['paths']['irc']), 1) + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +class TestSubmitPipeRunLifecycle(unittest.TestCase): + """Tests for #5: submit_pipe_run state consistency.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_lifecycle_test_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_successful_submission_sets_submitted(self): + """On successful submission, status should be SUBMITTED with job_id.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + with patch('arc.job.pipe.pipe_run.PipeRun.submit_to_scheduler', + return_value=('submitted', '99999')): + pipe = self.sched.pipe_coordinator.submit_pipe_run('success_run', tasks) + self.assertEqual(pipe.status, PipeRunState.SUBMITTED) + self.assertEqual(pipe.scheduler_job_id, '99999') + 
self.assertIsNotNone(pipe.submitted_at) + + def test_failed_submission_stays_staged(self): + """On failed submission, status should remain STAGED.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + with patch('arc.job.pipe.pipe_run.PipeRun.submit_to_scheduler', + return_value=('errored', None)): + pipe = self.sched.pipe_coordinator.submit_pipe_run('fail_run', tasks) + self.assertEqual(pipe.status, PipeRunState.STAGED) + self.assertIn('fail_run', self.sched.active_pipes) + + +class TestPollPipesIntegration(unittest.TestCase): + """Tests for #9: poll integration with schedule_jobs loop.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_poll_int_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_schedule_jobs_calls_poll_pipes_for_active_pipes(self): + """schedule_jobs should invoke poll_pipes when active_pipes is non-empty.""" + tasks = [_make_task_spec(f'task_{i}') for i in range(3)] + pipe = self.sched.pipe_coordinator.submit_pipe_run('poll_int', tasks) + # Complete all tasks so poll_pipes removes the pipe + for spec in pipe.tasks: + _complete_task(pipe.pipe_root, spec.task_id) + # Mock schedule_jobs loop by calling poll_pipes directly + # (full schedule_jobs is too heavy; this verifies the integration point) + self.sched.pipe_coordinator.poll_pipes() + self.assertNotIn('poll_int', self.sched.active_pipes) + + +class TestFlushPendingPipeSp(unittest.TestCase): + """Focused tests for deferred SP batch flushing.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_flush_sp_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_flush_clears_pending_and_calls_planner(self): + """Pending set is snapshotted, cleared, and planner is called with the labels.""" + self.sched._pending_pipe_sp = {'spc_A', 'spc_B'} + with patch.object(self.sched.pipe_planner, 
'try_pipe_species_sp', return_value={'spc_A', 'spc_B'}): + with patch.object(self.sched, 'run_sp_job') as mock_sp: + self.sched._flush_pending_pipe_sp() + self.assertEqual(self.sched._pending_pipe_sp, set()) + mock_sp.assert_not_called() # All piped, no fallback. + + def test_flush_falls_back_for_unhandled(self): + """Unhandled labels are submitted through run_sp_job.""" + self.sched._pending_pipe_sp = {'spc_A', 'spc_B', 'spc_C'} + with patch.object(self.sched.pipe_planner, 'try_pipe_species_sp', return_value={'spc_B'}): + with patch.object(self.sched, 'run_sp_job') as mock_sp: + self.sched._flush_pending_pipe_sp() + # spc_A and spc_C should fall back (sorted order) + self.assertEqual(mock_sp.call_count, 2) + fallback_labels = sorted([c.args[0] for c in mock_sp.call_args_list]) + self.assertEqual(fallback_labels, ['spc_A', 'spc_C']) + + def test_flush_noop_when_empty(self): + """Empty pending set should not call planner.""" + with patch.object(self.sched.pipe_planner, 'try_pipe_species_sp') as mock_planner: + self.sched._flush_pending_pipe_sp() + mock_planner.assert_not_called() + + +class TestFlushPendingPipeFreq(unittest.TestCase): + """Focused tests for deferred freq batch flushing.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_flush_freq_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_flush_falls_back_for_unhandled(self): + """Unhandled labels fall back to run_freq_job.""" + self.sched._pending_pipe_freq = {'spc_X', 'spc_Y'} + with patch.object(self.sched.pipe_planner, 'try_pipe_species_freq', return_value=set()): + with patch.object(self.sched, 'run_freq_job') as mock_freq: + self.sched._flush_pending_pipe_freq() + self.assertEqual(mock_freq.call_count, 2) + + +class TestFlushPendingPipeIrc(unittest.TestCase): + """Focused tests for deferred IRC batch flushing.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_flush_irc_') + 
self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_flush_falls_back_for_unhandled(self): + """Unhandled (label, direction) pairs fall back to run_irc_job.""" + self.sched._pending_pipe_irc = {('ts_A', 'forward'), ('ts_A', 'reverse')} + with patch.object(self.sched.pipe_planner, 'try_pipe_irc', return_value={('ts_A', 'forward')}): + with patch.object(self.sched, 'run_irc_job') as mock_irc: + self.sched._flush_pending_pipe_irc() + mock_irc.assert_called_once_with(label='ts_A', irc_direction='reverse') + + def test_flush_clears_pending(self): + """Pending set is cleared after flush.""" + self.sched._pending_pipe_irc = {('ts_B', 'forward')} + with patch.object(self.sched.pipe_planner, 'try_pipe_irc', return_value=set()): + with patch.object(self.sched, 'run_irc_job'): + self.sched._flush_pending_pipe_irc() + self.assertEqual(self.sched._pending_pipe_irc, set()) + + +class TestFlushPendingPipeConfSp(unittest.TestCase): + """Focused tests for deferred conformer SP batch flushing.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_flush_csp_') + self.sched = _make_scheduler(self.tmpdir) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_flush_passes_exact_indices_to_planner(self): + """Planner receives exactly the accumulated conformer indices.""" + self.sched._pending_pipe_conf_sp = {'H2O': {2, 5, 7}} + with patch.object(self.sched.pipe_planner, 'try_pipe_conf_sp', + return_value={2, 5, 7}) as mock_plan: + with patch.object(self.sched, 'run_sp_job') as mock_sp: + self.sched._flush_pending_pipe_conf_sp() + mock_plan.assert_called_once_with('H2O', [2, 5, 7]) + mock_sp.assert_not_called() + + def test_flush_falls_back_for_unhandled_indices(self): + """Unhandled conformer indices fall back to run_sp_job.""" + self.sched._pending_pipe_conf_sp = {'H2O': {0, 1, 2}} + with patch.object(self.sched.pipe_planner, 'try_pipe_conf_sp', return_value={1}): + 
with patch.object(self.sched, 'run_sp_job') as mock_sp: + self.sched._flush_pending_pipe_conf_sp() + # Indices 0 and 2 should fall back (sorted) + self.assertEqual(mock_sp.call_count, 2) + fallback_conformers = [c.kwargs.get('conformer') for c in mock_sp.call_args_list] + self.assertEqual(fallback_conformers, [0, 2]) + + def test_flush_clears_pending(self): + """Pending dict is cleared after flush.""" + self.sched._pending_pipe_conf_sp = {'H2O': {0}} + with patch.object(self.sched.pipe_planner, 'try_pipe_conf_sp', return_value=set()): + with patch.object(self.sched, 'run_sp_job'): + self.sched._flush_pending_pipe_conf_sp() + self.assertEqual(self.sched._pending_pipe_conf_sp, {}) + + def test_returned_handled_is_subset_of_candidates(self): + """Planner should never return indices outside the supplied candidates.""" + self.sched._pending_pipe_conf_sp = {'H2O': {3, 4}} + # Simulate planner returning a superset — the flush should still work + # because it only checks `conformer_indices - piped`. 
+ with patch.object(self.sched.pipe_planner, 'try_pipe_conf_sp', + return_value={3, 4, 99}): + with patch.object(self.sched, 'run_sp_job') as mock_sp: + self.sched._flush_pending_pipe_conf_sp() + mock_sp.assert_not_called() # {3,4} - {3,4,99} = empty + + +if __name__ == '__main__': + unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) From 1b2a9cb22a4a8296e354f873293dbe89795e99e1 Mon Sep 17 00:00:00 2001 From: Alon Grinberg Dana Date: Fri, 3 Apr 2026 15:25:09 +0300 Subject: [PATCH 16/60] Docs: pipe mode --- docs/source/advanced.rst | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 94e871cec9..8db1458943 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -918,4 +918,61 @@ Alternatively, the user may request to compute the rate coefficients in the clas instructs the relevant statmech program to compute rate coefficients in the classical two-parameter Arrhenius format for all reactions in the same ARC project. +.. _pipe_mode: + +Pipe mode (distributed HPC execution) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pipe mode allows ARC to batch many independent jobs (e.g., conformer optimizations) +into a single SLURM/PBS/SGE/HTCondor array allocation. +Instead of submitting hundreds of individual cluster jobs, ARC stages all tasks on +disk and launches a small number of array workers that claim and execute tasks from +a shared task directory. + +**When does ARC use pipe mode?** + +ARC automatically evaluates pipe eligibility when scheduling batches of homogeneous +jobs (same engine, level of theory, and resource requirements). +By default, pipe mode activates when a batch has 10 or more tasks. +Below that threshold, ARC uses its normal per-job submission path. 
+ +**Supported job types:** + +- Conformer optimization (``conf_opt``) and single-point (``conf_sp``) +- TS guess generation and TS optimization +- Species single-point, frequency, and IRC calculations +- 1D rotor scans + +**What pipe mode does and does not do:** + +- Pipe executes only ready "leaf" jobs. All quality checks, troubleshooting, + and downstream decision-making remain in ARC's main scheduler. +- Failed tasks are retried automatically (configurable). + If a task exhausts its retry budget, it is marked as terminally failed + and reported to the scheduler for manual review. +- Each array worker verifies task ownership before writing results, + preventing stale workers from overwriting state after lease expiration. + +**Configuration:** + +Pipe mode is configured via ``pipe_settings`` in ``arc/settings/settings.py`` +(or in ``~/.arc/settings.py`` to override per-installation):: + + pipe_settings = { + 'enabled': True, # Set to False to disable pipe mode entirely. + 'min_tasks': 10, # Minimum batch size to trigger pipe mode. + 'max_workers': 100, # Upper bound on array worker slots per PipeRun. + 'max_attempts': 3, # Retry budget per task before terminal failure. + 'lease_duration_s': 86400, # Worker lease duration in seconds (default 24h). + } + +**Submit scripts:** + +Pipe mode generates array submit scripts under the run directory +(``/runs/pipe_/submit.sh``). +The templates follow ARC's existing submit-script conventions from +``arc/settings/submit.py`` and support SLURM, PBS, SGE, and HTCondor. +Users who customize their submit templates can edit the ``pipe_submit`` +dictionary in ``submit.py``. + .. 
include:: links.txt From 42b0798dce0fdf5b05271c5380dc469dd2a91426 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 5 Apr 2026 18:04:10 +0300 Subject: [PATCH 17/60] Fixes --- arc/job/adapters/gaussian.py | 16 +++++++++------- arc/job/pipe/pipe_run.py | 6 ++++++ arc/job/pipe/pipe_run_test.py | 4 ++-- arc/scripts/pipe_worker.py | 4 ++++ arc/settings/settings.py | 3 +++ arc/settings/submit.py | 14 +++++++++----- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/arc/job/adapters/gaussian.py b/arc/job/adapters/gaussian.py index 476af80e67..9321d454f2 100644 --- a/arc/job/adapters/gaussian.py +++ b/arc/job/adapters/gaussian.py @@ -155,7 +155,7 @@ def __init__(self, self.incore_capacity = 1 self.job_adapter = 'gaussian' self.execution_type = execution_type or 'queue' - self.command = ['g03', 'g09', 'g16'] + self.command = ['g16', 'g09', 'g03'] self.url = 'https://gaussian.com/' if species is None: @@ -500,13 +500,15 @@ def execute_incore(self): """ Execute a job incore. """ - which(self.command, - return_bool=True, - raise_error=True, - raise_msg=f'Please install {self.job_adapter}, see {self.url} for more information.', - ) + binary = which(self.command, + return_bool=False, + raise_error=True, + raise_msg=f'Please install {self.job_adapter}, see {self.url} for more information.', + ) + binary_name = os.path.basename(binary) self._log_job_execution() - execute_command(incore_commands[self.job_adapter]) + commands = [cmd.replace('g16', binary_name) for cmd in incore_commands[self.job_adapter]] + execute_command(commands) def execute_queue(self): """ diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 4f23951e8e..3b963c985c 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -211,6 +211,10 @@ def write_submit_script(self) -> str: f'No pipe submit template for cluster software: {self.cluster_software}. 
' f'Available templates: {list(pipe_submit.keys())}') cpus, memory_mb, array_size = self._submission_resources() + server = servers_dict.get('local', {}) + queue, _ = next(iter(server.get('queues', {}).items()), ('', None)) + engine = self.tasks[0].engine if self.tasks else '' + env_setup = pipe_settings.get('env_setup', {}).get(engine, '') content = pipe_submit[template_key].format( name=f'pipe_{self.run_id}', max_task_num=array_size, @@ -218,6 +222,8 @@ def write_submit_script(self) -> str: python_exe=sys.executable, cpus=cpus, memory=memory_mb, + queue=queue, + env_setup=env_setup, ) filename = 'submit.sub' if self.cluster_software == 'htcondor' else 'submit.sh' submit_path = os.path.join(self.pipe_root, filename) diff --git a/arc/job/pipe/pipe_run_test.py b/arc/job/pipe/pipe_run_test.py index 4f93a1726d..15b9230230 100644 --- a/arc/job/pipe/pipe_run_test.py +++ b/arc/job/pipe/pipe_run_test.py @@ -159,8 +159,8 @@ def test_pbs_content(self): path = run.write_submit_script() with open(path) as f: content = f.read() - self.assertIn('#PBS -t 1-8', content) - self.assertIn('WORKER_ID=$PBS_ARRAYID', content) + self.assertIn('#PBS -J 1-8', content) + self.assertIn('WORKER_ID="$PBS_ARRAY_INDEX"', content) def test_htcondor_content(self): run = self._make_run('htcondor', max_workers=12, n_tasks=12) diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index 2ece334c6b..48d0cf64ff 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -262,6 +262,10 @@ def _run_adapter(spec: TaskSpec, scratch_dir: str, job_type: str, **extra_kwargs **extra_kwargs, ) job.execute() + output_file = getattr(job, 'local_path_to_output_file', None) + if output_file and not os.path.isfile(output_file): + raise RuntimeError(f'{spec.engine} produced no output file at {output_file}. 
' + f'The engine may not be installed or configured on this node.') # --------------------------------------------------------------------------- diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 057ac4f3a4..41ae0e40fe 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -313,6 +313,9 @@ 'max_workers': 100, # Upper bound on array worker slots per PipeRun. 'max_attempts': 3, # Retry budget per task before terminal failure. 'lease_duration_s': 86400, # Worker lease duration in seconds (default 24h). + 'env_setup': {}, # Engine-specific shell setup commands, e.g., + # {'gaussian': 'source /usr/local/g09/setup.sh', + # 'orca': 'source /usr/local/orca/setup.sh'} } # Criteria for identification of imaginary frequencies for transition states. diff --git a/arc/settings/submit.py b/arc/settings/submit.py index e9e7b24908..993681319a 100644 --- a/arc/settings/submit.py +++ b/arc/settings/submit.py @@ -50,6 +50,7 @@ pipe_submit = { 'slurm': """#!/bin/bash -l #SBATCH -J {name} +#SBATCH -p {queue} #SBATCH -N 1 #SBATCH -n {cpus} #SBATCH --mem={memory} @@ -57,30 +58,33 @@ #SBATCH -o {pipe_root}/out_%a.txt #SBATCH -e {pipe_root}/err_%a.txt +{env_setup} WORKER_ID=$SLURM_ARRAY_TASK_ID {python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID """, 'pbs': """#!/bin/bash -l #PBS -N {name} +#PBS -q {queue} #PBS -l ncpus={cpus} #PBS -l mem={memory}mb -#PBS -t 1-{max_task_num} -#PBS -o {pipe_root}/out_$PBS_ARRAYID.txt -#PBS -e {pipe_root}/err_$PBS_ARRAYID.txt +#PBS -J 1-{max_task_num} -WORKER_ID=$PBS_ARRAYID +{env_setup} +WORKER_ID="$PBS_ARRAY_INDEX" -{python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID +{python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id "$WORKER_ID" """, 'sge': """#!/bin/bash -l #$ -N {name} +#$ -q {queue} #$ -pe smp {cpus} #$ -l h_vmem={memory}M #$ -t 1-{max_task_num} #$ -o {pipe_root}/out_$SGE_TASK_ID.txt #$ -e {pipe_root}/err_$SGE_TASK_ID.txt 
+{env_setup} WORKER_ID=$SGE_TASK_ID {python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID From 63207c4e8f0cc74fac45bcc23719472ab3b5f18c Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 5 Apr 2026 20:56:59 +0300 Subject: [PATCH 18/60] Adjustments --- arc/job/pipe/pipe_run.py | 6 +++++- arc/settings/settings.py | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 3b963c985c..3546163bc4 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -214,7 +214,11 @@ def write_submit_script(self) -> str: server = servers_dict.get('local', {}) queue, _ = next(iter(server.get('queues', {}).items()), ('', None)) engine = self.tasks[0].engine if self.tasks else '' - env_setup = pipe_settings.get('env_setup', {}).get(engine, '') + engine_env = pipe_settings.get('env_setup', {}).get(engine, {}) + if isinstance(engine_env, dict): + env_setup = '\n'.join(engine_env.values()) + else: + env_setup = engine_env # backward compat: plain string content = pipe_submit[template_key].format( name=f'pipe_{self.run_id}', max_task_num=array_size, diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 41ae0e40fe..493c8143d3 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -313,9 +313,11 @@ 'max_workers': 100, # Upper bound on array worker slots per PipeRun. 'max_attempts': 3, # Retry budget per task before terminal failure. 'lease_duration_s': 86400, # Worker lease duration in seconds (default 24h). 
- 'env_setup': {}, # Engine-specific shell setup commands, e.g., - # {'gaussian': 'source /usr/local/g09/setup.sh', - # 'orca': 'source /usr/local/orca/setup.sh'} + 'env_setup': {}, # Engine-specific shell setup commands, nested by version, e.g., + # {'gaussian': {'g16': 'source /usr/local/g16/setup.sh', + # 'g09': 'source /usr/local/g09/setup.sh'}, + # 'orca': {'6.0': 'source /usr/local/orca-6.0/setup.sh', + # '5.4': 'source /usr/local/orca-5.0.4/setup.sh'}} } # Criteria for identification of imaginary frequencies for transition states. From 5c7dc77b20146500f9210656c1df5eba7eafdf79 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Sun, 5 Apr 2026 23:53:32 +0300 Subject: [PATCH 19/60] Update --- arc/job/pipe/pipe_run.py | 6 +----- arc/scripts/pipe_worker.py | 7 ++++++- arc/settings/settings.py | 8 +++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 3546163bc4..3b963c985c 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -214,11 +214,7 @@ def write_submit_script(self) -> str: server = servers_dict.get('local', {}) queue, _ = next(iter(server.get('queues', {}).items()), ('', None)) engine = self.tasks[0].engine if self.tasks else '' - engine_env = pipe_settings.get('env_setup', {}).get(engine, {}) - if isinstance(engine_env, dict): - env_setup = '\n'.join(engine_env.values()) - else: - env_setup = engine_env # backward compat: plain string + env_setup = pipe_settings.get('env_setup', {}).get(engine, '') content = pipe_submit[template_key].format( name=f'pipe_{self.run_id}', max_task_num=array_size, diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index 48d0cf64ff..4afdaa1aa7 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -261,7 +261,12 @@ def _run_adapter(spec: TaskSpec, scratch_dir: str, job_type: str, **extra_kwargs testing=False, **extra_kwargs, ) - job.execute() + original_dir = os.getcwd() + try: + 
os.chdir(job.local_path) + job.execute() + finally: + os.chdir(original_dir) output_file = getattr(job, 'local_path_to_output_file', None) if output_file and not os.path.isfile(output_file): raise RuntimeError(f'{spec.engine} produced no output file at {output_file}. ' diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 493c8143d3..2d7144047f 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -313,11 +313,9 @@ 'max_workers': 100, # Upper bound on array worker slots per PipeRun. 'max_attempts': 3, # Retry budget per task before terminal failure. 'lease_duration_s': 86400, # Worker lease duration in seconds (default 24h). - 'env_setup': {}, # Engine-specific shell setup commands, nested by version, e.g., - # {'gaussian': {'g16': 'source /usr/local/g16/setup.sh', - # 'g09': 'source /usr/local/g09/setup.sh'}, - # 'orca': {'6.0': 'source /usr/local/orca-6.0/setup.sh', - # '5.4': 'source /usr/local/orca-5.0.4/setup.sh'}} + 'env_setup': {}, # Engine-specific shell setup commands, e.g., + # {'gaussian': 'source /usr/local/g09/setup.sh', + # 'orca': 'source /usr/local/orca-5.0.4/setup.sh && source /usr/local/openmpi-4.1.1/setup.sh'} } # Criteria for identification of imaginary frequencies for transition states. 
From c46792cf5e20e69d4a82e33233e938758003a709 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 11:37:05 +0300 Subject: [PATCH 20/60] Thread back to Scheduler after completion of TS pipeline --- arc/job/pipe/pipe_coordinator.py | 92 ++++++++++++++++++++++++++++++-- arc/job/pipe/pipe_run.py | 31 +++++++++-- arc/scheduler_pipe_test.py | 39 ++++++++++---- 3 files changed, 144 insertions(+), 18 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 1efab8be65..892f71fcf8 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -44,6 +44,7 @@ def __init__(self, sched: 'Scheduler'): self.sched = sched self.active_pipes: Dict[str, PipeRun] = {} self._pipe_poll_failures: Dict[str, int] = {} + self._last_pipe_summary: Dict[str, str] = {} def should_use_pipe(self, tasks: List[TaskSpec]) -> bool: """ @@ -100,7 +101,7 @@ def submit_pipe_run(self, run_id: str, tasks: List[TaskSpec], return pipe try: job_status, job_id = pipe.submit_to_scheduler() - if job_status == 'submitted' and job_id: + if job_id and job_status in ('submitted', 'running'): pipe.scheduler_job_id = job_id pipe.status = PipeRunState.SUBMITTED pipe.submitted_at = time.time() @@ -154,12 +155,14 @@ def poll_pipes(self) -> None: continue self._pipe_poll_failures.pop(run_id, None) summary = ', '.join(f'{state}: {n}' for state, n in sorted(counts.items()) if n > 0) - logger.info(f'Pipe run {run_id}: {summary}') + if summary != self._last_pipe_summary.get(run_id): + logger.info(f'Pipe run {run_id}: {summary}') + self._last_pipe_summary[run_id] = summary if pipe.needs_resubmission: logger.info(f'Pipe run {run_id}: resubmitting to pick up retried tasks.') try: job_status, job_id = pipe.submit_to_scheduler() - if job_status == 'submitted' and job_id: + if job_id and job_status in ('submitted', 'running'): pipe.scheduler_job_id = job_id pipe.status = PipeRunState.SUBMITTED pipe.submitted_at = time.time() @@ -184,7 +187,9 @@ def 
ingest_pipe_results(self, pipe: PipeRun) -> None: Ingest results from a terminal pipe run. Dispatches by task_family. One broken task does not abort - ingestion of remaining tasks. + ingestion of remaining tasks. After all per-task ingestion, + triggers family-specific post-processing (e.g., selecting + the best conformer and spawning the next job). """ for spec in pipe.tasks: try: @@ -203,3 +208,82 @@ def ingest_pipe_results(self, pipe: PipeRun) -> None: elif state.status == TaskState.CANCELLED.value: logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'was cancelled.') + self._post_ingest_pipe_run(pipe) + + def _post_ingest_pipe_run(self, pipe: PipeRun) -> None: + """ + Trigger family-specific post-processing after all tasks in a pipe run + have been individually ingested. + + Families requiring post-processing: + - ts_opt: determine best TS conformer, then run opt job + - conf_opt: determine most stable conformer, then run opt job + - conf_sp: determine most stable conformer (sp_flag), then run opt job + + Other families (species_sp, species_freq, irc, rotor_scan_1d) are + leaf jobs with no batch-level post-processing. 
+ """ + if not pipe.tasks: + return + task_family = pipe.tasks[0].task_family + label = pipe.tasks[0].owner_key + if not label or label not in self.sched.species_dict: + return + if task_family == 'ts_opt': + self._post_ingest_ts_opt(label) + elif task_family == 'conf_opt': + self._post_ingest_conf_opt(label) + elif task_family == 'conf_sp': + self._post_ingest_conf_sp(label) + + def _post_ingest_ts_opt(self, label: str) -> None: + """After all TS opt tasks, pick the best conformer and run proper opt.""" + ts_species = self.sched.species_dict[label] + if not ts_species.is_ts: + logger.warning(f'_post_ingest_ts_opt called for non-TS species {label}, skipping.') + return + if all(tsg.energy is None for tsg in ts_species.ts_guesses): + logger.error(f'No ts_opt task converged for TS {label}.') + return + logger.info(f'\nConformer jobs for {label} successfully terminated (pipe mode).\n') + try: + self.sched.determine_most_likely_ts_conformer(label) + except Exception: + logger.error(f'Failed to determine most likely TS conformer for {label}.', exc_info=True) + return + if ts_species.initial_xyz is not None: + if not self.sched.composite_method: + self.sched.run_opt_job(label, fine=self.sched.fine_only) + else: + self.sched.run_composite_job(label) + + def _post_ingest_conf_opt(self, label: str) -> None: + """After all conformer opt tasks, pick the best conformer and run opt.""" + logger.info(f'\nConformer opt jobs for {label} successfully terminated (pipe mode).\n') + try: + if self.sched.species_dict[label].is_ts: + self.sched.determine_most_likely_ts_conformer(label) + else: + self.sched.determine_most_stable_conformer(label, sp_flag=False) + except Exception: + logger.error(f'Failed to determine most stable conformer for {label}.', exc_info=True) + return + if self.sched.species_dict[label].initial_xyz is not None: + if not self.sched.composite_method: + self.sched.run_opt_job(label, fine=self.sched.fine_only) + else: + self.sched.run_composite_job(label) + + def 
_post_ingest_conf_sp(self, label: str) -> None: + """After all conformer SP tasks, pick the best conformer and run opt.""" + logger.info(f'\nConformer SP jobs for {label} successfully terminated (pipe mode).\n') + try: + self.sched.determine_most_stable_conformer(label, sp_flag=True) + except Exception: + logger.error(f'Failed to determine most stable conformer for {label}.', exc_info=True) + return + if self.sched.species_dict[label].initial_xyz is not None: + if not self.sched.composite_method: + self.sched.run_opt_job(label, fine=self.sched.fine_only) + else: + self.sched.run_composite_job(label) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 3b963c985c..73805f83de 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -544,26 +544,49 @@ def _ingest_ts_guess_batch(run_id, pipe_root, spec, state, species_dict, label): def _ingest_ts_opt(run_id, pipe_root, spec, state, species_dict, label): + """Ingest a completed ts_opt task: update the matching TSGuess's opt_xyz and energy.""" + from arc.job.trsh import determine_ess_status if label not in species_dict: logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' f'TS species "{label}" not in species_dict, skipping.') return + meta = spec.ingestion_metadata or {} + conformer_index = meta.get('conformer_index') + if conformer_index is None: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'missing conformer_index in ingestion_metadata, skipping.') + return attempt_dir = get_task_attempt_dir(pipe_root, spec.task_id, state.attempt_index) ts_species = species_dict[label] try: output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) if output_file is None: return + ess_status, keywords, error, line = determine_ess_status( + output_path=output_file, species_label=label, + job_type='opt', software=spec.engine) + if ess_status != 'done': + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'optimization did not converge (status={ess_status}, 
' + f'keywords={keywords}). Skipping.') + return xyz = parser.parse_geometry(log_file_path=output_file) e_elect = parser.parse_e_elect(log_file_path=output_file) except Exception as e: logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' f'parsing failed for {attempt_dir}: {type(e).__name__}: {e}') return - if xyz is not None: - ts_species.final_xyz = xyz - if e_elect is not None: - ts_species.e_elect = e_elect + for tsg in ts_species.ts_guesses: + if getattr(tsg, 'conformer_index', None) == conformer_index: + if xyz is not None: + tsg.opt_xyz = xyz + if e_elect is not None: + tsg.energy = e_elect + tsg.index = conformer_index + break + else: + logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' + f'no TSGuess with conformer_index={conformer_index} for {label}.') def _ingest_species_sp(run_id, pipe_root, spec, state, species_dict, label): diff --git a/arc/scheduler_pipe_test.py b/arc/scheduler_pipe_test.py index d38ca55420..872ac52657 100644 --- a/arc/scheduler_pipe_test.py +++ b/arc/scheduler_pipe_test.py @@ -387,7 +387,9 @@ def test_ingest_updates_species_conformer(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ - patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ + patch.object(self.sched, 'determine_most_stable_conformer'), \ + patch.object(self.sched, 'run_opt_job'): self.sched.pipe_coordinator.ingest_pipe_results(pipe) species = self.sched.species_dict['H2O'] self.assertEqual(species.conformers[2], mock_xyz) @@ -451,7 +453,9 @@ def test_ingest_continues_on_missing_output(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', 
return_value=mock_xyz), \ - patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ + patch.object(self.sched, 'determine_most_stable_conformer'), \ + patch.object(self.sched, 'run_opt_job'): self.sched.pipe_coordinator.ingest_pipe_results(pipe) species = self.sched.species_dict['H2O'] self.assertEqual(species.conformers[1], mock_xyz) @@ -478,7 +482,9 @@ def mock_parse_geometry(log_file_path): return mock_xyz with patch('arc.job.pipe.pipe_run.parser.parse_geometry', side_effect=mock_parse_geometry), \ - patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-10.0): + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-10.0), \ + patch.object(self.sched, 'determine_most_stable_conformer'), \ + patch.object(self.sched, 'run_opt_job'): self.sched.pipe_coordinator.ingest_pipe_results(pipe) species = self.sched.species_dict['H2O'] self.assertEqual(species.conformers[0], mock_xyz) @@ -589,8 +595,16 @@ def tearDown(self): shutil.rmtree(self.tmpdir, ignore_errors=True) def test_ts_opt_ingestion_updates_species(self): - """ts_opt ingestion sets final_xyz and e_elect on the TS species.""" - ts_label = 'H2O' # reusing existing species as TS proxy + """ts_opt ingestion updates the matching TSGuess's opt_xyz and energy.""" + from arc.species.species import TSGuess + ts_label = 'H2O' + species = self.sched.species_dict[ts_label] + species.is_ts = True + tsg = TSGuess(method='heuristics', index=0) + tsg.success = True + tsg.conformer_index = 0 + species.ts_guesses = [tsg] + task = _make_task_spec('ts_opt_task', task_family='ts_opt', species_label=ts_label, conformer_index=0) pipe = PipeRun(project_directory=self.tmpdir, run_id='ts_opt_ingest', @@ -606,11 +620,14 @@ def test_ts_opt_ingestion_updates_species(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with 
patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ - patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-50.0): + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-50.0), \ + patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ + patch.object(self.sched, 'determine_most_likely_ts_conformer'), \ + patch.object(self.sched, 'run_opt_job'): self.sched.pipe_coordinator.ingest_pipe_results(pipe) - species = self.sched.species_dict[ts_label] - self.assertEqual(species.final_xyz, mock_xyz) - self.assertAlmostEqual(species.e_elect, -50.0) + self.assertEqual(tsg.opt_xyz, mock_xyz) + self.assertAlmostEqual(tsg.energy, -50.0) + self.assertEqual(tsg.index, 0) def test_ts_guess_batch_ingestion_calls_process(self): """ts_guess_batch_method ingestion calls process_completed_tsg_queue_jobs.""" @@ -703,7 +720,9 @@ def test_conf_opt_updates_both_geometry_and_energy(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ - patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5): + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ + patch.object(self.sched, 'determine_most_stable_conformer'), \ + patch.object(self.sched, 'run_opt_job'): self.sched.pipe_coordinator.ingest_pipe_results(pipe) species = self.sched.species_dict['H2O'] # Both geometry and energy must be updated (ARC uses opt-level energy for ranking) From 9840d3361b1dd99f5fd41ea1b3b85a6dc73fb3ab Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 12:26:44 +0300 Subject: [PATCH 21/60] Race to condition --- arc/job/pipe/pipe_run.py | 59 +++++++++++++++++++++++++++---------- arc/scheduler_pipe_test.py | 60 ++++++++++++++++++++++++++++---------- 2 files changed, 88 insertions(+), 31 deletions(-) diff --git 
a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 73805f83de..f0d91fe824 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -289,6 +289,7 @@ def reconcile(self) -> Dict[str, int]: now = time.time() counts: Dict[str, int] = {s.value: 0 for s in TaskState} + retried_pending = 0 # PENDING tasks with attempt_index > 0 (genuinely retried) task_ids = sorted(os.listdir(tasks_dir)) for task_id in task_ids: @@ -311,6 +312,8 @@ def reconcile(self) -> Dict[str, int]: except (ValueError, TimeoutError) as e: logger.debug(f'Could not mark task {task_id} as ORPHANED ' f'(another process may be handling it): {e}') + if current == TaskState.PENDING and state.attempt_index > 0: + retried_pending += 1 counts[current.value] += 1 active_workers = counts[TaskState.CLAIMED.value] + counts[TaskState.RUNNING.value] @@ -339,6 +342,7 @@ def reconcile(self) -> Dict[str, int]: failure_class=None, retry_disposition=None) counts[current.value] -= 1 counts[TaskState.PENDING.value] += 1 + retried_pending += 1 else: ended = state.ended_at or now update_task_state(self.pipe_root, task_id, @@ -350,13 +354,14 @@ def reconcile(self) -> Dict[str, int]: logger.debug(f'Could not promote task {task_id} to FAILED_TERMINAL ' f'(lock contention or concurrent state change): {e}') - # If retries were scheduled but no workers remain, flag for resubmission. - pending_after_retry = counts[TaskState.PENDING.value] + # Only flag resubmission for genuinely retried tasks (attempt_index > 0). + # Fresh PENDING tasks (attempt_index == 0) are waiting for the initial + # submission's workers to start — don't resubmit for those. active_after_retry = counts[TaskState.CLAIMED.value] + counts[TaskState.RUNNING.value] - if pending_after_retry > 0 and active_after_retry == 0: + if retried_pending > 0 and active_after_retry == 0: self._needs_resubmission = True - logger.info(f'Pipe run {self.run_id}: {pending_after_retry} retryable tasks reset ' - f'to PENDING but no workers remain. 
Resubmission needed.') + logger.info(f'Pipe run {self.run_id}: {retried_pending} retried tasks ' + f'need workers. Resubmission needed.') else: self._needs_resubmission = False @@ -441,6 +446,29 @@ def find_output_file(attempt_dir: str, engine: str, task_id: str = '') -> Option return None +def _check_ess_convergence(pipe_run_id: str, spec: TaskSpec, output_file: str, label: str) -> bool: + """ + Check whether an ESS job converged by inspecting the output file. + + Returns ``True`` if the job converged (status == 'done'), ``False`` otherwise. + Families that don't run ESS (e.g., ts_guess_batch_method) should skip this check. + """ + from arc.job.trsh import determine_ess_status + try: + status, keywords, error, line = determine_ess_status( + output_path=output_file, species_label=label, + job_type='opt', software=spec.engine) + except Exception as e: + logger.warning(f'Pipe run {pipe_run_id}, task {spec.task_id}: ' + f'could not determine ESS status: {type(e).__name__}: {e}') + return False + if status != 'done': + logger.warning(f'Pipe run {pipe_run_id}, task {spec.task_id}: ' + f'ESS job did not converge (status={status}, keywords={keywords}). 
Skipping.') + return False + return True + + def ingest_completed_task(pipe_run_id: str, pipe_root: str, spec: TaskSpec, state: 'TaskStateRecord', species_dict: dict, output: dict) -> None: @@ -493,6 +521,8 @@ def _ingest_conf_opt(run_id, pipe_root, spec, state, species_dict, label, confor output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) if output_file is None: return + if not _check_ess_convergence(run_id, spec, output_file, label): + return xyz = parser.parse_geometry(log_file_path=output_file) e_elect = parser.parse_e_elect(log_file_path=output_file) except Exception as e: @@ -513,6 +543,8 @@ def _ingest_conf_sp(run_id, pipe_root, spec, state, species_dict, label, conform output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) if output_file is None: return + if not _check_ess_convergence(run_id, spec, output_file, label): + return e_elect = parser.parse_e_elect(log_file_path=output_file) except Exception as e: logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' @@ -545,7 +577,6 @@ def _ingest_ts_guess_batch(run_id, pipe_root, spec, state, species_dict, label): def _ingest_ts_opt(run_id, pipe_root, spec, state, species_dict, label): """Ingest a completed ts_opt task: update the matching TSGuess's opt_xyz and energy.""" - from arc.job.trsh import determine_ess_status if label not in species_dict: logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' f'TS species "{label}" not in species_dict, skipping.') @@ -562,13 +593,7 @@ def _ingest_ts_opt(run_id, pipe_root, spec, state, species_dict, label): output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) if output_file is None: return - ess_status, keywords, error, line = determine_ess_status( - output_path=output_file, species_label=label, - job_type='opt', software=spec.engine) - if ess_status != 'done': - logger.warning(f'Pipe run {run_id}, task {spec.task_id}: ' - f'optimization did not converge (status={ess_status}, ' - f'keywords={keywords}). 
Skipping.') + if not _check_ess_convergence(run_id, spec, output_file, label): return xyz = parser.parse_geometry(log_file_path=output_file) e_elect = parser.parse_e_elect(log_file_path=output_file) @@ -600,6 +625,8 @@ def _ingest_species_sp(run_id, pipe_root, spec, state, species_dict, label): output_file = find_output_file(attempt_dir, spec.engine, spec.task_id) if output_file is None: return + if not _check_ess_convergence(run_id, spec, output_file, label): + return e_elect = parser.parse_e_elect(log_file_path=output_file) except Exception as e: logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' @@ -621,7 +648,7 @@ def _ingest_species_freq(run_id, pipe_root, spec, state, species_dict, label, ou logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' f'output lookup failed: {type(e).__name__}: {e}') return - if output_file is not None: + if output_file is not None and _check_ess_convergence(run_id, spec, output_file, label): if label not in output: output[label] = {'paths': {}} elif 'paths' not in output[label]: @@ -641,7 +668,7 @@ def _ingest_irc(run_id, pipe_root, spec, state, species_dict, label, output): logger.error(f'Pipe run {run_id}, task {spec.task_id}: ' f'output lookup failed: {type(e).__name__}: {e}') return - if output_file is not None: + if output_file is not None and _check_ess_convergence(run_id, spec, output_file, label): if label not in output: output[label] = {'paths': {'irc': []}} elif 'paths' not in output[label]: @@ -665,6 +692,8 @@ def _ingest_rotor_scan_1d(run_id, pipe_root, spec, state, species_dict, label): return if output_file is None: return + if not _check_ess_convergence(run_id, spec, output_file, label): + return meta = spec.ingestion_metadata or {} rotor_index = meta.get('rotor_index') if rotor_index is None: diff --git a/arc/scheduler_pipe_test.py b/arc/scheduler_pipe_test.py index 872ac52657..8484da04b9 100644 --- a/arc/scheduler_pipe_test.py +++ b/arc/scheduler_pipe_test.py @@ -387,6 +387,7 @@ def 
test_ingest_updates_species_conformer(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ patch.object(self.sched, 'determine_most_stable_conformer'), \ patch.object(self.sched, 'run_opt_job'): @@ -406,7 +407,8 @@ def test_ingest_terminal_failure_logs_error(self): update_task_state(pipe.pipe_root, 'task_fail', new_status=TaskState.RUNNING, started_at=now) update_task_state(pipe.pipe_root, 'task_fail', new_status=TaskState.FAILED_TERMINAL, ended_at=now, failure_class='oom') - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) self.assertIsNone(self.sched.species_dict['H2O'].conformers[0]) def test_ingest_cancelled_task_logged(self): @@ -417,14 +419,16 @@ def test_ingest_cancelled_task_logged(self): now = time.time() update_task_state(pipe.pipe_root, 'task_cancel', new_status=TaskState.CANCELLED, ended_at=now) with patch('arc.job.pipe.pipe_coordinator.logger') as mock_logger: - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) warning_calls = [str(c) for c in mock_logger.warning.call_args_list] self.assertTrue(any('cancelled' in c.lower() for c in warning_calls)) def test_ingest_skips_unknown_species(self): pipe, _ = self._make_pipe_with_completed_task( task_id='task_unknown', species_label='NONEXISTENT', conformer_index=0) - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], 
'', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) def test_ingest_missing_conformer_index(self): """conf_opt task with empty ingestion_metadata is skipped with warning.""" @@ -436,7 +440,8 @@ def test_ingest_missing_conformer_index(self): pipe.stage() _complete_task(pipe.pipe_root, 'task_no_idx') with patch('arc.job.pipe.pipe_run.logger') as mock_logger: - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) warning_calls = [str(c) for c in mock_logger.warning.call_args_list] self.assertTrue(any('conformer_index' in c for c in warning_calls)) @@ -453,6 +458,7 @@ def test_ingest_continues_on_missing_output(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ patch.object(self.sched, 'determine_most_stable_conformer'), \ patch.object(self.sched, 'run_opt_job'): @@ -482,6 +488,7 @@ def mock_parse_geometry(log_file_path): return mock_xyz with patch('arc.job.pipe.pipe_run.parser.parse_geometry', side_effect=mock_parse_geometry), \ + patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-10.0), \ patch.object(self.sched, 'determine_most_stable_conformer'), \ patch.object(self.sched, 'run_opt_job'): @@ -518,7 +525,8 @@ def test_conf_sp_ingestion_updates_energy(self): species = self.sched.species_dict['H2O'] species.conformers[1] = {'symbols': ('O',), 'coords': ((0, 0, 0),)} # pre-existing geometry - with patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-99.9): + with 
patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-99.9): self.sched.pipe_coordinator.ingest_pipe_results(pipe) # Energy updated @@ -646,7 +654,8 @@ def test_ts_guess_batch_ingestion_calls_process(self): species = self.sched.species_dict[ts_label] with patch.object(species, 'process_completed_tsg_queue_jobs') as mock_process: - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) mock_process.assert_called_once() def test_ts_not_mixed_with_conformer(self): @@ -720,6 +729,7 @@ def test_conf_opt_updates_both_geometry_and_energy(self): mock_xyz = {'symbols': ('O', 'H', 'H'), 'isotopes': (16, 1, 1), 'coords': ((0.0, 0.0, 0.12), (0.0, 0.76, -0.47), (0.0, -0.76, -0.47))} with patch('arc.job.pipe.pipe_run.parser.parse_geometry', return_value=mock_xyz), \ + patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-75.5), \ patch.object(self.sched, 'determine_most_stable_conformer'), \ patch.object(self.sched, 'run_opt_job'): @@ -752,7 +762,8 @@ def test_species_sp_sets_e_elect(self): with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: f.write('dummy') - with patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-76.1): + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')), \ + patch('arc.job.pipe.pipe_run.parser.parse_e_elect', return_value=-76.1): self.sched.pipe_coordinator.ingest_pipe_results(pipe) self.assertAlmostEqual(self.sched.species_dict['H2O'].e_elect, -76.1) @@ -780,7 +791,8 @@ def test_species_freq_stores_output_path(self): with open(output_path, 'w') as f: f.write('dummy') - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', 
return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) self.assertEqual(self.sched.output['H2O']['paths']['freq'], output_path) @@ -807,7 +819,8 @@ def test_irc_stores_output_path(self): with open(output_path, 'w') as f: f.write('dummy') - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) self.assertIn(output_path, self.sched.output['H2O']['paths']['irc']) @@ -903,7 +916,8 @@ def test_scan_ingestion_stores_scan_path(self): with open(output_path, 'w') as f: f.write('dummy') - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) self.assertEqual(species.rotors_dict[0]['scan_path'], output_path) def test_scan_ingestion_missing_rotor_slot(self): @@ -924,7 +938,8 @@ def test_scan_ingestion_missing_rotor_slot(self): f.write('dummy') with patch('arc.job.pipe.pipe_run.logger') as mock_logger: - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) warning_calls = [str(c) for c in mock_logger.warning.call_args_list] self.assertTrue(any('rotor_index=0' in c and 'not found' in c for c in warning_calls)) @@ -947,7 +962,8 @@ def test_scan_ingestion_no_rotors_dict(self): f.write('dummy') with patch('arc.job.pipe.pipe_run.logger') as mock_logger: - self.sched.pipe_coordinator.ingest_pipe_results(pipe) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + self.sched.pipe_coordinator.ingest_pipe_results(pipe) warning_calls = [str(c) for c in mock_logger.warning.call_args_list] self.assertTrue(any('no valid rotors_dict' in c for c in warning_calls)) @@ -1009,8 +1025,18 @@ def 
test_resubmission_sets_submitted_status(self): """After successful resubmission, pipe status should be SUBMITTED.""" tasks = [_make_task_spec(f'task_{i}') for i in range(3)] pipe = self.sched.pipe_coordinator.submit_pipe_run('resub_test', tasks) - # Simulate needs_resubmission condition - pipe._needs_resubmission = True + # Simulate retried tasks (attempt_index > 0) so reconcile flags resubmission + for task_id in ['task_0', 'task_1', 'task_2']: + now = time.time() + update_task_state(pipe.pipe_root, task_id, new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='t', claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe.pipe_root, task_id, new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe.pipe_root, task_id, new_status=TaskState.FAILED_RETRYABLE, + ended_at=now, failure_class='test') + update_task_state(pipe.pipe_root, task_id, new_status=TaskState.PENDING, + attempt_index=1, claimed_by=None, claim_token=None, + claimed_at=None, lease_expires_at=None, + started_at=None, ended_at=None, failure_class=None) pipe.status = PipeRunState.RECONCILING # Mock submit_to_scheduler to succeed with patch.object(pipe, 'submit_to_scheduler', return_value=('submitted', '12345')): @@ -1164,7 +1190,8 @@ def test_freq_ingestion_creates_output_entry(self): os.makedirs(calcs_dir, exist_ok=True) with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: f.write('freq output') - _ingest_species_freq('run1', pipe_root, spec, state, species_dict, 'H2O', output) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + _ingest_species_freq('run1', pipe_root, spec, state, species_dict, 'H2O', output) self.assertIn('H2O', output) self.assertIn('freq', output['H2O']['paths']) finally: @@ -1188,7 +1215,8 @@ def test_irc_ingestion_creates_output_entry(self): os.makedirs(calcs_dir, exist_ok=True) with open(os.path.join(calcs_dir, 'output.yml'), 'w') as f: f.write('irc output') - _ingest_irc('run1', pipe_root, spec, state, 
species_dict, 'TS_H2O', output) + with patch('arc.job.trsh.determine_ess_status', return_value=('done', [], '', '')): + _ingest_irc('run1', pipe_root, spec, state, species_dict, 'TS_H2O', output) self.assertIn('TS_H2O', output) self.assertIn('irc', output['TS_H2O']['paths']) self.assertEqual(len(output['TS_H2O']['paths']['irc']), 1) From 15503d95bf17ed58591fdb8ce7cc2fe2a47d6a24 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 12:45:06 +0300 Subject: [PATCH 22/60] Better reporting --- arc/scheduler.py | 8 ++++++-- arc/species/species.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arc/scheduler.py b/arc/scheduler.py index d309df4a7d..0ecbd9201c 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -821,7 +821,8 @@ def schedule_jobs(self): t = time.time() - self.report_time if t > 3600 and (self.running_jobs or self.active_pipes): self.report_time = time.time() - logger.info(f'Currently running jobs:\n{pprint.pformat(self.running_jobs)}') + if self.running_jobs: + logger.info(f'Currently running jobs:\n{pprint.pformat(self.running_jobs)}') if self.active_pipes: logger.info(f'Active pipe runs: {list(self.active_pipes.keys())}') @@ -2321,8 +2322,11 @@ def determine_most_likely_ts_conformer(self, label: str): execution_time = execution_time[:execution_time.index('.') + 2] \ if '.' in execution_time else execution_time aux = f' {tsg.errors}.' if tsg.errors else '.' + methods_str = tsg.method + if tsg.method_sources and len(tsg.method_sources) > 1: + methods_str += f' (also: {", ".join(m for m in tsg.method_sources if m != tsg.method)})' logger.info(f'TS guess {tsg.index:2} for {label}. 
' - f'Method: {tsg.method:10}, ' + f'Method: {methods_str}, ' f'relative energy: {tsg.energy:8.2f} kJ/mol, ' f'guess ex time: {execution_time}{im_freqs}' f'{aux}') diff --git a/arc/species/species.py b/arc/species/species.py index 31097380d6..76a9105dfd 100644 --- a/arc/species/species.py +++ b/arc/species/species.py @@ -1569,7 +1569,11 @@ def cluster_tsgs(self): else: tsg.cluster = [tsg.index] cluster_tsgs.append(tsg) + n_before = len([tsg for tsg in self.ts_guesses]) self.ts_guesses = cluster_tsgs + if len(cluster_tsgs) < n_before: + logger.info(f'Clustered {n_before} TS guesses for {self.label} ' + f'into {len(cluster_tsgs)} unique conformers.') def process_completed_tsg_queue_jobs(self, path: str): """ From de8e67bbd98aeeba34341275b0314fb8d6c8160f Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 13:11:09 +0300 Subject: [PATCH 23/60] Added troubleshooting of failed indiv. pipe jobs This is phase 1, should be a second phase implemented where we round up all the failed, see if they hit the min pipe count, modify with the trsh methods and then submit as a pipe job again --- arc/job/pipe/pipe_coordinator.py | 69 +++++++++++++++++++++++++++++--- arc/job/pipe/pipe_run.py | 6 ++- arc/scripts/pipe_worker.py | 57 +++++++++++++++++++++++++- 3 files changed, 124 insertions(+), 8 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 892f71fcf8..3efeccba6a 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -15,7 +15,10 @@ from arc.imports import settings from arc.job.pipe.pipe_run import PipeRun, ingest_completed_task -from arc.job.pipe.pipe_state import PipeRunState, TaskState, TaskSpec, read_task_state +from arc.job.pipe.pipe_state import ( + TASK_FAMILY_TO_JOB_TYPE, PipeRunState, TaskState, TaskSpec, + TaskStateRecord, read_task_state, +) if TYPE_CHECKING: from arc.scheduler import Scheduler @@ -189,8 +192,13 @@ def ingest_pipe_results(self, pipe: PipeRun) -> None: 
Dispatches by task_family. One broken task does not abort ingestion of remaining tasks. After all per-task ingestion, triggers family-specific post-processing (e.g., selecting - the best conformer and spawning the next job). + the best conformer and spawning the next job) — but only if + no tasks were ejected to the Scheduler for troubleshooting. + Ejected tasks will complete through the Scheduler's normal + pipeline, and the Scheduler's main loop will trigger the + next workflow steps when all conformer jobs are done. """ + ejected_count = 0 for spec in pipe.tasks: try: state = read_task_state(pipe.pipe_root, spec.task_id) @@ -202,13 +210,21 @@ def ingest_pipe_results(self, pipe: PipeRun) -> None: ingest_completed_task(pipe.run_id, pipe.pipe_root, spec, state, self.sched.species_dict, self.sched.output) elif state.status == TaskState.FAILED_TERMINAL.value: - logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' - f'failed terminally (failure_class={state.failure_class}). ' - f'Manual troubleshooting required.') + if state.failure_class == 'ess_error': + self._eject_to_scheduler(pipe, spec, state) + ejected_count += 1 + else: + logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'failed terminally (failure_class={state.failure_class}). ' + f'Manual troubleshooting required.') elif state.status == TaskState.CANCELLED.value: logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'was cancelled.') - self._post_ingest_pipe_run(pipe) + if ejected_count > 0: + logger.info(f'Pipe run {pipe.run_id}: {ejected_count} task(s) ejected to Scheduler ' + f'for troubleshooting. 
Deferring post-ingestion workflow.') + else: + self._post_ingest_pipe_run(pipe) def _post_ingest_pipe_run(self, pipe: PipeRun) -> None: """ @@ -287,3 +303,44 @@ def _post_ingest_conf_sp(self, label: str) -> None: self.sched.run_opt_job(label, fine=self.sched.fine_only) else: self.sched.run_composite_job(label) + + def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, + state: 'TaskStateRecord') -> None: + """ + Eject a failed pipe task to the Scheduler as an individual job. + + Translates the TaskSpec back into a ``Scheduler.run_job()`` call so that + the Scheduler's existing ``troubleshoot_ess()`` pipeline handles it. + """ + label = spec.owner_key + if label not in self.sched.species_dict: + logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'species "{label}" not in species_dict, cannot eject.') + return + job_type = TASK_FAMILY_TO_JOB_TYPE.get(spec.task_family) + if job_type is None: + logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'unknown task_family "{spec.task_family}", cannot eject.') + return + payload = spec.input_payload or {} + meta = spec.ingestion_metadata or {} + kwargs = { + 'job_type': job_type, + 'label': label, + 'level_of_theory': spec.level, + 'job_adapter': spec.engine, + 'xyz': payload.get('xyz'), + 'conformer': meta.get('conformer_index'), + } + if spec.task_family == 'irc': + kwargs['irc_direction'] = meta.get('irc_direction') + elif spec.task_family == 'rotor_scan_1d': + kwargs['rotor_index'] = meta.get('rotor_index') + kwargs['torsions'] = payload.get('torsions') + try: + logger.info(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'ejecting to Scheduler as individual {job_type} job for {label}.') + self.sched.run_job(**kwargs) + except Exception: + logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'failed to eject to Scheduler.', exc_info=True) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index f0d91fe824..a261f8fe2d 100644 --- a/arc/job/pipe/pipe_run.py 
+++ b/arc/job/pipe/pipe_run.py @@ -332,7 +332,11 @@ def reconcile(self) -> Dict[str, int]: if current not in (TaskState.FAILED_RETRYABLE, TaskState.ORPHANED): continue try: - if state.attempt_index + 1 < state.max_attempts: + # Don't blind-retry deterministic ESS errors (e.g., MaxOptCycles, SCF). + # These need troubleshooting with modified input, not identical retries. + # They'll be ejected to the Scheduler as individual jobs at ingestion time. + is_ess_error = state.failure_class == 'ess_error' + if state.attempt_index + 1 < state.max_attempts and not is_ess_error: update_task_state(self.pipe_root, task_id, new_status=TaskState.PENDING, attempt_index=state.attempt_index + 1, diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index 4afdaa1aa7..7c51c8900a 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -94,11 +94,35 @@ def claim_task(pipe_root: str, worker_id: str): return None, None, None +def _parse_ess_error(attempt_dir: str, spec) -> Optional[dict]: + """ + Parse ESS error info from the output file in an attempt directory. + Returns a dict with 'status', 'keywords', 'error', 'line', or None. 
+ """ + from arc.job.trsh import determine_ess_status + from arc.job.pipe.pipe_state import TASK_FAMILY_TO_JOB_TYPE + try: + output_file = _find_canonical_output(attempt_dir, spec.engine) + if output_file is None or not os.path.isfile(output_file): + return None + job_type = TASK_FAMILY_TO_JOB_TYPE.get(spec.task_family, 'opt') + status, keywords, error, line = determine_ess_status( + output_path=output_file, species_label=spec.owner_key, + job_type=job_type, software=spec.engine) + return {'status': status, 'keywords': keywords, 'error': error, 'line': line} + except Exception: + return None + + def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, worker_id: str, claim_token: str) -> None: """ Execute a claimed task: transition to RUNNING, dispatch by task_family, copy outputs, write result.json, and mark COMPLETED or FAILED. + + Detects ESS-level errors (non-convergence) even when the adapter returns + without exception. Saves ESS error diagnostics into result.json for + downstream troubleshooting decisions. """ attempt_dir = get_task_attempt_dir(pipe_root, task_id, state.attempt_index) os.makedirs(attempt_dir, exist_ok=True) @@ -119,8 +143,32 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, _copy_outputs(scratch_dir, attempt_dir) ended_at = time.time() result['ended_at'] = ended_at - result['status'] = 'COMPLETED' result['canonical_output_path'] = _find_canonical_output(attempt_dir, spec.engine) + + # Check ESS convergence even when no Python exception was raised. + ess_info = _parse_ess_error(attempt_dir, spec) + if ess_info and ess_info['status'] != 'done': + # ESS ran but did not converge — treat as ESS failure. 
+ result['status'] = 'FAILED' + result['failure_class'] = 'ess_error' + result['parser_summary'] = ess_info + write_result_json(attempt_dir, result) + logger.warning(f'Task {task_id}: ESS did not converge ' + f'(keywords={ess_info["keywords"]})') + if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): + return + try: + current_state = read_task_state(pipe_root, task_id) + target = TaskState.FAILED_RETRYABLE \ + if current_state.attempt_index + 1 < current_state.max_attempts \ + else TaskState.FAILED_TERMINAL + update_task_state(pipe_root, task_id, new_status=target, + ended_at=ended_at, failure_class='ess_error') + except (ValueError, TimeoutError) as exc: + logger.warning(f'Task {task_id}: could not mark failed ({exc}).') + return + + result['status'] = 'COMPLETED' write_result_json(attempt_dir, result) if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return @@ -139,6 +187,13 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, result['ended_at'] = ended_at result['status'] = 'FAILED' result['failure_class'] = failure_class + # Try to parse ESS error info even on exception path. 
+ ess_info = _parse_ess_error(attempt_dir, spec) + if ess_info: + result['parser_summary'] = ess_info + if ess_info['status'] != 'done': + result['failure_class'] = 'ess_error' + failure_class = 'ess_error' write_result_json(attempt_dir, result) if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return From 6ae8b8c0a3cfa79cffff3103e4d6e4e4ab63e66b Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 19:44:57 +0300 Subject: [PATCH 24/60] Further updates --- arc/job/pipe/pipe_coordinator.py | 14 +++++++++++++- arc/scripts/pipe_worker.py | 28 +++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 3efeccba6a..4c3765bb0b 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -317,7 +317,19 @@ def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'species "{label}" not in species_dict, cannot eject.') return - job_type = TASK_FAMILY_TO_JOB_TYPE.get(spec.task_family) + # Map task_family to the Scheduler's job_type. Note: ts_opt pipe tasks + # are TS conformer optimizations (at the guess level), not proper-level + # optimizations. The Scheduler uses 'conf_opt' for these, not 'opt'. 
+ family_to_sched_job_type = { + 'ts_opt': 'conf_opt', + 'conf_opt': 'conf_opt', + 'conf_sp': 'conf_sp', + 'species_sp': 'sp', + 'species_freq': 'freq', + 'irc': 'irc', + 'rotor_scan_1d': 'scan', + } + job_type = family_to_sched_job_type.get(spec.task_family) if job_type is None: logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'unknown task_family "{spec.task_family}", cannot eject.') diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index 7c51c8900a..e7267f1162 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -94,6 +94,20 @@ def claim_task(pipe_root: str, worker_id: str): return None, None, None +# ESS error keywords that are transient/infrastructure-related and worth retrying +# with identical input (e.g., on a different node). All other ESS errors are +# deterministic and should be ejected to the Scheduler for troubleshooting. +_TRANSIENT_ESS_KEYWORDS = {'NoOutput', 'ServerTimeLimit', 'DiskSpace'} + + +def _is_deterministic_ess_error(ess_info: dict) -> bool: + """Return True if the ESS error is deterministic (same input will always fail).""" + if not ess_info or ess_info['status'] == 'done': + return False + keywords = set(ess_info.get('keywords', [])) + return not keywords.issubset(_TRANSIENT_ESS_KEYWORDS) + + def _parse_ess_error(attempt_dir: str, spec) -> Optional[dict]: """ Parse ESS error info from the output file in an attempt directory. @@ -148,13 +162,17 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, # Check ESS convergence even when no Python exception was raised. ess_info = _parse_ess_error(attempt_dir, spec) if ess_info and ess_info['status'] != 'done': - # ESS ran but did not converge — treat as ESS failure. + # Distinguish deterministic ESS errors (need troubleshooting) from + # transient failures (NoOutput, ServerTimeLimit — worth retrying as-is). 
+ is_deterministic = _is_deterministic_ess_error(ess_info) + fc = 'ess_error' if is_deterministic else 'transient_ess' result['status'] = 'FAILED' - result['failure_class'] = 'ess_error' + result['failure_class'] = fc result['parser_summary'] = ess_info write_result_json(attempt_dir, result) logger.warning(f'Task {task_id}: ESS did not converge ' - f'(keywords={ess_info["keywords"]})') + f'(keywords={ess_info["keywords"]}, ' + f'{"deterministic" if is_deterministic else "transient"})') if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return try: @@ -163,7 +181,7 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, if current_state.attempt_index + 1 < current_state.max_attempts \ else TaskState.FAILED_TERMINAL update_task_state(pipe_root, task_id, new_status=target, - ended_at=ended_at, failure_class='ess_error') + ended_at=ended_at, failure_class=fc) except (ValueError, TimeoutError) as exc: logger.warning(f'Task {task_id}: could not mark failed ({exc}).') return @@ -191,7 +209,7 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, ess_info = _parse_ess_error(attempt_dir, spec) if ess_info: result['parser_summary'] = ess_info - if ess_info['status'] != 'done': + if ess_info['status'] != 'done' and _is_deterministic_ess_error(ess_info): result['failure_class'] = 'ess_error' failure_class = 'ess_error' write_result_json(attempt_dir, result) From 00b5e2f228ffcd242962f5ec044868730c1bc128 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 6 Apr 2026 22:24:47 +0300 Subject: [PATCH 25/60] Updates --- arc/common.py | 4 ++-- arc/job/adapters/common.py | 2 +- arc/job/pipe/pipe_run.py | 10 +++++++++- arc/settings/settings.py | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arc/common.py b/arc/common.py index 8786357c22..32df575376 100644 --- a/arc/common.py +++ b/arc/common.py @@ -1087,8 +1087,8 @@ def almost_equal_coords(xyz1: dict, raise TypeError(f'xyz1 and xyz2 must be dictionaries, got 
{type(xyz1)} and {type(xyz2)}:\n{xyz1}\n{xyz2}') for symbol_1, symbol_2 in zip(xyz1['symbols'], xyz2['symbols']): if symbol_1 != symbol_2: - logger.warning(f"Cannot compare coords, xyz1 and xyz2 have different symbols:" - f"\n{xyz1['symbols']}\nand:\n{xyz2['symbols']}") + logger.debug(f"Cannot compare coords, xyz1 and xyz2 have different symbols:" + f"\n{xyz1['symbols']}\nand:\n{xyz2['symbols']}") for xyz_coord1, xyz_coord2 in zip(xyz1['coords'], xyz2['coords']): for xyz1_c, xyz2_c in zip(xyz_coord1, xyz_coord2): if not np.isclose([xyz1_c], [xyz2_c], rtol=rtol, atol=atol): diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index 24486e7ff9..d0e2e9e27d 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -525,7 +525,7 @@ def which(command: Union[str, list], command = [command] if isinstance(command, str) else command ans = None for comm in command: - ans = shutil.which(comm, mode=os.F_OK | os.X_OK, path=lenv["PATH"] + lenv["PYTHONPATH"]) + ans = shutil.which(comm, mode=os.F_OK | os.X_OK, path=lenv["PATH"] + os.pathsep + lenv["PYTHONPATH"]) if ans: break diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index a261f8fe2d..e6e2cf8afe 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -215,6 +215,10 @@ def write_submit_script(self) -> str: queue, _ = next(iter(server.get('queues', {}).items()), ('', None)) engine = self.tasks[0].engine if self.tasks else '' env_setup = pipe_settings.get('env_setup', {}).get(engine, '') + scratch_base = pipe_settings.get('scratch_base', '') + if scratch_base: + scratch_export = f'export TMPDIR="{scratch_base}/$PBS_JOBID"\nmkdir -p "$TMPDIR"' + env_setup = f'{env_setup}\n{scratch_export}' if env_setup else scratch_export content = pipe_submit[template_key].format( name=f'pipe_{self.run_id}', max_task_num=array_size, @@ -361,8 +365,12 @@ def reconcile(self) -> Dict[str, int]: # Only flag resubmission for genuinely retried tasks (attempt_index > 0). 
# Fresh PENDING tasks (attempt_index == 0) are waiting for the initial # submission's workers to start — don't resubmit for those. + # After a resubmission, allow a grace period for workers to start before + # flagging again (prevents duplicate submissions). active_after_retry = counts[TaskState.CLAIMED.value] + counts[TaskState.RUNNING.value] - if retried_pending > 0 and active_after_retry == 0: + resubmit_grace = 120 # seconds + time_since_submit = (now - self.submitted_at) if self.submitted_at else float('inf') + if retried_pending > 0 and active_after_retry == 0 and time_since_submit > resubmit_grace: self._needs_resubmission = True logger.info(f'Pipe run {self.run_id}: {retried_pending} retried tasks ' f'need workers. Resubmission needed.') diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 2d7144047f..d2580cafb7 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -316,6 +316,7 @@ 'env_setup': {}, # Engine-specific shell setup commands, e.g., # {'gaussian': 'source /usr/local/g09/setup.sh', # 'orca': 'source /usr/local/orca-5.0.4/setup.sh && source /usr/local/openmpi-4.1.1/setup.sh'} + 'scratch_base': '', # Base directory for worker scratch (e.g., '/gtmp'). Leave empty for system default. } # Criteria for identification of imaginary frequencies for transition states. 
From a9ac64fe38719fa8765e0dc9530beca102907b0f Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 00:13:35 +0300 Subject: [PATCH 26/60] Scratch fix --- arc/job/pipe/pipe_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index e6e2cf8afe..17f8b2fe0f 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -217,7 +217,7 @@ def write_submit_script(self) -> str: env_setup = pipe_settings.get('env_setup', {}).get(engine, '') scratch_base = pipe_settings.get('scratch_base', '') if scratch_base: - scratch_export = f'export TMPDIR="{scratch_base}/$PBS_JOBID"\nmkdir -p "$TMPDIR"' + scratch_export = f'export TMPDIR="{scratch_base}/${{PBS_JOBID%%[*}}/$PBS_ARRAY_INDEX"\nmkdir -p "$TMPDIR"' env_setup = f'{env_setup}\n{scratch_export}' if env_setup else scratch_export content = pipe_submit[template_key].format( name=f'pipe_{self.run_id}', From c26d5b247e8c72bcd6c125223e61f7b7fa4efc89 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 00:39:57 +0300 Subject: [PATCH 27/60] Path sep --- arc/job/adapters/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index d0e2e9e27d..82a8db0c40 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -515,7 +515,7 @@ def which(command: Union[str, list], The command path or ``None``, returns ``True`` or ``False`` if ``return_bool`` is set to ``True``. 
""" if env is None: - lenv = {"PATH": os.pathsep + os.environ.get("PATH", "") + os.path.dirname(sys.executable), + lenv = {"PATH": os.pathsep + os.environ.get("PATH", "") + os.pathsep + os.path.dirname(sys.executable), "PYTHONPATH": os.pathsep + os.environ.get("PYTHONPATH", ""), } else: From 7986864d31876c4e98de3a63dda2992a8a6635ac Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 10:03:58 +0300 Subject: [PATCH 28/60] Handle existing pipe directories on fresh start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ARC starts and a pipe directory from a previous run exists with the same run_id, stage() would crash with FileExistsError. Now submit_pipe_run() checks for existing pipe_root before staging and archives it to log_and_restart_archive/ with a timestamp. Always archives — there is no reliable signal in the current ARC architecture to distinguish fresh start from restart (restart_dict is always set by ARC.execute() via as_dict(), regardless of whether the user passed input.yml or restart.yml). Follows existing ARC archive convention (log_and_restart_archive/ with HHMMSS_MonDD_YYYY timestamps). Co-Authored-By: Claude Opus 4.6 (1M context) --- arc/job/pipe/pipe_coordinator.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 4c3765bb0b..e745d41289 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -8,6 +8,7 @@ Family-specific task planning lives in ``pipe_planner.py``. """ +import os import time from typing import TYPE_CHECKING, Dict, List @@ -75,17 +76,43 @@ def should_use_pipe(self, tasks: List[TaskSpec]) -> bool: and t.required_memory_mb == ref.required_memory_mb for t in tasks[1:]) + def _handle_existing_pipe_root(self, pipe_root: str, run_id: str) -> None: + """ + Archive a pre-existing pipe_root directory before staging a new run. 
+ + Always archives — there is no reliable signal in the current ARC + architecture to distinguish a fresh start from a restart + (restart_dict is always set). + """ + if not os.path.isdir(pipe_root): + return + self._archive_pipe_root(pipe_root, run_id) + + def _archive_pipe_root(self, pipe_root: str, run_id: str) -> None: + """Move an old pipe_root directory to log_and_restart_archive/.""" + import datetime + import shutil + archive_dir = os.path.join(self.sched.project_directory, 'log_and_restart_archive') + os.makedirs(archive_dir, exist_ok=True) + timestamp = datetime.datetime.now().strftime('%H%M%S_%b%d_%Y') + dest = os.path.join(archive_dir, f'pipe_{run_id}.old.{timestamp}') + logger.info(f'Pipe run {run_id}: archiving old directory to {dest}') + shutil.move(pipe_root, dest) + def submit_pipe_run(self, run_id: str, tasks: List[TaskSpec], cluster_software: str = 'slurm') -> PipeRun: """ Create, stage, and register a new pipe run. - Attempts to write a submit script and submit the array job. - On submission failure, the run is still registered as STAGED. + If the pipe_root already exists on disk: + - On restart with an active run: resumes it via register_pipe_run_from_dir. + - Otherwise: archives the old directory and creates a fresh run. Returns: PipeRun: The created pipe run. """ + pipe_root = os.path.join(self.sched.project_directory, 'runs', 'pipe_' + run_id) + self._handle_existing_pipe_root(pipe_root, run_id) pipe = PipeRun( project_directory=self.sched.project_directory, run_id=run_id, From b995e05cadcc35869b161f76230a0edde494b40c Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 10:40:44 +0300 Subject: [PATCH 29/60] Handle existing pipe directories on fresh start Archives ALL old pipe_* directories from runs/ at coordinator startup (in __init__), before any pipe run is created. This prevents FileExistsError when stage() encounters stale directories from previous ARC runs. 
Moved from per-run check in submit_pipe_run() to a single startup sweep in _archive_old_pipe_dirs(). This ensures cleanup happens immediately regardless of which task family triggers pipe mode first. Follows existing ARC archive convention (log_and_restart_archive/ with HHMMSS_MonDD_YYYY timestamps). Co-Authored-By: Claude Opus 4.6 (1M context) --- arc/job/pipe/pipe_coordinator.py | 37 +++++++++++++++----------------- arc/scheduler.py | 9 +++++++- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index e745d41289..7acd628fa3 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -49,6 +49,7 @@ def __init__(self, sched: 'Scheduler'): self.active_pipes: Dict[str, PipeRun] = {} self._pipe_poll_failures: Dict[str, int] = {} self._last_pipe_summary: Dict[str, str] = {} + self._archive_old_pipe_dirs() def should_use_pipe(self, tasks: List[TaskSpec]) -> bool: """ @@ -76,43 +77,39 @@ def should_use_pipe(self, tasks: List[TaskSpec]) -> bool: and t.required_memory_mb == ref.required_memory_mb for t in tasks[1:]) - def _handle_existing_pipe_root(self, pipe_root: str, run_id: str) -> None: + def _archive_old_pipe_dirs(self) -> None: """ - Archive a pre-existing pipe_root directory before staging a new run. + Archive all existing pipe directories from ``runs/`` at startup. - Always archives — there is no reliable signal in the current ARC - architecture to distinguish a fresh start from a restart - (restart_dict is always set). + Called once from ``__init__``. Moves any ``pipe_*`` directories to + ``log_and_restart_archive/`` so that ``stage()`` never hits + ``FileExistsError`` from stale previous runs. 
""" - if not os.path.isdir(pipe_root): - return - self._archive_pipe_root(pipe_root, run_id) - - def _archive_pipe_root(self, pipe_root: str, run_id: str) -> None: - """Move an old pipe_root directory to log_and_restart_archive/.""" import datetime import shutil + runs_dir = os.path.join(self.sched.project_directory, 'runs') + if not os.path.isdir(runs_dir): + return archive_dir = os.path.join(self.sched.project_directory, 'log_and_restart_archive') - os.makedirs(archive_dir, exist_ok=True) timestamp = datetime.datetime.now().strftime('%H%M%S_%b%d_%Y') - dest = os.path.join(archive_dir, f'pipe_{run_id}.old.{timestamp}') - logger.info(f'Pipe run {run_id}: archiving old directory to {dest}') - shutil.move(pipe_root, dest) + for entry in os.listdir(runs_dir): + if entry.startswith('pipe_') and os.path.isdir(os.path.join(runs_dir, entry)): + os.makedirs(archive_dir, exist_ok=True) + src = os.path.join(runs_dir, entry) + dest = os.path.join(archive_dir, f'{entry}.old.{timestamp}') + logger.info(f'Archiving old pipe directory {entry} to {dest}') + shutil.move(src, dest) def submit_pipe_run(self, run_id: str, tasks: List[TaskSpec], cluster_software: str = 'slurm') -> PipeRun: """ Create, stage, and register a new pipe run. - If the pipe_root already exists on disk: - - On restart with an active run: resumes it via register_pipe_run_from_dir. - - Otherwise: archives the old directory and creates a fresh run. + Old pipe directories are archived at startup by ``_archive_old_pipe_dirs``. Returns: PipeRun: The created pipe run. 
""" - pipe_root = os.path.join(self.sched.project_directory, 'runs', 'pipe_' + run_id) - self._handle_existing_pipe_root(pipe_root, run_id) pipe = PipeRun( project_directory=self.sched.project_directory, run_id=run_id, diff --git a/arc/scheduler.py b/arc/scheduler.py index 0ecbd9201c..ea25d4667b 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -802,7 +802,14 @@ def schedule_jobs(self): break if not len(job_list): - self.check_all_done(label) + has_pending_pipe_work = ( + label in self._pending_pipe_sp + or label in self._pending_pipe_freq + or any(lbl == label for lbl, _ in self._pending_pipe_irc) + or label in self._pending_pipe_conf_sp + ) + if not has_pending_pipe_work: + self.check_all_done(label) if not self.running_jobs[label]: # Delete the label only if it represents an empty entry. del self.running_jobs[label] From 5e67ea49d53d644a7bfd34961694fe917421e33a Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 11:19:13 +0300 Subject: [PATCH 30/60] Add FAILED_ESS task state and document pipe task lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added FAILED_ESS as a distinct TaskState for deterministic ESS convergence errors (SCF, MaxOptCycles, InternalCoordinateError). Previously these were lumped into FAILED_RETRYABLE with a side-channel failure_class field. Now the state itself carries the meaning: - FAILED_RETRYABLE: transient (node crash, NoOutput) — pipe retries - FAILED_ESS: deterministic ESS error — ejected to Scheduler - FAILED_TERMINAL: exhausted retries — no further action Reverted log output to original clean format using state names directly (e.g., COMPLETED: 30, FAILED_ESS: 2, RUNNING: 8). Updated docs/source/advanced.rst with full task state documentation and pipe_settings env_setup/scratch_base configuration. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- arc/job/pipe/pipe_coordinator.py | 13 ++++++------- arc/job/pipe/pipe_run.py | 13 +++++++------ arc/job/pipe/pipe_state.py | 32 ++++++++++++++++++++++++++++++-- arc/scripts/pipe_worker.py | 30 +++++++++++++++++------------- docs/source/advanced.rst | 29 ++++++++++++++++++++++++++--- 5 files changed, 86 insertions(+), 31 deletions(-) diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index 7acd628fa3..eda2968e9a 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -233,14 +233,13 @@ def ingest_pipe_results(self, pipe: PipeRun) -> None: if state.status == TaskState.COMPLETED.value: ingest_completed_task(pipe.run_id, pipe.pipe_root, spec, state, self.sched.species_dict, self.sched.output) + elif state.status == TaskState.FAILED_ESS.value: + self._eject_to_scheduler(pipe, spec, state) + ejected_count += 1 elif state.status == TaskState.FAILED_TERMINAL.value: - if state.failure_class == 'ess_error': - self._eject_to_scheduler(pipe, spec, state) - ejected_count += 1 - else: - logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' - f'failed terminally (failure_class={state.failure_class}). ' - f'Manual troubleshooting required.') + logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'failed terminally (failure_class={state.failure_class}). ' + f'Manual troubleshooting required.') elif state.status == TaskState.CANCELLED.value: logger.warning(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'was cancelled.') diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 17f8b2fe0f..cffb7e764e 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -336,11 +336,9 @@ def reconcile(self) -> Dict[str, int]: if current not in (TaskState.FAILED_RETRYABLE, TaskState.ORPHANED): continue try: - # Don't blind-retry deterministic ESS errors (e.g., MaxOptCycles, SCF). 
- # These need troubleshooting with modified input, not identical retries. - # They'll be ejected to the Scheduler as individual jobs at ingestion time. - is_ess_error = state.failure_class == 'ess_error' - if state.attempt_index + 1 < state.max_attempts and not is_ess_error: + # FAILED_ESS tasks are handled separately (ejected to Scheduler). + # Only FAILED_RETRYABLE and ORPHANED reach here. + if state.attempt_index + 1 < state.max_attempts: update_task_state(self.pipe_root, task_id, new_status=TaskState.PENDING, attempt_index=state.attempt_index + 1, @@ -378,11 +376,14 @@ def reconcile(self) -> Dict[str, int]: self._needs_resubmission = False terminal = (counts[TaskState.COMPLETED.value] + + counts[TaskState.FAILED_ESS.value] + counts[TaskState.FAILED_TERMINAL.value] + counts[TaskState.CANCELLED.value]) if total > 0 and terminal == total: - failed = counts[TaskState.FAILED_TERMINAL.value] + counts[TaskState.CANCELLED.value] + failed = (counts[TaskState.FAILED_ESS.value] + + counts[TaskState.FAILED_TERMINAL.value] + + counts[TaskState.CANCELLED.value]) if failed > 0: self.status = PipeRunState.COMPLETED_PARTIAL else: diff --git a/arc/job/pipe/pipe_state.py b/arc/job/pipe/pipe_state.py index 0021504f29..0de4a78808 100644 --- a/arc/job/pipe/pipe_state.py +++ b/arc/job/pipe/pipe_state.py @@ -32,12 +32,39 @@ class TaskState(str, Enum): - """States for an individual task within a pipe run.""" + """ + States for an individual task within a pipe run. + + Lifecycle:: + + PENDING ──► CLAIMED ──► RUNNING ──► COMPLETED + │ │ + │ ├──► FAILED_RETRYABLE ──► PENDING (retry) + │ │ └──► FAILED_TERMINAL + │ ├──► FAILED_ESS ──► (ejected to Scheduler) + │ └──► ORPHANED ──► PENDING (retry) + └──► ORPHANED + + PENDING: Awaiting a worker. Fresh tasks start here (attempt_index=0). + Retried tasks return here with attempt_index incremented. + CLAIMED: A worker has claimed this task (file-locked). + RUNNING: The worker is executing the ESS adapter. 
+ COMPLETED: ESS ran and converged successfully. Results ready for ingestion. + FAILED_RETRYABLE: Transient failure (node crash, NoOutput, disk issue). + Will be retried with identical input on a different node. + FAILED_ESS: Deterministic ESS error (SCF, MaxOptCycles, InternalCoordinateError). + Blind retry won't help — ejected to Scheduler for troubleshooting + with modified input (different algorithm, keywords, etc.). + FAILED_TERMINAL: Exhausted all retry attempts. No further action by pipe system. + ORPHANED: Worker lease expired (likely killed by PBS). Reset to PENDING. + CANCELLED: Manually cancelled. Terminal state. + """ PENDING = 'PENDING' CLAIMED = 'CLAIMED' RUNNING = 'RUNNING' COMPLETED = 'COMPLETED' FAILED_RETRYABLE = 'FAILED_RETRYABLE' + FAILED_ESS = 'FAILED_ESS' FAILED_TERMINAL = 'FAILED_TERMINAL' ORPHANED = 'ORPHANED' CANCELLED = 'CANCELLED' @@ -85,10 +112,11 @@ class PipeRunState(str, Enum): TASK_TRANSITIONS: Dict[TaskState, Tuple[TaskState, ...]] = { TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED), TaskState.CLAIMED: (TaskState.RUNNING, TaskState.ORPHANED, TaskState.CANCELLED), - TaskState.RUNNING: (TaskState.COMPLETED, TaskState.FAILED_RETRYABLE, + TaskState.RUNNING: (TaskState.COMPLETED, TaskState.FAILED_RETRYABLE, TaskState.FAILED_ESS, TaskState.FAILED_TERMINAL, TaskState.ORPHANED, TaskState.CANCELLED), TaskState.COMPLETED: (), TaskState.FAILED_RETRYABLE: (TaskState.PENDING, TaskState.FAILED_TERMINAL), + TaskState.FAILED_ESS: (), # Terminal within pipe — ejected to Scheduler for troubleshooting. 
TaskState.FAILED_TERMINAL: (), TaskState.ORPHANED: (TaskState.PENDING, TaskState.FAILED_TERMINAL), TaskState.CANCELLED: (), diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index e7267f1162..ad37233933 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -162,12 +162,9 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, # Check ESS convergence even when no Python exception was raised. ess_info = _parse_ess_error(attempt_dir, spec) if ess_info and ess_info['status'] != 'done': - # Distinguish deterministic ESS errors (need troubleshooting) from - # transient failures (NoOutput, ServerTimeLimit — worth retrying as-is). is_deterministic = _is_deterministic_ess_error(ess_info) - fc = 'ess_error' if is_deterministic else 'transient_ess' result['status'] = 'FAILED' - result['failure_class'] = fc + result['failure_class'] = 'ess_error' if is_deterministic else 'transient' result['parser_summary'] = ess_info write_result_json(attempt_dir, result) logger.warning(f'Task {task_id}: ESS did not converge ' @@ -176,12 +173,15 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return try: - current_state = read_task_state(pipe_root, task_id) - target = TaskState.FAILED_RETRYABLE \ - if current_state.attempt_index + 1 < current_state.max_attempts \ - else TaskState.FAILED_TERMINAL + if is_deterministic: + target = TaskState.FAILED_ESS + else: + current_state = read_task_state(pipe_root, task_id) + target = TaskState.FAILED_RETRYABLE \ + if current_state.attempt_index + 1 < current_state.max_attempts \ + else TaskState.FAILED_TERMINAL update_task_state(pipe_root, task_id, new_status=target, - ended_at=ended_at, failure_class=fc) + ended_at=ended_at, failure_class=result['failure_class']) except (ValueError, TimeoutError) as exc: logger.warning(f'Task {task_id}: could not mark failed ({exc}).') return @@ -206,19 +206,23 @@ def 
run_task(pipe_root: str, task_id: str, state: TaskStateRecord, result['status'] = 'FAILED' result['failure_class'] = failure_class # Try to parse ESS error info even on exception path. + is_deterministic_ess = False ess_info = _parse_ess_error(attempt_dir, spec) if ess_info: result['parser_summary'] = ess_info if ess_info['status'] != 'done' and _is_deterministic_ess_error(ess_info): result['failure_class'] = 'ess_error' - failure_class = 'ess_error' + is_deterministic_ess = True write_result_json(attempt_dir, result) if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return try: - current_state = read_task_state(pipe_root, task_id) - target = TaskState.FAILED_RETRYABLE if current_state.attempt_index + 1 < current_state.max_attempts \ - else TaskState.FAILED_TERMINAL + if is_deterministic_ess: + target = TaskState.FAILED_ESS + else: + current_state = read_task_state(pipe_root, task_id) + target = TaskState.FAILED_RETRYABLE if current_state.attempt_index + 1 < current_state.max_attempts \ + else TaskState.FAILED_TERMINAL update_task_state(pipe_root, task_id, new_status=target, ended_at=ended_at, failure_class=failure_class) except (ValueError, TimeoutError) as e: diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 8db1458943..aa2db066e0 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -947,12 +947,32 @@ Below that threshold, ARC uses its normal per-job submission path. - Pipe executes only ready "leaf" jobs. All quality checks, troubleshooting, and downstream decision-making remain in ARC's main scheduler. -- Failed tasks are retried automatically (configurable). - If a task exhausts its retry budget, it is marked as terminally failed - and reported to the scheduler for manual review. +- Failed tasks are classified and handled automatically (see task states below). - Each array worker verifies task ownership before writing results, preventing stale workers from overwriting state after lease expiration. 
+**Task states:** + +Each pipe task has a state that is reported in the ARC log +(e.g., ``Pipe run TS0_ts_opt: COMPLETED: 30, FAILED_ESS: 2, RUNNING: 8``). +The states are: + +- ``PENDING`` — Waiting for a worker to claim it. Fresh tasks start here. + Retried tasks return here with an incremented attempt index. +- ``CLAIMED`` — A worker has claimed this task via file lock. +- ``RUNNING`` — The worker is executing the ESS (e.g., Gaussian, Orca). +- ``COMPLETED`` — ESS converged successfully. Results will be ingested. +- ``FAILED_RETRYABLE`` — Transient failure (node crash, no output, disk issue). + The pipe will retry this task on a different node with the same input. +- ``FAILED_ESS`` — Deterministic ESS convergence error (e.g., SCF failure, + max optimization cycles, internal coordinate error). Retrying with the + same input will produce the same failure. The task is ejected to the + Scheduler as an individual job for troubleshooting with modified input. +- ``FAILED_TERMINAL`` — Exhausted all retry attempts. No further automatic action. +- ``ORPHANED`` — Worker lease expired (e.g., killed by PBS walltime). + Will be reset to ``PENDING`` for retry. +- ``CANCELLED`` — Manually cancelled. Terminal state. + **Configuration:** Pipe mode is configured via ``pipe_settings`` in ``arc/settings/settings.py`` @@ -964,6 +984,9 @@ Pipe mode is configured via ``pipe_settings`` in ``arc/settings/settings.py`` 'max_workers': 100, # Upper bound on array worker slots per PipeRun. 'max_attempts': 3, # Retry budget per task before terminal failure. 'lease_duration_s': 86400, # Worker lease duration in seconds (default 24h). + 'env_setup': {}, # Engine-specific shell setup commands, e.g., + # {'gaussian': 'source /usr/local/g09/setup.sh'} + 'scratch_base': '', # Base directory for worker scratch (e.g., '/gtmp'). 
} **Submit scripts:** From 3be97bf632fb46254c393ad5a6b0e0a91c370f9f Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 15:15:38 +0300 Subject: [PATCH 31/60] Fix premature 'all conformer jobs terminated' when others still running Two independent bugs caused the Scheduler to prematurely declare a TS failed and switch to the next guess while other conformers were still being troubleshot: Bug 1 (lines 637, 668): The for/else check for remaining conformer/tsg jobs used `spec_jobs != job_name` to skip the current job. But end_job already removed it from running_jobs before this check runs. When troubleshooting resubmitted a job with the same name, the filter incorrectly skipped it, causing the 'all done' branch to fire early. Fix: removed the unnecessary `!= job_name` filter from both the conformer check (line 637) and tsg check (line 668). Bug 2 (line 3607): troubleshoot_ess called switch_ts (which deletes ALL running jobs for the species) when a single conformer exhausted troubleshooting. But other conformers might still be running. Fix: added `and conformer is None` guard so switch_ts only fires for full TS optimization failures, not individual conformer search failures. Failed conformers are now abandoned gracefully while waiting for the others to finish. Co-Authored-By: Claude Opus 4.6 (1M context) --- arc/scheduler.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arc/scheduler.py b/arc/scheduler.py index ea25d4667b..f8337d5be1 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -633,8 +633,10 @@ def schedule_jobs(self): break # Just terminated a conformer job. # Are there additional conformer jobs currently running for this species? + # Note: end_job already removed the current job from running_jobs, + # so we don't need to exclude job_name. 
for spec_jobs in job_list: - if ('conf_opt' in spec_jobs or 'conf_sp' in spec_jobs) and spec_jobs != job_name: + if 'conf_opt' in spec_jobs or 'conf_sp' in spec_jobs: break else: # All conformer jobs terminated. @@ -663,7 +665,7 @@ def schedule_jobs(self): # Just terminated a tsg job. # Are there additional tsg jobs currently running for this species? for spec_jobs in job_list: - if 'tsg' in spec_jobs and spec_jobs != job_name: + if 'tsg' in spec_jobs: break else: # All tsg jobs terminated. Spawn confs. @@ -3602,11 +3604,17 @@ def troubleshoot_ess(self, cpu_cores=cpu_cores, shift=shift, ) - elif self.species_dict[label].is_ts and not self.species_dict[label].ts_guesses_exhausted: + elif self.species_dict[label].is_ts and not self.species_dict[label].ts_guesses_exhausted \ + and conformer is None: + # Only switch TS guess when a full optimization fails, not when a single + # conformer search job fails. Other conformers may still be running. logger.info(f'TS {label} did not converge. ' f'Status is:\n{self.species_dict[label].ts_checks}\n' f'Searching for a better TS conformer...') self.switch_ts(label=label) + elif conformer is not None and couldnt_trsh: + logger.warning(f'Could not troubleshoot conformer {conformer} for {label}. ' + f'Abandoning this conformer; waiting for others to finish.') self.save_restart_dict() From c5b659ff42cba925b7f2ddc96a817cff8275ae98 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Tue, 7 Apr 2026 15:34:36 +0300 Subject: [PATCH 32/60] Move pipe directories from runs/ into calcs/ with auto-indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pipe output now follows ARC's calcs/ directory convention: - Per-species TS: calcs/TSs/