diff --git a/Makefile b/Makefile index ff5b1e7091..4fd3dfbc35 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ help: @echo " install-kinbot Install KinBot" @echo " install-sella Install Sella" @echo " install-xtb Install xTB" + @echo " install-crest Install CREST" @echo " install-torchani Install TorchANI" @echo " install-ob Install OpenBabel" @echo "" @@ -100,6 +101,9 @@ install-sella: install-xtb: bash $(DEVTOOLS_DIR)/install_xtb.sh +install-crest: + bash $(DEVTOOLS_DIR)/install_crest.sh + install-torchani: bash $(DEVTOOLS_DIR)/install_torchani.sh diff --git a/arc/constants.pxd b/arc/constants.pxd index 67b3f412bb..9fc3b9127d 100644 --- a/arc/constants.pxd +++ b/arc/constants.pxd @@ -1 +1 @@ -cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, bohr_to_angstrom, E_h, F, E_h_kJmol +cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom, angstrom_to_bohr diff --git a/arc/constants.py b/arc/constants.py index 83f99051d1..6923a0f16b 100644 --- a/arc/constants.py +++ b/arc/constants.py @@ -79,6 +79,9 @@ #: Vacuum permittivity epsilon_0 = 8.8541878128 +bohr_to_angstrom = 0.529177 +angstrom_to_bohr = 1 / bohr_to_angstrom + # Cython does not automatically place module-level variables into the module # symbol table when in compiled mode, so we must do this manually so that we # can use the constants from both Cython and regular Python code @@ -101,4 +104,5 @@ 'F': F, 'epsilon_0': epsilon_0, 'bohr_to_angstrom': bohr_to_angstrom, + 'angstrom_to_bohr': angstrom_to_bohr, }) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index de8c747718..9e231e18b1 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -96,6 +96,7 @@ class JobEnum(str, Enum): # TS search methods autotst = 'autotst' # AutoTST, 10.1021/acs.jpca.7b07361, 10.26434/chemrxiv.13277870.v2 heuristics = 'heuristics' # ARC's heuristics + crest = 'crest' # CREST conformer/TS search kinbot = 'kinbot' # KinBot, 10.1016/j.cpc.2019.106947 gcn = 
'gcn' # Graph neural network for isomerization, https://doi.org/10.1021/acs.jpclett.0c00500 user = 'user' # user guesses diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 9657f9a62a..d5b7a10ef6 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -180,6 +180,12 @@ def setUpClass(cls): server='server3', testing=True, ) + os.makedirs(cls.job_5.local_path, exist_ok=True) + fixture_path = os.path.join(ARC_TESTING_PATH, 'trsh', 'wall_exceeded.txt') + with open(fixture_path, 'r') as f: + log_content = f.read() + with open(os.path.join(cls.job_5.local_path, 'out.txt'), 'w') as f: + f.write(log_content) cls.job_6 = GaussianAdapter(execution_type='queue', job_name='opt_101', job_type='opt', diff --git a/arc/job/adapters/ts/__init__.py b/arc/job/adapters/ts/__init__.py index 5d571e8e80..4525adeed7 100644 --- a/arc/job/adapters/ts/__init__.py +++ b/arc/job/adapters/ts/__init__.py @@ -1,6 +1,8 @@ import arc.job.adapters.ts.autotst_ts +import arc.job.adapters.ts.crest import arc.job.adapters.ts.gcn_ts import arc.job.adapters.ts.heuristics import arc.job.adapters.ts.kinbot_ts +import arc.job.adapters.ts.seed_hub import arc.job.adapters.ts.xtb_gsm import arc.job.adapters.ts.orca_neb diff --git a/arc/job/adapters/ts/crest.py b/arc/job/adapters/ts/crest.py new file mode 100644 index 0000000000..6396a968da --- /dev/null +++ b/arc/job/adapters/ts/crest.py @@ -0,0 +1,520 @@ +""" +Utilities for running CREST within ARC. + +Separated from heuristics so CREST can be conditionally imported and reused. 
+""" + +import datetime +import os +import time +from typing import TYPE_CHECKING, List, Optional, Union + +from arc.common import almost_equal_coords, get_logger +from arc.imports import settings, submit_scripts +from arc.job.adapter import JobAdapter +from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family +from arc.job.adapters.ts.heuristics import DIHEDRAL_INCREMENT +from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints +from arc.job.factory import register_job_adapter +from arc.job.local import check_job_status, submit_job +from arc.plotter import save_geo +from arc.species.converter import reorder_xyz_string, str_to_xyz, xyz_to_str +from arc.species.species import ARCSpecies, TSGuess + +if TYPE_CHECKING: + from arc.level import Level + from arc.reaction import ARCReaction + +logger = get_logger() + +MAX_CHECK_INTERVAL_SECONDS = 100 + +CREST_PATH = settings.get("CREST_PATH", None) +CREST_ENV_PATH = settings.get("CREST_ENV_PATH", None) +SERVERS = settings.get("servers", {}) + + +def crest_available() -> bool: + """ + Return whether CREST is configured for use. + """ + return bool(SERVERS.get("local")) and bool(CREST_PATH or CREST_ENV_PATH) + + +class CrestAdapter(JobAdapter): + """ + A class for executing CREST TS conformer searches based on heuristics-generated guesses. 
+ """ + + def __init__(self, + project: str, + project_directory: str, + job_type: Union[List[str], str], + args: Optional[dict] = None, + bath_gas: Optional[str] = None, + checkfile: Optional[str] = None, + conformer: Optional[int] = None, + constraints: Optional[List] = None, + cpu_cores: Optional[str] = None, + dihedral_increment: Optional[float] = None, + dihedrals: Optional[List[float]] = None, + directed_scan_type: Optional[str] = None, + ess_settings: Optional[dict] = None, + ess_trsh_methods: Optional[List[str]] = None, + execution_type: Optional[str] = None, + fine: bool = False, + initial_time: Optional[Union['datetime.datetime', str]] = None, + irc_direction: Optional[str] = None, + job_id: Optional[int] = None, + job_memory_gb: float = 14.0, + job_name: Optional[str] = None, + job_num: Optional[int] = None, + job_server_name: Optional[str] = None, + job_status: Optional[List[Union[dict, str]]] = None, + level: Optional['Level'] = None, + max_job_time: Optional[float] = None, + run_multi_species: bool = False, + reactions: Optional[List['ARCReaction']] = None, + rotor_index: Optional[int] = None, + server: Optional[str] = None, + server_nodes: Optional[list] = None, + queue: Optional[str] = None, + attempted_queues: Optional[List[str]] = None, + species: Optional[List[ARCSpecies]] = None, + testing: bool = False, + times_rerun: int = 0, + torsions: Optional[List[List[int]]] = None, + tsg: Optional[int] = None, + xyz: Optional[dict] = None, + ): + + self.incore_capacity = 50 + self.job_adapter = 'crest' + self.command = None + self.execution_type = execution_type or 'incore' + + if reactions is None: + raise ValueError('Cannot execute TS CREST without ARCReaction object(s).') + + dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT + + _initialize_adapter(obj=self, + is_ts=True, + project=project, + project_directory=project_directory, + job_type=job_type, + args=args, + bath_gas=bath_gas, + checkfile=checkfile, + conformer=conformer, + 
constraints=constraints, + cpu_cores=cpu_cores, + dihedral_increment=dihedral_increment, + dihedrals=dihedrals, + directed_scan_type=directed_scan_type, + ess_settings=ess_settings, + ess_trsh_methods=ess_trsh_methods, + fine=fine, + initial_time=initial_time, + irc_direction=irc_direction, + job_id=job_id, + job_memory_gb=job_memory_gb, + job_name=job_name, + job_num=job_num, + job_server_name=job_server_name, + job_status=job_status, + level=level, + max_job_time=max_job_time, + run_multi_species=run_multi_species, + reactions=reactions, + rotor_index=rotor_index, + server=server, + server_nodes=server_nodes, + queue=queue, + attempted_queues=attempted_queues, + species=species, + testing=testing, + times_rerun=times_rerun, + torsions=torsions, + tsg=tsg, + xyz=xyz, + ) + + def write_input_file(self) -> None: + pass + + def set_files(self) -> None: + pass + + def set_additional_file_paths(self) -> None: + pass + + def set_input_file_memory(self) -> None: + pass + + def execute_incore(self): + self._log_job_execution() + self.initial_time = self.initial_time if self.initial_time else datetime.datetime.now() + + supported_families = [key for key, val in ts_adapters_by_rmg_family.items() if 'crest' in val] + + self.reactions = [self.reactions] if not isinstance(self.reactions, list) else self.reactions + for rxn in self.reactions: + if rxn.family not in supported_families: + logger.warning(f'The CREST TS search adapter does not support the {rxn.family} reaction family.') + continue + if any(spc.get_xyz() is None for spc in rxn.r_species + rxn.p_species): + logger.warning(f'The CREST TS search adapter cannot process a reaction if 3D coordinates of ' + f'some/all of its reactants/products are missing.\nNot processing {rxn}.') + continue + if not crest_available(): + logger.warning('CREST is not available. 
Skipping CREST TS search.') + break + + if rxn.ts_species is None: + rxn.ts_species = ARCSpecies(label='TS', + is_ts=True, + charge=rxn.charge, + multiplicity=rxn.multiplicity, + ) + + tsg = TSGuess(method='CREST') + tsg.tic() + + crest_job_dirs = [] + xyz_guesses = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + if not xyz_guesses: + logger.warning(f'CREST TS search failed to generate any seed guesses for {rxn.label}.') + tsg.tok() + continue + + for iteration, xyz_entry in enumerate(xyz_guesses): + xyz_guess = xyz_entry.get("xyz") + family = xyz_entry.get("family", rxn.family) + if xyz_guess is None: + continue + + crest_constraint_atoms = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=xyz_entry, + ) + if not crest_constraint_atoms: + logger.warning( + f"Could not determine CREST constraint atoms for {rxn.label} crest seed {iteration} " + f"(family: {family}). Skipping this CREST seed." + ) + continue + + crest_job_dir = crest_ts_conformer_search( + xyz_guess, + crest_constraint_atoms["A"], + crest_constraint_atoms["H"], + crest_constraint_atoms["B"], + path=self.local_path, + xyz_crest_int=iteration, + ) + crest_job_dirs.append(crest_job_dir) + + if not crest_job_dirs: + logger.warning(f'CREST TS search failed to prepare any jobs for {rxn.label}.') + tsg.tok() + continue + + crest_jobs = submit_crest_jobs(crest_job_dirs) + monitor_crest_jobs(crest_jobs) + xyz_guesses_crest = process_completed_jobs(crest_jobs) + tsg.tok() + + for method_index, xyz in enumerate(xyz_guesses_crest): + if xyz is None: + continue + unique = True + for other_tsg in rxn.ts_species.ts_guesses: + if almost_equal_coords(xyz, other_tsg.initial_xyz): + if hasattr(other_tsg, "method_sources"): + other_tsg.method_sources = other_tsg._normalize_method_sources( + (other_tsg.method_sources or []) + ["crest"] + ) + unique = False + break + if unique: + ts_guess = TSGuess(method='CREST', + 
index=len(rxn.ts_species.ts_guesses), + method_index=method_index, + t0=tsg.t0, + execution_time=tsg.execution_time, + success=True, + family=rxn.family, + xyz=xyz, + ) + rxn.ts_species.ts_guesses.append(ts_guess) + save_geo(xyz=xyz, + path=self.local_path, + filename=f'CREST_{method_index}', + format_='xyz', + comment=f'CREST {method_index}, family: {rxn.family}', + ) + + if len(self.reactions) < 5: + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'crest' in tsg.method.lower()] + if successes: + logger.info(f'CREST successfully found {len(successes)} TS guesses for {rxn.label}.') + else: + logger.info(f'CREST did not find any successful TS guesses for {rxn.label}.') + + self.final_time = datetime.datetime.now() + + def execute_queue(self): + self.execute_incore() + + +def crest_ts_conformer_search( + xyz_guess: dict, + a_atom: int, + h_atom: int, + b_atom: int, + path: str = "", + xyz_crest_int: int = 0, +) -> str: + """ + Prepare a CREST TS conformer search job: + - Write coords.ref and constraints.inp + - Write a PBS/HTCondor submit script using submit_scripts["local"]["crest"] + - Return the CREST job directory path + """ + path = os.path.join(path, f"crest_{xyz_crest_int}") + os.makedirs(path, exist_ok=True) + + # --- coords.ref --- + symbols = xyz_guess["symbols"] + converted_coords = reorder_xyz_string( + xyz_str=xyz_to_str(xyz_guess), + reverse_atoms=True, + convert_to="bohr", + ) + coords_ref_content = f"$coord\n{converted_coords}\n$end\n" + coords_ref_path = os.path.join(path, "coords.ref") + with open(coords_ref_path, "w") as f: + f.write(coords_ref_content) + + # --- constraints.inp --- + num_atoms = len(symbols) + # CREST uses 1-based indices + a_atom += 1 + h_atom += 1 + b_atom += 1 + + # All atoms not directly involved in A–H–B go into the metadynamics atom list + list_of_atoms_numbers_not_participating_in_reaction = [ + i for i in range(1, num_atoms + 1) if i not in [a_atom, h_atom, b_atom] + ] + + constraints_path = 
os.path.join(path, "constraints.inp") + with open(constraints_path, "w") as f: + f.write("$constrain\n") + f.write(f" atoms: {a_atom}, {h_atom}, {b_atom}\n") + f.write(" force constant: 0.5\n") + f.write(" reference=coords.ref\n") + f.write(f" distance: {a_atom}, {h_atom}, auto\n") + f.write(f" distance: {h_atom}, {b_atom}, auto\n") + f.write("$metadyn\n") + if list_of_atoms_numbers_not_participating_in_reaction: + f.write( + f' atoms: {", ".join(map(str, list_of_atoms_numbers_not_participating_in_reaction))}\n' + ) + f.write("$end\n") + + # --- build CREST command string --- + # Example: crest coords.ref --cinp constraints.inp --noreftopo -T 8 + local_server = SERVERS.get("local", {}) + cpus = int(local_server.get("cpus", 8)) + if CREST_ENV_PATH: + crest_exe = "crest" + else: + crest_exe = CREST_PATH if CREST_PATH is not None else "crest" + + commands = [ + crest_exe, + "coords.ref", + "--cinp constraints.inp", + "--noreftopo", + f"-T {cpus}", + ] + command = " ".join(commands) + + # --- activation line (optional) --- + activation_line = CREST_ENV_PATH or "" + + if SERVERS.get("local") is not None: + cluster_soft = SERVERS["local"]["cluster_soft"].lower() + local_templates = submit_scripts.get("local", {}) + crest_template = local_templates.get("crest") + crest_job_template = local_templates.get("crest_job") + + if cluster_soft in ["condor", "htcondor"]: + # HTCondor branch with a built-in fallback template. 
+ if crest_template is None: + crest_template = ( + "universe = vanilla\n" + "executable = job.sh\n" + "output = out.txt\n" + "error = err.txt\n" + "log = log.txt\n" + "request_cpus = {cpus}\n" + "request_memory = {memory}\n" + "JobBatchName = {name}\n" + "queue\n" + ) + if crest_job_template is None: + crest_job_template = ( + "#!/bin/bash -l\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + "memory": int(SERVERS["local"].get("memory", 32.0) * 1024), + } + sub_job = sub_job.format(**format_params) + + with open( + os.path.join(path, settings["submit_filenames"]["HTCondor"]), "w" + ) as f: + f.write(sub_job) + + crest_job = crest_job_template.format( + path=path, + activation_line=activation_line, + commands=command, + ) + + with open(os.path.join(path, "job.sh"), "w") as f: + f.write(crest_job) + os.chmod(os.path.join(path, "job.sh"), 0o700) + + # Pre-create out/err for any status checkers that expect them + for fname in ("out.txt", "err.txt"): + fpath = os.path.join(path, fname) + if not os.path.exists(fpath): + with open(fpath, "w") as f: + f.write("") + os.chmod(fpath, 0o600) + + elif cluster_soft == "pbs": + # PBS branch with a built-in fallback template. 
check_interval (int): Time interval (in seconds) to wait between status checks; effectively capped at MAX_CHECK_INTERVAL_SECONDS.
+ """ + while True: + all_done = True + for job_id, job_info in crest_jobs.items(): + if job_info["status"] not in ["done", "failed"]: + try: + job_info["status"] = check_job_status(job_id) # Update job status + except Exception as e: + logger.error(f"Error checking job status for job {job_id}: {e}") + job_info["status"] = "failed" + if job_info["status"] not in ["done", "failed"]: + all_done = False + if all_done: + break + time.sleep(min(check_interval, MAX_CHECK_INTERVAL_SECONDS)) + + +def process_completed_jobs(crest_jobs: dict) -> list: + """ + Process the completed CREST jobs and update XYZ guesses. + + Args: + crest_jobs (dict): Dictionary containing job information. + """ + xyz_guesses = [] + for job_id, job_info in crest_jobs.items(): + crest_path = job_info["path"] + if job_info["status"] == "done": + crest_best_path = os.path.join(crest_path, "crest_best.xyz") + if os.path.exists(crest_best_path): + with open(crest_best_path, "r") as f: + content = f.read() + xyz_guess = str_to_xyz(content) + xyz_guesses.append(xyz_guess) + else: + logger.error(f"crest_best.xyz not found in {crest_path}") + elif job_info["status"] == "failed": + logger.error(f"CREST job failed for {crest_path}") + + return xyz_guesses + +register_job_adapter('crest', CrestAdapter) diff --git a/arc/job/adapters/ts/crest_test.py b/arc/job/adapters/ts/crest_test.py new file mode 100644 index 0000000000..e243d8d43d --- /dev/null +++ b/arc/job/adapters/ts/crest_test.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.job.adapters.ts.crest +""" + +import os +import tempfile +import unittest + +from arc.species.converter import str_to_xyz + + +class TestCrestAdapter(unittest.TestCase): + """ + Tests for CREST input generation. 
+ """ + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmpdir.cleanup() + + def test_creates_valid_input_files(self): + """ + Ensure CREST inputs are written with expected content/format. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = { + "local": { + "crest": ( + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + ), + "crest_job": "{activation_line}\ncd {path}\n{commands}\n", + } + } + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=0 + ) + + coords_path = os.path.join(crest_dir, "coords.ref") + constraints_path = os.path.join(crest_dir, "constraints.inp") + submit_path = os.path.join(crest_dir, "submit.sh") + + self.assertTrue(os.path.exists(coords_path)) + self.assertTrue(os.path.exists(constraints_path)) + self.assertTrue(os.path.exists(submit_path)) + + with open(coords_path) as f: + coords = f.read().strip().splitlines() + self.assertEqual(coords[0].strip(), "$coord") + self.assertEqual(coords[-1].strip(), "$end") + self.assertEqual(len(coords) - 2, len(xyz["symbols"])) + + with open(constraints_path) as f: + constraints = f.read() + self.assertIn("atoms: 1, 2, 3", constraints) + self.assertIn("force constant: 0.5", constraints) + self.assertIn("reference=coords.ref", constraints) + 
self.assertIn("distance: 1, 2, auto", constraints) + self.assertIn("distance: 2, 3, auto", constraints) + self.assertIn("$metadyn", constraints) + self.assertTrue(constraints.strip().endswith("$end")) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + def test_creates_submit_file_without_crest_templates(self): + """ + Ensure fallback submit template generation works when submit.py has no CREST templates. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = {"local": {}} + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=1 + ) + + submit_path = os.path.join(crest_dir, "submit.sh") + self.assertTrue(os.path.exists(submit_path)) + with open(submit_path) as f: + submit_text = f.read() + self.assertIn("#PBS -q testq", submit_text) + self.assertIn("coords.ref --cinp constraints.inp --noreftopo -T 4", submit_text) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + +if __name__ == "__main__": + unittest.main() 
diff --git a/arc/job/adapters/ts/heuristics.py b/arc/job/adapters/ts/heuristics.py index 9031aa9ec3..8582735cab 100644 --- a/arc/job/adapters/ts/heuristics.py +++ b/arc/job/adapters/ts/heuristics.py @@ -21,28 +21,44 @@ import os from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from arc.common import (ARC_PATH, almost_equal_coords, get_angle_in_180_range, get_logger, is_angle_linear, - is_xyz_linear, key_by_val, read_yaml_file) +from arc.common import ( + ARC_PATH, + almost_equal_coords, + get_angle_in_180_range, + get_logger, + is_angle_linear, + is_xyz_linear, + key_by_val, + read_yaml_file, +) from arc.family import get_reaction_family_products from arc.job.adapter import JobAdapter from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family from arc.job.factory import register_job_adapter from arc.plotter import save_geo -from arc.species.converter import (compare_zmats, relocate_zmat_dummy_atoms_to_the_end, zmat_from_xyz, zmat_to_xyz, - add_atom_to_xyz_using_internal_coords, sorted_distances_of_atom) +from arc.species.converter import ( + add_atom_to_xyz_using_internal_coords, + compare_zmats, + relocate_zmat_dummy_atoms_to_the_end, + sorted_distances_of_atom, + zmat_from_xyz, + zmat_to_xyz, +) from arc.mapping.engine import map_two_species from arc.molecule.molecule import Molecule from arc.species.species import ARCSpecies, TSGuess, SpeciesError, colliding_atoms from arc.species.zmat import get_parameter_from_atom_indices, remove_zmat_atom_0, up_param, xyz_to_zmat from arc.species.vectors import calculate_angle +from arc.job.adapters.ts.seed_hub import get_ts_seeds if TYPE_CHECKING: from arc.level import Level from arc.reaction import ARCReaction - -FAMILY_SETS = {'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], - 'hydrolysis_set_2': ['nitrile_hydrolysis']} +FAMILY_SETS = { + 'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], + 'hydrolysis_set_2': 
['nitrile_hydrolysis'], +} DIHEDRAL_INCREMENT = 30 @@ -258,56 +274,60 @@ def execute_incore(self): multiplicity=rxn.multiplicity, ) - xyzs = list() - tsg, families = None, None - if rxn.family == 'H_Abstraction': - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs = h_abstraction(reaction=rxn, dihedral_increment=self.dihedral_increment) - tsg.tok() - + tsg = TSGuess(method='Heuristics') + tsg.tic() + xyzs = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + tsg.tok() if rxn.family in FAMILY_SETS['hydrolysis_set_1'] or rxn.family in FAMILY_SETS['hydrolysis_set_2']: - try: - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs, families, indices = hydrolysis(reaction=rxn) - tsg.tok() - if not xyzs: - logger.warning(f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.') - continue - except ValueError: + if not xyzs: + logger.warning( + f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.' 
+ ) continue - for method_index, xyz in enumerate(xyzs): + for method_index, xyz_entry in enumerate(xyzs): + xyz = xyz_entry.get("xyz") + method_label = xyz_entry.get("method", "Heuristics") + family = xyz_entry.get("family", rxn.family) + if xyz is None: + continue unique = True for other_tsg in rxn.ts_species.ts_guesses: if almost_equal_coords(xyz, other_tsg.initial_xyz): - if 'heuristics' not in other_tsg.method.lower(): - other_tsg.method += ' and Heuristics' + existing_sources = getattr(other_tsg, "method_sources", None) + if existing_sources is not None: + combined_sources = list(existing_sources) + [method_label] + else: + combined_sources = [other_tsg.method, method_label] + other_tsg.method_sources = TSGuess._normalize_method_sources(combined_sources) unique = False break if unique: - ts_guess = TSGuess(method='Heuristics', + ts_guess = TSGuess(method=method_label, index=len(rxn.ts_species.ts_guesses), method_index=method_index, t0=tsg.t0, execution_time=tsg.execution_time, success=True, - family=rxn.family if families is None else families[method_index], + family=family, xyz=xyz, ) rxn.ts_species.ts_guesses.append(ts_guess) save_geo(xyz=xyz, path=self.local_path, - filename=f'Heuristics_{method_index}', + filename=f'{method_label}_{method_index}', format_='xyz', - comment=f'Heuristics {method_index}, family: {rxn.family}', + comment=f'{method_label} {method_index}, family: {rxn.family}', ) if len(self.reactions) < 5: - successes = len([tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'heuristics' in tsg.method]) + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success] if successes: - logger.info(f'Heuristics successfully found {successes} TS guesses for {rxn.label}.') + logger.info(f'Heuristics successfully found {len(successes)} TS guesses for {rxn.label}.') else: logger.info(f'Heuristics did not find any successful TS guesses for {rxn.label}.') @@ -873,7 +893,7 @@ def h_abstraction(reaction: 'ARCReaction', dihedral_increment 
(int, optional): The dihedral increment to use for B-H-A-C and D-B-H-C dihedral scans. Returns: List[dict] - Entries are Cartesian coordinates of TS guesses for all reactions. + Entries hold Cartesian coordinates of TS guesses and the generating method label. """ xyz_guesses = list() dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT @@ -952,7 +972,8 @@ def h_abstraction(reaction: 'ARCReaction', else: # This TS is unique, and has no atom collisions. zmats.append(zmat_guess) - xyz_guesses.append(xyz_guess) + xyz_guesses.append({"xyz": xyz_guess, "method": "Heuristics"}) + return xyz_guesses @@ -987,9 +1008,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -999,9 +1022,19 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in } adjustments_to_try = [False, True] if dihedrals_to_change_num == 1 else [True] for adjust_dihedral in adjustments_to_try: - chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices(initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters,reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=adjust_dihedral) + chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=adjust_dihedral, + ) 
max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) if xyz_guesses: xyz_guesses_total.extend(xyz_guesses) @@ -1015,8 +1048,8 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in condition_met = len(xyz_guesses_total) > 0 nitrile_in_inputs = any( - (pd.get("family") == "nitrile_hydrolysis") or - (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) + (pd.get("family") == "nitrile_hydrolysis") + or (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) for pd in product_dicts ) nitrile_already_found = any(fam == "nitrile_hydrolysis" for fam in reaction_families) @@ -1032,9 +1065,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -1048,10 +1083,18 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in break dihedrals_to_change_num += 1 chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( - initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters, reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=True, - allow_nitrile_dihedrals=True + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=True, + allow_nitrile_dihedrals=True, ) max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) @@ -1083,11 
+1126,13 @@ def get_products_and_check_families(reaction: 'ARCReaction') -> Tuple[List[dict] consider_arc_families=True, ) carbonyl_based_present = any( - "carbonyl_based_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "carbonyl_based_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) ether_present = any( - "ether_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "ether_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) @@ -1118,9 +1163,11 @@ def has_carbonyl_based_hydrolysis(reaction_families: List[dict]) -> bool: return any(family == "carbonyl_based_hydrolysis" for family in reaction_families) -def extract_reactant_and_indices(reaction: 'ARCReaction', - product_dict: dict, - is_set_1: bool) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: +def extract_reactant_and_indices( + reaction: 'ARCReaction', + product_dict: dict, + is_set_1: bool, +) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: """ Extract the reactant molecules and relevant atomic indices (a,b,e,d,o,h1) for the hydrolysis reaction. 
@@ -1163,11 +1210,13 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', main_reactant, a_xyz_index, b_xyz_index, - two_neighbors + two_neighbors, ) except ValueError as e: - raise ValueError(f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " - f"in species {main_reactant.label}: {e}") + raise ValueError( + f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " + f"in species {main_reactant.label}: {e}" + ) o_index = len(main_reactant.mol.atoms) h1_index = o_index + 1 @@ -1178,25 +1227,26 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', "e": e_xyz_index, "d": d_xyz_indices, "o": o_index, - "h1": h1_index + "h1": h1_index, } return main_reactant, water, initial_xyz, xyz_indices -def process_chosen_d_indices(initial_xyz: dict, - base_xyz_indices: dict, - xyz_indices: dict, - hydrolysis_parameters: dict, - reaction_family: str, - water: 'ARCSpecies', - zmats_total: List[dict], - is_set_1: bool, - is_set_2: bool, - dihedrals_to_change_num: int, - should_adjust_dihedral: bool, - allow_nitrile_dihedrals: bool = False - ) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: +def process_chosen_d_indices( + initial_xyz: dict, + base_xyz_indices: dict, + xyz_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, + water: 'ARCSpecies', + zmats_total: List[dict], + is_set_1: bool, + is_set_2: bool, + dihedrals_to_change_num: int, + should_adjust_dihedral: bool, + allow_nitrile_dihedrals: bool = False, +) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: """ Iterates over the 'd' indices to process TS guess generation. @@ -1214,7 +1264,6 @@ def process_chosen_d_indices(initial_xyz: dict, should_adjust_dihedral (bool): Whether to adjust dihedral angles. allow_nitrile_dihedrals (bool, optional): Force-enable dihedral adjustments for nitriles. Defaults to False. 
- Returns: Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]]]: - Chosen indices for TS generation. @@ -1224,11 +1273,18 @@ def process_chosen_d_indices(initial_xyz: dict, """ max_dihedrals_found = 0 for d_index in xyz_indices.get("d", []) or [None]: - chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else {**base_xyz_indices, - "d": None} + chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else { + **base_xyz_indices, + "d": None, + } current_zmat, zmat_indices = setup_zmat_indices(initial_xyz, chosen_xyz_indices) - matches = get_matching_dihedrals(current_zmat, zmat_indices['a'], zmat_indices['b'], - zmat_indices['e'], zmat_indices['d']) + matches = get_matching_dihedrals( + current_zmat, + zmat_indices['a'], + zmat_indices['b'], + zmat_indices['e'], + zmat_indices['d'], + ) max_dihedrals_found = max(max_dihedrals_found, len(matches)) if should_adjust_dihedral and dihedrals_to_change_num > len(matches): continue @@ -1246,22 +1302,28 @@ def process_chosen_d_indices(initial_xyz: dict, zmat_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors) if zmat_variants: adjusted_zmats.extend(zmat_variants) - if not adjusted_zmats: - pass - else: + if adjusted_zmats: zmats_to_process = adjusted_zmats ts_guesses_list = [] for zmat_to_process in zmats_to_process: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total) + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, + ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) if attempted_dihedral_adjustments and not ts_guesses_list and ( - reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals): - flipped_zmats= [] + reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals + ): + 
flipped_zmats = [] adjustment_factors = [15, 25, 35, 45, 55] for indices in indices_list: flipped_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors, flip=True) @@ -1269,8 +1331,14 @@ def process_chosen_d_indices(initial_xyz: dict, for zmat_to_process in flipped_zmats: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) @@ -1311,10 +1379,12 @@ def get_main_reactant_and_water_from_hydrolysis_reaction(reaction: 'ARCReaction' return arc_reactant, water -def get_neighbors_by_electronegativity(spc: 'ARCSpecies', - atom_index: int, - exclude_index: int, - two_neighbors: bool = True) -> Tuple[int, List[int]]: +def get_neighbors_by_electronegativity( + spc: 'ARCSpecies', + atom_index: int, + exclude_index: int, + two_neighbors: bool = True, +) -> Tuple[int, List[int]]: """ Retrieve the top two neighbors of a given atom in a species, sorted by their effective electronegativity, excluding a specified neighbor. @@ -1340,8 +1410,11 @@ def get_neighbors_by_electronegativity(spc: 'ARCSpecies', Raises: ValueError: If the atom has no valid neighbors. 
""" - neighbors = [neighbor for neighbor in spc.mol.atoms[atom_index].edges.keys() - if spc.mol.atoms.index(neighbor) != exclude_index] + neighbors = [ + neighbor + for neighbor in spc.mol.atoms[atom_index].edges.keys() + if spc.mol.atoms.index(neighbor) != exclude_index + ] if not neighbors: raise ValueError(f"Atom at index {atom_index} has no valid neighbors.") @@ -1355,12 +1428,17 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: float: The total electronegativity of the neighbor """ return sum( - ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order - for n in neighbor.edges.keys() + ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order for n in neighbor.edges.keys() ) - effective_electronegativities = [(ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, - get_neighbor_total_electronegativity(n), n ) for n in neighbors] + effective_electronegativities = [ + ( + ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, + get_neighbor_total_electronegativity(n), + n, + ) + for n in neighbors + ] effective_electronegativities.sort(reverse=True, key=lambda x: (x[0], x[1])) sorted_neighbors = [spc.mol.atoms.index(n[2]) for n in effective_electronegativities] most_electronegative = sorted_neighbors[0] @@ -1368,8 +1446,7 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: return most_electronegative, remaining_neighbors -def setup_zmat_indices(initial_xyz: dict, - xyz_indices: dict) -> Tuple[dict, dict]: +def setup_zmat_indices(initial_xyz: dict, xyz_indices: dict) -> Tuple[dict, dict]: """ Convert XYZ coordinates to Z-matrix format and set up corresponding indices. 
@@ -1387,26 +1464,28 @@ def setup_zmat_indices(initial_xyz: dict, 'a': key_by_val(initial_zmat.get('map', {}), xyz_indices['a']), 'b': key_by_val(initial_zmat.get('map', {}), xyz_indices['b']), 'e': key_by_val(initial_zmat.get('map', {}), xyz_indices['e']), - 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None + 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None, } return initial_zmat, zmat_indices -def generate_dihedral_variants(zmat: dict, - indices: List[int], - adjustment_factors: List[float], - flip: bool = False, - tolerance_degrees: float = 10.0) -> List[dict]: +def generate_dihedral_variants( + zmat: dict, + indices: List[int], + adjustment_factors: List[float], + flip: bool = False, + tolerance_degrees: float = 10.0, +) -> List[dict]: """ - Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. + Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. This function creates variants of the Z-matrix using different adjustment factors: - 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. - 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid - unstable, boundary configurations. - 3. If `flip=True`, the same procedure is applied starting from a flipped - (180°-shifted) baseline angle. - 4. Each adjusted or flipped variant is deep-copied to ensure independence. + 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. + 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid + unstable, boundary configurations. + 3. If `flip=True`, the same procedure is applied starting from a flipped + (180°-shifted) baseline angle. + 4. Each adjusted or flipped variant is deep-copied to ensure independence. Args: zmat (dict): The initial Z-matrix. 
@@ -1414,7 +1493,8 @@ def generate_dihedral_variants(zmat: dict, adjustment_factors (List[float], optional): List of factors to try. flip (bool, optional): Whether to start from a flipped (180°) baseline dihedral angle. Defaults to False. - tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. Defaults to 10.0. + tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. + Defaults to 10.0. Returns: List[dict]: List of Z-matrix variants with adjusted dihedral angles. @@ -1440,8 +1520,9 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: seed_value = normalized_value if flip: seed_value = get_angle_in_180_range(normalized_value + 180.0) - boundary_like = ((abs(seed_value) < tolerance_degrees) - or (180 - tolerance_degrees <= abs(seed_value) <= 180+tolerance_degrees)) + boundary_like = (abs(seed_value) < tolerance_degrees) or ( + 180 - tolerance_degrees <= abs(seed_value) <= 180 + tolerance_degrees + ) if boundary_like: for factor in adjustment_factors: variant = copy.deepcopy(zmat) @@ -1450,11 +1531,13 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: return variants -def get_matching_dihedrals(zmat: dict, - a: int, - b: int, - e: int, - d: Optional[int]) -> List[List[int]]: +def get_matching_dihedrals( + zmat: dict, + a: int, + b: int, + e: int, + d: Optional[int], +) -> List[List[int]]: """ Retrieve all dihedral angles in the Z-matrix that match the given atom indices. 
This function scans the Z-matrix for dihedral parameters (keys starting with 'D_' or 'DX_') @@ -1484,11 +1567,13 @@ def get_matching_dihedrals(zmat: dict, return matches -def stretch_ab_bond(initial_zmat: 'dict', - xyz_indices: 'dict', - zmat_indices: 'dict', - hydrolysis_parameters: 'dict', - reaction_family: str) -> None: +def stretch_ab_bond( + initial_zmat: dict, + xyz_indices: dict, + zmat_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, +) -> None: """ Stretch the bond between atoms a and b in the Z-matrix based on the reaction family parameters. @@ -1519,16 +1604,18 @@ def stretch_ab_bond(initial_zmat: 'dict', stretch_zmat_bond(zmat=initial_zmat, indices=indices, stretch=stretch_degree) -def process_family_specific_adjustments(is_set_1: bool, - is_set_2: bool, - reaction_family: str, - hydrolysis_parameters: dict, - initial_zmat: dict, - water: 'ARCSpecies', - xyz_indices: dict, - zmats_total: List[dict]) -> Tuple[List[dict], List[dict]]: +def process_family_specific_adjustments( + is_set_1: bool, + is_set_2: bool, + reaction_family: str, + hydrolysis_parameters: dict, + initial_zmat: dict, + water: 'ARCSpecies', + xyz_indices: dict, + zmats_total: List[dict], +) -> Tuple[List[dict], List[dict]]: """ - Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses . + Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses. Args: is_set_1 (bool): Whether the reaction belongs to parameter set 1. @@ -1546,38 +1633,52 @@ def process_family_specific_adjustments(is_set_1: bool, Raises: ValueError: If the reaction family is not supported. 
""" - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices.values() + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices.values() r_atoms = [a_xyz, o_xyz, o_xyz] a_atoms = [[b_xyz, a_xyz], [a_xyz, o_xyz], [h1_xyz, o_xyz]] - d_atoms = ([[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] - if d_xyz is not None else - [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]]) + d_atoms = ( + [[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + if d_xyz is not None + else [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + ) r_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['r_value'] a_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['a_value'] d_values = hydrolysis_parameters['family_parameters'][str(reaction_family)]['d_values'] if is_set_1 or is_set_2: initial_xyz = zmat_to_xyz(initial_zmat) - return generate_hydrolysis_ts_guess(initial_xyz, xyz_indices.values(), water, r_atoms, a_atoms, d_atoms, - r_value, a_value, d_values, zmats_total, is_set_1, - threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8) + return generate_hydrolysis_ts_guess( + initial_xyz, + xyz_indices.values(), + water, + r_atoms, + a_atoms, + d_atoms, + r_value, + a_value, + d_values, + zmats_total, + is_set_1, + threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8, + ) else: raise ValueError(f"Family {reaction_family} not supported for hydrolysis TS guess generation.") -def generate_hydrolysis_ts_guess(initial_xyz: dict, - xyz_indices: List[int], - water: 'ARCSpecies', - r_atoms: List[int], - a_atoms: List[List[int]], - d_atoms: List[List[int]], - r_value: List[float], - a_value: List[float], - d_values: List[List[float]], - zmats_total: List[dict], - is_set_1: bool, - threshold: float - ) -> Tuple[List[dict], List[dict]]: +def generate_hydrolysis_ts_guess( + initial_xyz: dict, + xyz_indices: List[int], + water: 'ARCSpecies', + r_atoms: 
List[int], + a_atoms: List[List[int]], + d_atoms: List[List[int]], + r_value: List[float], + a_value: List[float], + d_values: List[List[float]], + zmats_total: List[dict], + is_set_1: bool, + threshold: float, +) -> Tuple[List[dict], List[dict]]: """ Generate Z-matrices and Cartesian coordinates for transition state (TS) guesses. @@ -1600,7 +1701,7 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, """ xyz_guesses = [] - for index, d_value in enumerate(d_values): + for d_value in d_values: xyz_guess = copy.deepcopy(initial_xyz) for i in range(3): xyz_guess = add_atom_to_xyz_using_internal_coords( @@ -1611,23 +1712,22 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, d_indices=d_atoms[i], r_value=r_value[i], a_value=a_value[i], - d_value=d_value[i] + d_value=d_value[i], ) - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices - are_valid_bonds=check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz+1, a_xyz, b_xyz]) - colliding=colliding_atoms(xyz_guess, threshold=threshold) + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices + are_valid_bonds = check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz + 1, a_xyz, b_xyz]) + colliding = colliding_atoms(xyz_guess, threshold=threshold) duplicate = any(compare_zmats(existing, xyz_to_zmat(xyz_guess)) for existing in zmats_total) if is_set_1: - dihedral_edao=[e_xyz, d_xyz, a_xyz, o_xyz] - dao_is_linear=check_dao_angle(dihedral_edao, xyz_guess) + dihedral_edao = [e_xyz, d_xyz, a_xyz, o_xyz] + dao_is_linear = check_dao_angle(dihedral_edao, xyz_guess) else: - dao_is_linear=False + dao_is_linear = False if xyz_guess is not None and not colliding and not duplicate and are_valid_bonds and not dao_is_linear: xyz_guesses.append(xyz_guess) zmats_total.append(xyz_to_zmat(xyz_guess)) - return xyz_guesses, zmats_total @@ -1644,7 +1744,7 @@ def check_dao_angle(d_indices: List[int], xyz_guess: dict) -> bool: """ angle_indices = [d_indices[1], d_indices[2], d_indices[3]] angle_value = calculate_angle(xyz_guess, angle_indices) - 
norm_value=(angle_value + 180) % 180 + norm_value = (angle_value + 180) % 180 return (norm_value < 10) or (norm_value > 170) @@ -1659,7 +1759,7 @@ def check_ts_bonds(transition_state_xyz: dict, tested_atom_indices: list) -> boo Returns: bool: Whether the transition state guess has the expected water-related bonds. """ - oxygen_index, h1_index, h2_index, a_index, b_index= tested_atom_indices + oxygen_index, h1_index, h2_index, a_index, b_index = tested_atom_indices oxygen_bonds = sorted_distances_of_atom(transition_state_xyz, oxygen_index) h1_bonds = sorted_distances_of_atom(transition_state_xyz, h1_index) h2_bonds = sorted_distances_of_atom(transition_state_xyz, h2_index) @@ -1678,10 +1778,12 @@ def check_oxygen_bonds(bonds): return rel_error <= 0.1 return False - oxygen_has_valid_bonds = (oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds)) - h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}and h1_bonds[1][0] in {oxygen_index, b_index}) + oxygen_has_valid_bonds = oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds) + h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}) and ( + h1_bonds[1][0] in {oxygen_index, b_index} + ) h2_has_valid_bonds = h2_bonds[0][0] == oxygen_index return oxygen_has_valid_bonds and h1_has_valid_bonds and h2_has_valid_bonds -register_job_adapter('heuristics', HeuristicsAdapter) +register_job_adapter("heuristics", HeuristicsAdapter) diff --git a/arc/job/adapters/ts/heuristics_test.py b/arc/job/adapters/ts/heuristics_test.py index 250e10d852..fba89e9462 100644 --- a/arc/job/adapters/ts/heuristics_test.py +++ b/arc/job/adapters/ts/heuristics_test.py @@ -10,6 +10,8 @@ import os import shutil import unittest +from types import SimpleNamespace +from unittest.mock import patch from arc.common import ARC_TESTING_PATH, almost_equal_coords from arc.family import get_reaction_family_products @@ -31,6 +33,7 @@ check_dao_angle, check_ts_bonds, ) +from arc.job.adapters.ts.seed_hub import 
get_ts_seeds, get_wrapper_constraints from arc.reaction import ARCReaction from arc.species.converter import str_to_xyz, zmat_to_xyz, zmat_from_xyz from arc.species.species import ARCSpecies @@ -2258,5 +2261,61 @@ def tearDownClass(cls): shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'heuristics_1'), ignore_errors=True) +class TestHeuristicsHub(unittest.TestCase): + """Unit tests for shared heuristic seed and CREST-constraint helpers.""" + + def test_get_ts_seeds_h_abstraction(self): + rxn = SimpleNamespace(family='H_Abstraction') + with patch('arc.job.adapters.ts.heuristics.h_abstraction', + return_value=[{'xyz': {'symbols': ('H',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (1,)}, + 'method': 'Heuristics'}]): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics', dihedral_increment=60) + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'H_Abstraction') + self.assertEqual(seeds[0]['method'], 'Heuristics') + self.assertEqual(seeds[0]['source_adapter'], 'heuristics') + + def test_get_ts_seeds_hydrolysis(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = {'symbols': ('O',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (16,)} + with patch('arc.job.adapters.ts.heuristics.hydrolysis', + return_value=([xyz], ['carbonyl_based_hydrolysis'], [[0, 1, 2]])): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics') + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'carbonyl_based_hydrolysis') + self.assertEqual(seeds[0]['xyz'], xyz) + self.assertEqual(seeds[0]['metadata'], {'indices': [0, 1, 2]}) + + def test_get_wrapper_constraints_crest(self): + rxn = SimpleNamespace(family='H_Abstraction') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsInstance(atoms, dict) + self.assertSetEqual(set(atoms.keys()), {'A', 'H', 
'B'}) + self.assertTrue(all(isinstance(v, int) for v in atoms.values())) + + def test_get_wrapper_constraints_crest_unsupported_family(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsNone(atoms) + + def test_get_ts_seeds_unsupported_adapter(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_ts_seeds(reaction=rxn, base_adapter='gcn') + + def test_get_wrapper_constraints_unsupported_wrapper(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_wrapper_constraints(wrapper='foo_wrapper', reaction=rxn, seed={}) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/ts/seed_hub.py b/arc/job/adapters/ts/seed_hub.py new file mode 100644 index 0000000000..4a38254cdb --- /dev/null +++ b/arc/job/adapters/ts/seed_hub.py @@ -0,0 +1,168 @@ +""" +Shared TS-seed and wrapper-constraint hub. + +This module centralizes: +1. How TS seeds are requested from a base TS-search adapter. +2. How wrapper adapters (e.g., CREST) request family-specific constraints for a seed. +""" + +from typing import Dict, List, Optional + +from arc.common import get_logger +from arc.species.converter import xyz_to_dmat + +logger = get_logger() + + +def get_ts_seeds(reaction: 'ARCReaction', + base_adapter: str = 'heuristics', + dihedral_increment: Optional[int] = None, + ) -> List[dict]: + """ + Return TS seed entries from a base TS-search adapter. + + Seed schema: + - ``xyz`` (dict): Cartesian coordinates. + - ``family`` (str): The family associated with this seed. + - ``method`` (str): Human-readable generator label. + - ``source_adapter`` (str): Adapter id that generated the seed. 
+ - ``metadata`` (dict, optional): Adapter-specific auxiliary fields. + + Args: + reaction: The ARC reaction object. + base_adapter: The underlying TS-search adapter providing seeds. + dihedral_increment: Optional scan increment used by adapters that support it. + """ + adapter = (base_adapter or '').lower() + if adapter != 'heuristics': + raise ValueError(f'Unsupported TS seed base adapter: {base_adapter}') + + # Lazily import to avoid circular imports with heuristics.py. + from arc.job.adapters.ts.heuristics import FAMILY_SETS, h_abstraction, hydrolysis + + xyz_entries = list() + if reaction.family == 'H_Abstraction': + xyzs = h_abstraction(reaction=reaction, dihedral_increment=dihedral_increment) + for entry in xyzs: + xyz = entry.get('xyz') if isinstance(entry, dict) else entry + method = entry.get('method', 'Heuristics') if isinstance(entry, dict) else 'Heuristics' + if xyz is not None: + xyz_entries.append({ + 'xyz': xyz, + 'method': method, + 'family': reaction.family, + 'source_adapter': 'heuristics', + 'metadata': {}, + }) + elif reaction.family in FAMILY_SETS['hydrolysis_set_1'] or reaction.family in FAMILY_SETS['hydrolysis_set_2']: + try: + xyzs_raw, families, indices = hydrolysis(reaction=reaction) + xyz_entries = [{ + 'xyz': xyz, + 'method': 'Heuristics', + 'family': family, + 'source_adapter': 'heuristics', + 'metadata': {'indices': idx}, + } for xyz, family, idx in zip(xyzs_raw, families, indices)] + except ValueError: + xyz_entries = list() + return xyz_entries + + +def get_wrapper_constraints(wrapper: str, + reaction: 'ARCReaction', + seed: dict, + ) -> Optional[dict]: + """ + Return wrapper-specific constraints for a TS seed. + + Args: + wrapper: Wrapper adapter id (e.g., ``crest``). + reaction: The ARC reaction object. + seed: A seed entry returned by :func:`get_ts_seeds`. 
+ """ + wrapper_name = (wrapper or '').lower() + if wrapper_name != 'crest': + raise ValueError(f'Unsupported wrapper adapter: {wrapper}') + return _get_crest_constraints(reaction=reaction, seed=seed) + + +def _get_crest_constraints(reaction: 'ARCReaction', seed: dict) -> Optional[Dict[str, int]]: + """ + Return CREST constraints for a seed. + + Currently, only H_Abstraction is supported. + """ + family = seed.get('family') or reaction.family + xyz = seed.get('xyz') + if family != 'H_Abstraction' or xyz is None: + return None + return _get_h_abs_atoms_from_xyz(xyz) + + +def _get_h_abs_atoms_from_xyz(xyz: dict) -> Optional[Dict[str, int]]: + """ + Determine H-abstraction atoms from a TS guess. + + Returns: + Optional[Dict[str, int]]: ``{'H': int, 'A': int, 'B': int}``, or ``None``. + """ + symbols = xyz.get('symbols') if isinstance(xyz, dict) else None + if not symbols: + return None + dmat = xyz_to_dmat(xyz) + if dmat is None: + return None + + closest_atoms = dict() + for i in range(len(symbols)): + nearest = sorted( + ((dmat[i][j], j) for j in range(len(symbols)) if j != i), + key=lambda x: x[0], + )[:2] + closest_atoms[i] = [idx for _, idx in nearest] + + hydrogen_indices = [i for i, symbol in enumerate(symbols) if symbol.startswith('H')] + condition_occurrences = list() + + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + is_heavy_present = any(not symbols[atom].startswith('H') for atom in atom_neighbors) + if_hydrogen_present = any(symbols[atom].startswith('H') and atom != hydrogen_index for atom in atom_neighbors) + + if is_heavy_present and if_hydrogen_present: + condition_occurrences.append({'H': hydrogen_index, 'A': atom_neighbors[0], 'B': atom_neighbors[1]}) + + if condition_occurrences: + if len(condition_occurrences) > 1: + occurrence_distances = list() + for occurrence in condition_occurrences: + h_atom = occurrence['H'] + a_atom = occurrence['A'] + b_atom = occurrence['B'] + 
occurrence_distances.append((occurrence, dmat[h_atom][a_atom] + dmat[h_atom][b_atom])) + best_occurrence = min(occurrence_distances, key=lambda x: x[1])[0] + return {'H': best_occurrence['H'], 'A': best_occurrence['A'], 'B': best_occurrence['B']} + single_occurrence = condition_occurrences[0] + return {'H': single_occurrence['H'], 'A': single_occurrence['A'], 'B': single_occurrence['B']} + + min_distance = float('inf') + selected_hydrogen = None + selected_heavy_atoms = None + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + heavy_atoms = [atom for atom in atom_neighbors if not symbols[atom].startswith('H')] + if len(heavy_atoms) < 2: + continue + distances = dmat[hydrogen_index][heavy_atoms[0]] + dmat[hydrogen_index][heavy_atoms[1]] + if distances < min_distance: + min_distance = distances + selected_hydrogen = hydrogen_index + selected_heavy_atoms = heavy_atoms + + if selected_hydrogen is not None and selected_heavy_atoms is not None: + return {'H': selected_hydrogen, 'A': selected_heavy_atoms[0], 'B': selected_heavy_atoms[1]} + + logger.warning('No valid hydrogen atom found for CREST H-abstraction atoms.') + return None + diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index b5a0d874e8..47f774fab5 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -10,7 +10,7 @@ import os import time -from typing import TYPE_CHECKING, Dict, List +from typing import TYPE_CHECKING, Dict, List, Optional from arc.common import get_logger from arc.imports import settings @@ -188,7 +188,30 @@ def register_pipe_run_from_dir(self, pipe_root: str) -> PipeRun: self.active_pipes[pipe.run_id] = pipe return pipe - def poll_pipes(self) -> None: + @staticmethod + def _is_scheduler_job_alive(pipe: PipeRun, + server_job_ids: Optional[List[str]], + ) -> bool: + """ + Check whether a pipe run's scheduler job is still in the cluster queue. 
+ + For PBS/Slurm array jobs the stored ``scheduler_job_id`` is the base + ID (e.g. ``'4018898[]'`` for PBS, ``'12345'`` for Slurm), while the + queue lists individual elements (``'4018898[0]'`` for PBS, + ``'12345_7'`` for Slurm). We match on the numeric prefix with both + ``[`` and ``_`` array separators so both formats are recognised. + + Returns True (optimistic) when *server_job_ids* is unavailable. + """ + if server_job_ids is None or pipe.scheduler_job_id is None: + return True # Cannot determine — assume alive. + base = pipe.scheduler_job_id.rstrip('[]') + return any(jid == base + or jid.startswith(base + '[') + or jid.startswith(base + '_') + for jid in server_job_ids) + + def poll_pipes(self, server_job_ids: Optional[List[str]] = None) -> None: """ Reconcile all active pipe runs. @@ -197,12 +220,19 @@ def poll_pipes(self) -> None: Tolerates up to 3 consecutive reconciliation failures per run before marking it as FAILED and removing it. + + Args: + server_job_ids: Job IDs currently present in the cluster queue + (from ``check_running_jobs_ids``). Used to detect when a + pipe's scheduler job has left the queue so that orphaned + tasks can be cleaned up immediately. 
""" max_consecutive_failures = 3 for run_id in list(self.active_pipes.keys()): pipe = self.active_pipes[run_id] + job_alive = self._is_scheduler_job_alive(pipe, server_job_ids) try: - counts = pipe.reconcile() + counts = pipe.reconcile(scheduler_job_alive=job_alive) except Exception: n_failures = self._pipe_poll_failures.get(run_id, 0) + 1 self._pipe_poll_failures[run_id] = n_failures diff --git a/arc/job/pipe/pipe_coordinator_test.py b/arc/job/pipe/pipe_coordinator_test.py index 2a963b37d5..56e456aebc 100644 --- a/arc/job/pipe/pipe_coordinator_test.py +++ b/arc/job/pipe/pipe_coordinator_test.py @@ -223,6 +223,80 @@ def test_poll_resets_failure_count_on_success(self): self.assertNotIn('run_flaky', self.coord._pipe_poll_failures) +class TestIsSchedulerJobAlive(unittest.TestCase): + """Tests for PipeCoordinator._is_scheduler_job_alive().""" + + def test_none_server_ids_returns_true(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345[]' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, None)) + + def test_none_scheduler_job_id_returns_true(self): + pipe = MagicMock() + pipe.scheduler_job_id = None + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345[0]'])) + + def test_pbs_array_element_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '4018898[]' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['4018898[2]', '9999'])) + + def test_pbs_array_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '4018898[]' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['9999', '5555'])) + + def test_non_array_job_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345', '9999'])) + + def test_non_array_job_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['9999'])) + + def test_empty_queue(self): + pipe = 
MagicMock() + pipe.scheduler_job_id = '12345[]' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, [])) + + def test_slurm_array_element_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345_7', '9999'])) + + def test_slurm_array_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['99999_7'])) + + +class TestPollPipesJobGone(unittest.TestCase): + """Test that poll_pipes passes server_job_ids to reconcile for orphan detection.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_gone_') + self.coord = PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_poll_with_server_job_ids_cleans_stuck_pipe(self): + """A pipe whose scheduler job left the queue gets cleaned up.""" + pipe = self.coord.submit_pipe_run('run_stuck', [_make_spec('t0')]) + pipe.scheduler_job_id = '99999[]' + now = time.time() + update_task_state(pipe.pipe_root, 't0', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 86400) + update_task_state(pipe.pipe_root, 't0', new_status=TaskState.RUNNING, started_at=now) + # Job 99999 is NOT in the queue — pass empty list. + self.coord.poll_pipes(server_job_ids=[]) + # The pipe should have been cleaned up (orphaned → failed_terminal → ingested). 
+ self.assertNotIn('run_stuck', self.coord.active_pipes) + + class TestIngestPipeResults(unittest.TestCase): """Tests for PipeCoordinator.ingest_pipe_results().""" diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 7093a9df80..be38fc6497 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -277,11 +277,17 @@ def submit_to_scheduler(self): ) return job_status, job_id - def reconcile(self) -> Dict[str, int]: + def reconcile(self, scheduler_job_alive: bool = True) -> Dict[str, int]: """ Poll all tasks, detect orphans, schedule retries, and check for completion. Does not regress an already-terminal run status. + Args: + scheduler_job_alive: Whether the scheduler (PBS/Slurm) job for this + pipe run is still present in the queue. When False, any + CLAIMED/RUNNING tasks are immediately orphaned (the workers + are gone) and unreachable PENDING tasks are failed terminally. + Returns: Dict[str, int]: Counts of tasks in each state. """ @@ -307,9 +313,11 @@ def reconcile(self) -> Dict[str, int]: except (FileNotFoundError, ValueError, KeyError): continue current = TaskState(state.status) + # Orphan detection: lease expired OR the scheduler job is gone. if current in (TaskState.CLAIMED, TaskState.RUNNING) \ - and state.lease_expires_at is not None \ - and now > state.lease_expires_at: + and (not scheduler_job_alive + or (state.lease_expires_at is not None + and now > state.lease_expires_at)): try: update_task_state(self.pipe_root, task_id, new_status=TaskState.ORPHANED, @@ -341,7 +349,9 @@ def reconcile(self) -> Dict[str, int]: try: # FAILED_ESS tasks are handled separately (ejected to Scheduler). # Only FAILED_RETRYABLE and ORPHANED reach here. - if state.attempt_index + 1 < state.max_attempts: + # Only retry if the scheduler job is alive (workers exist + # to claim the retried task); otherwise fail terminally. 
+ if state.attempt_index + 1 < state.max_attempts and scheduler_job_alive: update_task_state(self.pipe_root, task_id, new_status=TaskState.PENDING, attempt_index=state.attempt_index + 1, @@ -369,6 +379,30 @@ def reconcile(self) -> Dict[str, int]: # was killed, that is a manual intervention issue. self._needs_resubmission = False + # If the scheduler job is gone, any PENDING tasks will never be + # claimed. Mark them as terminally failed so the pipe can finish. + if not scheduler_job_alive and counts[TaskState.PENDING.value] > 0: + for task_id in task_ids: + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(self.pipe_root, task_id) + except (FileNotFoundError, ValueError, KeyError): + continue + if TaskState(state.status) != TaskState.PENDING: + continue + try: + update_task_state(self.pipe_root, task_id, + new_status=TaskState.FAILED_TERMINAL, + ended_at=now) + counts[TaskState.PENDING.value] -= 1 + counts[TaskState.FAILED_TERMINAL.value] += 1 + logger.info(f'Task {task_id}: no workers remain ' + f'(scheduler job gone). 
Marked FAILED_TERMINAL.') + except (ValueError, TimeoutError) as e: + logger.warning(f'Task {task_id}: failed to mark stranded pending task ' + f'during reconciliation: {e}') + terminal = (counts[TaskState.COMPLETED.value] + counts[TaskState.FAILED_ESS.value] + counts[TaskState.FAILED_TERMINAL.value] diff --git a/arc/job/pipe/pipe_run_test.py b/arc/job/pipe/pipe_run_test.py index dec018c449..8dd6b3a1fc 100644 --- a/arc/job/pipe/pipe_run_test.py +++ b/arc/job/pipe/pipe_run_test.py @@ -266,6 +266,50 @@ def test_terminal_run_not_regressed(self): run.reconcile() self.assertEqual(run.status, PipeRunState.COMPLETED) + def test_scheduler_job_gone_orphans_running_task(self): + """When the scheduler job is gone, RUNNING tasks are orphaned immediately.""" + tasks = [_make_spec('t0'), _make_spec('t1')] + run = PipeRun(project_directory=self.tmpdir, run_id='gone', + tasks=tasks, cluster_software='pbs', max_attempts=1) + run.stage() + now = time.time() + # t0 completed normally + self._complete_task(run.pipe_root, 't0') + # t1 is RUNNING with lease still valid (24h in the future) + update_task_state(run.pipe_root, 't1', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', claimed_at=now, + lease_expires_at=now + 86400) + update_task_state(run.pipe_root, 't1', new_status=TaskState.RUNNING, started_at=now) + + # With scheduler_job_alive=True (default), the task stays RUNNING. + counts = run.reconcile(scheduler_job_alive=True) + self.assertEqual(counts[TaskState.RUNNING.value], 1) + + # Reset run status so reconcile runs the main logic again. + run.status = PipeRunState.SUBMITTED + + # With scheduler_job_alive=False, RUNNING is immediately orphaned → FAILED_TERMINAL. 
+ counts = run.reconcile(scheduler_job_alive=False) + self.assertEqual(counts[TaskState.RUNNING.value], 0) + state = read_task_state(run.pipe_root, 't1') + self.assertEqual(state.status, 'FAILED_TERMINAL') + self.assertEqual(run.status, PipeRunState.COMPLETED_PARTIAL) + + def test_scheduler_job_gone_stranded_pending_failed(self): + """When the scheduler job is gone, unreachable PENDING tasks are FAILED_TERMINAL.""" + tasks = [_make_spec('t0'), _make_spec('t1')] + run = PipeRun(project_directory=self.tmpdir, run_id='stranded', + tasks=tasks, cluster_software='slurm', max_attempts=3) + run.stage() + # t0 completed + self._complete_task(run.pipe_root, 't0') + # t1 was never claimed — still PENDING (e.g., worker crashed before claiming) + + run.reconcile(scheduler_job_alive=False) + state = read_task_state(run.pipe_root, 't1') + self.assertEqual(state.status, 'FAILED_TERMINAL') + self.assertEqual(run.status, PipeRunState.COMPLETED_PARTIAL) + class TestPipeRunNoResubmission(unittest.TestCase): """Pipe runs must never flag resubmission — Q-state workers handle retried tasks.""" diff --git a/arc/job/pipe/pipe_state.py b/arc/job/pipe/pipe_state.py index 0de4a78808..ee6f4d372c 100644 --- a/arc/job/pipe/pipe_state.py +++ b/arc/job/pipe/pipe_state.py @@ -110,7 +110,7 @@ class PipeRunState(str, Enum): # Allowed transitions: maps each state to the set of states it may transition to. 
TASK_TRANSITIONS: Dict[TaskState, Tuple[TaskState, ...]] = { - TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED), + TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED, TaskState.FAILED_TERMINAL), TaskState.CLAIMED: (TaskState.RUNNING, TaskState.ORPHANED, TaskState.CANCELLED), TaskState.RUNNING: (TaskState.COMPLETED, TaskState.FAILED_RETRYABLE, TaskState.FAILED_ESS, TaskState.FAILED_TERMINAL, TaskState.ORPHANED, TaskState.CANCELLED), diff --git a/arc/job/trsh.py b/arc/job/trsh.py index 89131f9ae4..6c7e03b085 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -838,6 +838,7 @@ def trsh_ess_job(label: str, cpu_cores: int, ess_trsh_methods: list, is_h: bool = False, + is_monoatomic: bool = False, ) -> tuple: """ Troubleshoot issues related to the electronic structure software, such as convergence. @@ -856,6 +857,7 @@ def trsh_ess_job(label: str, cpu_cores (int): The total number of cpu cores requested for a job. ess_trsh_methods (list): The troubleshooting methods tried for this job. is_h (bool): Whether the species is a hydrogen atom (or its isotope). e.g., H, D, T. + is_monoatomic (bool): Whether the species is monoatomic (single atom). Todo: - Change server to one that has the same ESS if running out of disk space. @@ -1016,7 +1018,11 @@ def trsh_ess_job(label: str, couldnt_trsh = True elif 'orca' in software: - if 'Memory' in job_status['keywords']: + if 'dlpno' in level_of_theory.method and (is_monoatomic or is_h): + logger.error(f'DLPNO methods are incompatible with monoatomic species {label} in Orca ' + f'(no electron pairs to correlate). Cannot troubleshoot.') + couldnt_trsh = True + elif 'Memory' in job_status['keywords']: # Increase memory allocation. # job_status will be for example # `Error (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`. 
@@ -1067,9 +1073,6 @@ def trsh_ess_job(label: str, logger.info(f'Troubleshooting {job_type} job in {software} for {label} using {cpu_cores} cpu cores.') if 'cpu' not in ess_trsh_methods: ess_trsh_methods.append('cpu') - elif 'dlpno' in level_of_theory.method and is_h: - logger.error('DLPNO method is not supported for H atom (or its isotope D or T) in Orca.') - couldnt_trsh = True else: couldnt_trsh = True diff --git a/arc/scheduler.py b/arc/scheduler.py index 6f4d81f39b..61cab782cb 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -828,7 +828,7 @@ def schedule_jobs(self): # Poll active pipe runs (per-run failures are handled inside poll_pipes). if self.active_pipes: - self.pipe_coordinator.poll_pipes() + self.pipe_coordinator.poll_pipes(server_job_ids=self.server_job_ids) # Flush deferred pipe batches (SP, freq, IRC, conf_sp) after all # newly-ready work has been discovered and before the loop sleeps. @@ -1444,10 +1444,16 @@ def run_sp_job(self, level_of_theory='ccsd/cc-pvdz', job_type='sp') return - mol = self.species_dict[label].mol - if mol is not None and len(mol.atoms) == 1 and mol.atoms[0].element.symbol == 'H' and 'DLPNO' in level.method: - # Run only CCSD for an H atom instead of DLPNO-CCSD(T) / etc. - level = Level(repr='ccsd/vtz', software=level.software, args=level.args) + if self.species_dict[label].is_monoatomic() and 'dlpno' in level.method: + species = self.species_dict[label] + if species.number_of_atoms == 1 and species.mol.atoms[0].element.symbol in ('H', 'D', 'T'): + logger.warning(f'Using HF/{level.basis} for {label} (single electron, no correlation).') + level = Level(method='hf', basis=level.basis, software=level.software) + else: + canonical_method = level.method.replace('dlpno-', '') + logger.warning(f'DLPNO methods are incompatible with monoatomic species {label}. 
' + f'Using {canonical_method}/{level.basis} instead.') + level = Level(method=canonical_method, basis=level.basis, software=level.software) if self.job_types['sp']: if self.species_dict[label].multi_species: if self.output_multi_spc[self.species_dict[label].multi_species].get('sp', False): @@ -2763,7 +2769,35 @@ def switch_ts(self, label: str): logger.info(f'Switching a TS guess for {label}...') self.determine_most_likely_ts_conformer(label=label) # Look for a different TS guess. self.delete_all_species_jobs(label=label) # Delete other currently running jobs for this TS. - self.output[label]['geo'] = self.output[label]['freq'] = self.output[label]['sp'] = self.output[label]['composite'] = '' + # Clean up IRC species spawned from the invalidated TS guess. + irc_labels_str = self.species_dict[label].irc_label + if irc_labels_str: + for irc_label in irc_labels_str.split(): + if irc_label in self.running_jobs: + self.delete_all_species_jobs(irc_label) + del self.running_jobs[irc_label] + if irc_label in self.job_dict: + del self.job_dict[irc_label] + if irc_label in self.output: + del self.output[irc_label] + if irc_label in self.species_dict: + self.species_list = [spc for spc in self.species_list if spc.label != irc_label] + del self.species_dict[irc_label] + if irc_label in self.unique_species_labels: + self.unique_species_labels.remove(irc_label) + logger.info(f'Deleted IRC species {irc_label} from invalidated TS guess.') + self.species_dict[label].irc_label = None + # Reset job_types so the new guess's pipeline runs from scratch. + for job_type in self.output[label]['job_types']: + if job_type in ['rotors', 'bde']: + continue + self.output[label]['job_types'][job_type] = False + self.output[label]['convergence'] = None + # Discard any pending pipe jobs queued for the OLD guess geometry. 
+ self._pending_pipe_sp.discard(label) + self._pending_pipe_freq.discard(label) + self._pending_pipe_irc.discard((label, 'forward')) + self._pending_pipe_irc.discard((label, 'reverse')) freq_path = os.path.join(self.project_directory, 'output', 'rxns', label, 'geometry', 'freq.out') if os.path.isfile(freq_path): os.remove(freq_path) @@ -3580,6 +3614,7 @@ def troubleshoot_ess(self, server=job.server, job_status=job.job_status[1], is_h=is_h, + is_monoatomic=bool(self.species_dict[label].is_monoatomic()), job_type=job.job_type, num_heavy_atoms=self.species_dict[label].number_of_heavy_atoms, software=job.job_adapter, diff --git a/arc/scheduler_test.py b/arc/scheduler_test.py index 3216a9f254..933a589c9e 100644 --- a/arc/scheduler_test.py +++ b/arc/scheduler_test.py @@ -757,6 +757,79 @@ def test_add_label_to_unique_species_labels(self): self.assertEqual(unique_label, 'new_species_15_1') self.assertEqual(self.sched2.unique_species_labels, ['methylamine', 'C2H6', 'CtripCO', 'new_species_15', 'new_species_15_0', 'new_species_15_1']) + def test_switch_ts_cleanup(self): + """Test that switch_ts resets job_types, convergence, and cleans up IRC species.""" + sched = self.sched1 + ts_label = 'methylamine' + sched.output = dict() + sched.initialize_output_dict() + + # Simulate state after a TS guess completes: freq/sp/opt marked done. + sched.output[ts_label]['job_types']['opt'] = True + sched.output[ts_label]['job_types']['freq'] = True + sched.output[ts_label]['job_types']['sp'] = True + sched.output[ts_label]['convergence'] = True + + # Simulate IRC species spawned from the TS guess. 
+ irc_label_1 = 'IRC_methylamine_1' + irc_label_2 = 'IRC_methylamine_2' + irc_spc_1 = ARCSpecies(label=irc_label_1, smiles='CN', compute_thermo=False) + irc_spc_2 = ARCSpecies(label=irc_label_2, smiles='CN', compute_thermo=False) + sched.species_dict[ts_label].irc_label = f'{irc_label_1} {irc_label_2}' + sched.species_dict[irc_label_1] = irc_spc_1 + sched.species_dict[irc_label_2] = irc_spc_2 + sched.species_list.extend([irc_spc_1, irc_spc_2]) + sched.unique_species_labels.extend([irc_label_1, irc_label_2]) + sched.running_jobs[irc_label_1] = ['opt_a100'] + sched.running_jobs[irc_label_2] = ['opt_a101'] + sched.job_dict[irc_label_1] = {'opt': {}} + sched.job_dict[irc_label_2] = {'opt': {}} + sched.initialize_output_dict(label=irc_label_1) + sched.initialize_output_dict(label=irc_label_2) + + # Run the cleanup logic from switch_ts (without calling determine_most_likely_ts_conformer). + irc_labels_str = sched.species_dict[ts_label].irc_label + if irc_labels_str: + for irc_label in irc_labels_str.split(): + if irc_label in sched.running_jobs: + sched.running_jobs[irc_label] = list() + del sched.running_jobs[irc_label] + if irc_label in sched.job_dict: + del sched.job_dict[irc_label] + if irc_label in sched.output: + del sched.output[irc_label] + if irc_label in sched.species_dict: + sched.species_list = [s for s in sched.species_list if s.label != irc_label] + del sched.species_dict[irc_label] + if irc_label in sched.unique_species_labels: + sched.unique_species_labels.remove(irc_label) + sched.species_dict[ts_label].irc_label = None + + for job_type in sched.output[ts_label]['job_types']: + if job_type in ['rotors', 'bde']: + continue + sched.output[ts_label]['job_types'][job_type] = False + sched.output[ts_label]['convergence'] = None + + # Verify IRC species fully removed. 
+ self.assertNotIn(irc_label_1, sched.species_dict) + self.assertNotIn(irc_label_2, sched.species_dict) + self.assertNotIn(irc_label_1, sched.running_jobs) + self.assertNotIn(irc_label_2, sched.running_jobs) + self.assertNotIn(irc_label_1, sched.job_dict) + self.assertNotIn(irc_label_2, sched.job_dict) + self.assertNotIn(irc_label_1, sched.output) + self.assertNotIn(irc_label_2, sched.output) + self.assertNotIn(irc_label_1, sched.unique_species_labels) + self.assertNotIn(irc_label_2, sched.unique_species_labels) + self.assertIsNone(sched.species_dict[ts_label].irc_label) + + # Verify job_types reset and convergence cleared. + self.assertFalse(sched.output[ts_label]['job_types']['opt']) + self.assertFalse(sched.output[ts_label]['job_types']['freq']) + self.assertFalse(sched.output[ts_label]['job_types']['sp']) + self.assertIsNone(sched.output[ts_label]['convergence']) + @classmethod def tearDownClass(cls): """ diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index ad37233933..12417083a1 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -149,10 +149,11 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, logger.warning(f'Task {task_id}: could not transition to RUNNING ({e}), skipping.') return - spec = read_task_spec(pipe_root, task_id) - scratch_dir = tempfile.mkdtemp(prefix=f'pipe_{task_id}_') - result = _make_result_template(task_id, state.attempt_index, started_at) + scratch_dir = None try: + spec = read_task_spec(pipe_root, task_id) + scratch_dir = tempfile.mkdtemp(prefix=f'pipe_{task_id}_') + result = _make_result_template(task_id, state.attempt_index, started_at) _dispatch_execution(spec, scratch_dir) _copy_outputs(scratch_dir, attempt_dir) ended_at = time.time() @@ -201,18 +202,21 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, failure_class = type(e).__name__ ended_at = time.time() logger.error(f'Task {task_id} failed: {failure_class}: {e}') - _copy_outputs(scratch_dir, 
attempt_dir) + if scratch_dir: + _copy_outputs(scratch_dir, attempt_dir) + result = locals().get('result') or _make_result_template(task_id, state.attempt_index, started_at) result['ended_at'] = ended_at result['status'] = 'FAILED' result['failure_class'] = failure_class # Try to parse ESS error info even on exception path. is_deterministic_ess = False - ess_info = _parse_ess_error(attempt_dir, spec) - if ess_info: - result['parser_summary'] = ess_info - if ess_info['status'] != 'done' and _is_deterministic_ess_error(ess_info): - result['failure_class'] = 'ess_error' - is_deterministic_ess = True + if 'spec' in locals(): + ess_info = _parse_ess_error(attempt_dir, spec) + if ess_info: + result['parser_summary'] = ess_info + if ess_info['status'] != 'done' and _is_deterministic_ess_error(ess_info): + result['failure_class'] = 'ess_error' + is_deterministic_ess = True write_result_json(attempt_dir, result) if not _verify_ownership(pipe_root, task_id, worker_id, claim_token): return @@ -229,7 +233,8 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, logger.warning(f'Task {task_id}: could not mark failed ({e}). 
' f'Task may have been orphaned concurrently.') finally: - shutil.rmtree(scratch_dir, ignore_errors=True) + if scratch_dir: + shutil.rmtree(scratch_dir, ignore_errors=True) def _make_result_template(task_id: str, attempt_index: int, started_at: float) -> dict: diff --git a/arc/scripts/pipe_worker_test.py b/arc/scripts/pipe_worker_test.py index bb6a01cb8d..4ead0144d4 100644 --- a/arc/scripts/pipe_worker_test.py +++ b/arc/scripts/pipe_worker_test.py @@ -309,6 +309,18 @@ def test_unsupported_family_fails(self): final = read_task_state(self.tmpdir, 'bad_family') self.assertIn(final.status, ('FAILED_RETRYABLE', 'FAILED_TERMINAL')) + def test_scratch_creation_failure_marks_failed(self): + """If tempfile.mkdtemp fails (e.g., I/O error), the task is properly + marked FAILED_RETRYABLE instead of being left stuck in RUNNING.""" + spec = _make_h2o_spec('io_fail') + initialize_task(self.tmpdir, spec) + state, token = self._claim('io_fail') + with patch('arc.scripts.pipe_worker.tempfile.mkdtemp', + side_effect=OSError(5, 'Input/output error')): + run_task(self.tmpdir, 'io_fail', state, 'test-worker', token) + final = read_task_state(self.tmpdir, 'io_fail') + self.assertIn(final.status, ('FAILED_RETRYABLE', 'FAILED_TERMINAL')) + class TestWorkerLoop(unittest.TestCase): diff --git a/arc/settings/crest.py b/arc/settings/crest.py new file mode 100644 index 0000000000..ebd227fa53 --- /dev/null +++ b/arc/settings/crest.py @@ -0,0 +1,113 @@ +""" +Utilities for locating CREST executables and activation commands. +""" + +import os +import re +import shutil +import sys +from typing import Optional, Tuple + + +def parse_version(folder_name: str) -> Tuple[int, int, int]: + """ + Parse a version from a folder name. + + Supports patterns such as ``3.0.2``, ``v212``, ``2.1``, ``2``. 
+ """ + version_regex = re.compile(r"(?:v?(\d+)(?:\.(\d+))?(?:\.(\d+))?)", re.IGNORECASE) + match = version_regex.search(folder_name) + if not match: + return 0, 0, 0 + + major = int(match.group(1)) if match.group(1) else 0 + minor = int(match.group(2)) if match.group(2) else 0 + patch = int(match.group(3)) if match.group(3) else 0 + + # Example: v212 -> (2, 1, 2) + if major >= 100 and match.group(2) is None and match.group(3) is None: + s = str(major).rjust(3, "0") + major, minor, patch = int(s[0]), int(s[1]), int(s[2]) + + return major, minor, patch + + +def find_highest_version_in_directory(directory: str, name_contains: str) -> Optional[str]: + """ + Find the ``crest`` executable under the highest-version matching subdirectory. + """ + if not os.path.exists(directory): + return None + + highest_version_path = None + highest_version = () + for folder in os.listdir(directory): + file_path = os.path.join(directory, folder) + if name_contains.lower() in folder.lower() and os.path.isdir(file_path): + crest_path = os.path.join(file_path, "crest") + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + version = parse_version(folder) + if highest_version == () or version > highest_version: + highest_version = version + highest_version_path = crest_path + return highest_version_path + + +def find_crest_executable() -> Tuple[Optional[str], Optional[str]]: + """ + Return ``(crest_path, env_cmd)``. + + ``env_cmd`` is a shell snippet to activate the environment if needed, otherwise ``""``. 
+ """ + # Priority 1: standalone builds in a configurable directory (default: /Local/ce_dana) + standalone_dir = os.getenv("ARC_CREST_STANDALONE_DIR", "/Local/ce_dana") + crest_path = find_highest_version_in_directory(standalone_dir, "crest") + if crest_path and os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + return crest_path, "" + + # Priority 2: Conda/Mamba/Micromamba envs + home = os.path.expanduser("~") + potential_env_paths = [ + os.path.join(home, "anaconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniforge3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, ".conda", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "mambaforge", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "micromamba", "envs", "crest_env", "bin", "crest"), + ] + + current_env_bin = os.path.dirname(sys.executable) + potential_env_paths.insert(0, os.path.join(current_env_bin, "crest")) + + for crest_path in potential_env_paths: + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + env_marker = os.path.join("envs", "crest_env") + os.path.sep + env_root = crest_path.split(env_marker)[0] + if "micromamba" in crest_path: + env_cmd = ( + f"source {env_root}/etc/profile.d/micromamba.sh && " + f"micromamba activate crest_env" + ) + elif any(name in env_root for name in ("anaconda3", "miniconda3", "miniforge3", "mambaforge", ".conda")): + env_cmd = ( + f"source {env_root}/etc/profile.d/conda.sh && " + f"conda activate crest_env" + ) + else: + env_cmd = "" + return crest_path, env_cmd + + # Priority 3: PATH + crest_in_path = shutil.which("crest") + if crest_in_path: + return crest_in_path, "" + + return None, None + + +__all__ = [ + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", +] + diff --git a/arc/settings/crest_test.py b/arc/settings/crest_test.py new file mode 100644 index 0000000000..d7793604ed --- 
/dev/null +++ b/arc/settings/crest_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.settings.crest +""" + +import os +import stat +import tempfile +import unittest +from unittest.mock import patch + +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) + + +class TestCrestSettingsUtils(unittest.TestCase): + + def _make_executable(self, path: str): + with open(path, "w") as f: + f.write("#!/bin/bash\n") + st = os.stat(path) + os.chmod(path, st.st_mode | stat.S_IXUSR) + + def test_parse_version(self): + self.assertEqual(parse_version("crest-3.0.2"), (3, 0, 2)) + self.assertEqual(parse_version("v212"), (2, 1, 2)) + self.assertEqual(parse_version("version-2.1"), (2, 1, 0)) + self.assertEqual(parse_version("foo"), (0, 0, 0)) + + def test_find_highest_version_in_directory(self): + with tempfile.TemporaryDirectory() as td: + low = os.path.join(td, "crest-2.1") + high = os.path.join(td, "crest-3.0.2") + os.makedirs(low) + os.makedirs(high) + self._make_executable(os.path.join(low, "crest")) + self._make_executable(os.path.join(high, "crest")) + + found = find_highest_version_in_directory(td, "crest") + self.assertEqual(found, os.path.join(high, "crest")) + + def test_find_crest_executable_prefers_standalone(self): + with tempfile.TemporaryDirectory() as td: + standalone = os.path.join(td, "crest-3.0.2") + os.makedirs(standalone) + standalone_crest = os.path.join(standalone, "crest") + self._make_executable(standalone_crest) + + with patch.dict(os.environ, {"ARC_CREST_STANDALONE_DIR": td}, clear=False): + path, env_cmd = find_crest_executable() + self.assertEqual(path, standalone_crest) + self.assertEqual(env_cmd, "") + + def test_find_crest_executable_env_detection(self): + with tempfile.TemporaryDirectory() as td: + fake_home = os.path.join(td, "home") + os.makedirs(fake_home) + crest_path = os.path.join(fake_home, "miniforge3", "envs", "crest_env", "bin", "crest") 
+ os.makedirs(os.path.dirname(crest_path), exist_ok=True) + self._make_executable(crest_path) + + with patch("arc.settings.crest.os.path.expanduser", return_value=fake_home): + with patch("arc.settings.crest.sys.executable", os.path.join(td, "python")): + with patch("arc.settings.crest.shutil.which", return_value=None): + path, env_cmd = find_crest_executable() + self.assertEqual(path, crest_path) + self.assertIn("conda activate crest_env", env_cmd) + + +if __name__ == "__main__": + unittest.main() + diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 7203ef8a8f..bcaa978d80 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -9,6 +9,12 @@ import os import string import sys +import shutil +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) # Users should update the following server dictionary. # Instructions for RSA key generation can be found here: @@ -89,7 +95,7 @@ supported_ess = ['cfour', 'gaussian', 'mockter', 'molpro', 'orca', 'qchem', 'terachem', 'onedmin', 'xtb', 'torchani', 'openbabel'] # TS methods to try when appropriate for a reaction (other than user guesses which are always allowed): -ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm', 'orca_neb'] +ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm', 'orca_neb', 'crest'] # List here job types to execute by default default_job_types = {'conf_opt': True, # defaults to True if not specified @@ -452,3 +458,57 @@ def add_rmg_db_candidates(prefix: str) -> None: if path and os.path.isdir(path): RMG_DB_PATH = path break + +CREST_PATH, CREST_ENV_PATH = find_crest_executable() + +__all__ = [ + "servers", + "global_ess_settings", + "supported_ess", + "ts_adapters", + "default_job_types", + "levels_ess", + "check_status_command", + "submit_command", + "delete_command", + "list_available_nodes_command", + "submit_filenames", + "t_max_format", + "input_filenames", + "output_filenames", + 
"default_levels_of_theory", + "orca_default_options_dict", + "tani_default_options_dict", + "ob_default_settings", + "xtb_gsm_settings", + "valid_chars", + "rotor_scan_resolution", + "maximum_barrier", + "minimum_barrier", + "inconsistency_az", + "inconsistency_ab", + "max_rotor_trsh", + "preserve_params_in_scan", + "workers_coeff", + "default_job_settings", + "ARC_FAMILIES_PATH", + "home", + "TANI_PYTHON", + "OB_PYTHON", + "TS_GCN_PYTHON", + "AUTOTST_PYTHON", + "ARC_PYTHON", + "RMG_ENV_NAME", + "RMG_PYTHON", + "XTB", + "exported_rmg_path", + "exported_rmg_db_path", + "gw", + "find_executable", + "add_rmg_db_candidates", + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", + "CREST_PATH", + "CREST_ENV_PATH", +] diff --git a/arc/species/converter.py b/arc/species/converter.py index ce0d484541..9d19a4b13d 100644 --- a/arc/species/converter.py +++ b/arc/species/converter.py @@ -5,6 +5,7 @@ import math import numpy as np import os +import warnings from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union from ase import Atoms @@ -48,6 +49,103 @@ DIST_PRECISION = 0.01 # Angstrom ANGL_PRECISION = 0.1 # rad (for both bond angle and dihedral) +def reorder_xyz_string(xyz_str: str, + reverse_atoms: bool = False, + units: str = 'angstrom', + convert_to: str = 'angstrom', + project_directory: Optional[str] = None + ) -> str: + """ + Reorder an XYZ string between ``ATOM X Y Z`` and ``X Y Z ATOM`` with optional unit conversion. + + Args: + xyz_str (str): The string xyz format to be converted. + reverse_atoms (bool, optional): Whether to reverse the atoms and coordinates. + units (str, optional): Units of the input coordinates ('angstrom' or 'bohr'). + convert_to (str, optional): The units to convert to (either 'angstrom' or 'bohr'). + project_directory (str, optional): The path to the project directory. + + Raises: + ConverterError: If xyz_str is not a string or does not have four space-separated entries per non-empty line. 
+ + Returns: str + The converted string xyz format. + """ + if isinstance(xyz_str, tuple): + xyz_str = '\n'.join(xyz_str) + if isinstance(xyz_str, list): + xyz_str = '\n'.join(xyz_str) + if not isinstance(xyz_str, str): + raise ConverterError(f'Expected a string input, got {type(xyz_str)}') + if project_directory is not None: + file_path = os.path.join(project_directory, xyz_str) + if os.path.isfile(file_path): + with open(file_path, 'r') as f: + xyz_str = f.read() + + + if units.lower() == 'angstrom' and convert_to.lower() == 'angstrom': + conversion_factor = 1 + elif units.lower() == 'bohr' and convert_to.lower() == 'bohr': + conversion_factor = 1 + elif units.lower() == 'angstrom' and convert_to.lower() == 'bohr': + conversion_factor = constants.angstrom_to_bohr + elif units.lower() == 'bohr' and convert_to.lower() == 'angstrom': + conversion_factor = constants.bohr_to_angstrom + else: + raise ConverterError("Invalid target unit. Choose 'angstrom' or 'bohr'.") + + processed_lines = list() + # Split the string into lines + lxyz = xyz_str.strip().splitlines() + # Determine whether the atom label appears first or last in each line + first_line_tokens = lxyz[0].strip().split() + atom_first = not is_str_float(first_line_tokens[0]) + + for item in lxyz: + parts = item.strip().split() + + if len(parts) != 4: + raise ConverterError(f'xyz_str has an incorrect format, expected 4 elements in each line, ' + f'got "{item}" in:\n{xyz_str}') + if atom_first: + atom, x_str, y_str, z_str = parts + else: + x_str, y_str, z_str, atom = parts + + try: + x = float(x_str) * conversion_factor + y = float(y_str) * conversion_factor + z = float(z_str) * conversion_factor + + except ValueError as e: + raise ConverterError(f'Could not convert {x_str}, {y_str}, or {z_str} to floats.') from e + + if reverse_atoms and atom_first: + formatted_line = f'{x} {y} {z} {atom}' + elif reverse_atoms and not atom_first: + formatted_line = f'{atom} {x} {y} {z}' + elif not reverse_atoms and atom_first: + 
formatted_line = f'{atom} {x} {y} {z}' + elif not reverse_atoms and not atom_first: + formatted_line = f'{x} {y} {z} {atom}' + + processed_lines.append(formatted_line) + + return '\n'.join(processed_lines) + + +def str_to_str(*args, **kwargs) -> str: + """ + Backwards compatible wrapper for reorder_xyz_string. + """ + warnings.warn( + "str_to_str was renamed to reorder_xyz_string and will be removed in a future ARC release", + DeprecationWarning, + stacklevel=2, + ) + return reorder_xyz_string(*args, **kwargs) + def str_to_xyz(xyz_str: str, project_directory: Optional[str] = None, diff --git a/arc/species/converter_test.py b/arc/species/converter_test.py index aa881bdcac..f423c4d500 100644 --- a/arc/species/converter_test.py +++ b/arc/species/converter_test.py @@ -18,6 +18,7 @@ import arc.species.converter as converter from arc.common import ARC_PATH, ARC_TESTING_PATH, almost_equal_coords, almost_equal_coords_lists, almost_equal_lists +from arc.constants import angstrom_to_bohr from arc.exceptions import ConverterError from arc.molecule.molecule import Molecule from arc.species.perceive import perceive_molecule_from_xyz @@ -700,6 +701,37 @@ def test_str_to_xyz(self): xyz = converter.str_to_xyz(xyz_format) self.assertEqual(xyz, expected_xyz) + def test_reorder_xyz_string_atom_first(self): + """Test reordering atom-first XYZ strings with unit conversion""" + xyz_format = "C 0.0 1.0 2.0\nH -1.0 0.5 0.0" + converted = converter.reorder_xyz_string(xyz_str=xyz_format, reverse_atoms=True, convert_to="bohr") + converted_lines = converted.splitlines() + self.assertEqual(len(converted_lines), 2) + + x1, y1, z1, s1 = converted_lines[0].split() + self.assertEqual(s1, "C") + self.assertAlmostEqual(float(x1), 0.0) + self.assertAlmostEqual(float(y1), 1.0 * angstrom_to_bohr) + self.assertAlmostEqual(float(z1), 2.0 * angstrom_to_bohr) + + x2, y2, z2, s2 = converted_lines[1].split() + self.assertEqual(s2, "H") + self.assertAlmostEqual(float(x2), -1.0 * angstrom_to_bohr) + 
self.assertAlmostEqual(float(y2), 0.5 * angstrom_to_bohr) + self.assertAlmostEqual(float(z2), 0.0) + + def test_reorder_xyz_string_coordinate_first(self): + """Test reordering coordinate-first XYZ strings back to atom-last order with conversion""" + xyz_format = "0.0 0.0 0.0 N\n1.0 0.0 0.0 H" + converted = converter.reorder_xyz_string( + xyz_str=xyz_format, + reverse_atoms=False, + units="bohr", + convert_to="angstrom", + ) + expected = "0.0 0.0 0.0 N\n0.529177 0.0 0.0 H" + self.assertEqual(converted, expected) + def test_xyz_to_str(self): """Test converting an ARC xyz format to a string xyz format""" xyz_str1 = converter.xyz_to_str(xyz_dict=self.xyz1['dict']) diff --git a/arc/testing/server/pbs/timelimit/err.txt b/arc/testing/server/pbs/timelimit/err.txt deleted file mode 100644 index 17a55b3536..0000000000 --- a/arc/testing/server/pbs/timelimit/err.txt +++ /dev/null @@ -1,17 +0,0 @@ -=>> PBS: job killed: walltime 86415 exceeded limit 86400 -Error: software termination - rax fffffffffffffffc, rbx 00007ffc0d4f90d0, rcx ffffffffffffffff - rdx 0000000000000000, rsp 00007ffc0d4f9098, rbp 0000000000000001 - rsi 00007ffc0d4f90d0, rdi 0000000000038f1b, r8 00002b7af22a5700 - r9 0000000000000000, r10 0000000000000000, r11 0000000000000246 - r12 00007ffc0d4f90f0, r13 000000000000008f, r14 0000000000000000 - r15 00007ffc0d4fff40 -Error: software termination - rax 0000000000024fa8, rbx 00002ae812e9f2c0, rcx 0000000000035498 - rdx 00002ae8c4888bd0, rsp 00007ffde70fb680, rbp 00007ffde70fbf70 - rsi 00002ae8c48be068, rdi 00002ae8c48f3508, r8 00002ae8c49289b0 - r9 0000000000006a93, r10 0000000000006a95, r11 00002ae812ed4768 - r12 00002ae812f66508, r13 00002ae812f9b9b0, r14 0000000000006a92 - r15 00002ae81311f478 - --- traceback not available - --- traceback not available diff --git a/devtools/crest_environment.yml b/devtools/crest_environment.yml new file mode 100644 index 0000000000..2291e72d37 --- /dev/null +++ b/devtools/crest_environment.yml @@ -0,0 +1,6 @@ +name: crest_env 
+channels: + - conda-forge +dependencies: + - python>=3.7 + - crest=2.12 diff --git a/devtools/install_all.sh b/devtools/install_all.sh index c958fdd548..c9de207ef7 100644 --- a/devtools/install_all.sh +++ b/devtools/install_all.sh @@ -26,6 +26,8 @@ run_devtool () { bash "$DEVTOOLS_DIR/$1" "${@:2}"; } SKIP_CLEAN=false SKIP_EXT=false SKIP_ARC=false +SKIP_RMG=false +ARC_INSTALLED=false RMG_ARGS=() ARC_ARGS=() EXT_ARGS=() @@ -36,6 +38,7 @@ while [[ $# -gt 0 ]]; do --no-clean) SKIP_CLEAN=true ;; --no-ext) SKIP_EXT=true ;; --no-arc) SKIP_ARC=true ;; + --no-rmg) SKIP_RMG=true ;; --rmg-*) RMG_ARGS+=("--${1#--rmg-}") ;; --arc-*) ARC_ARGS+=("--${1#--arc-}") ;; --ext-*) EXT_ARGS+=("--${1#--ext-}") ;; @@ -44,6 +47,7 @@ while [[ $# -gt 0 ]]; do Usage: $0 [global-flags] [--rmg-xxx] [--arc-yyy] [--ext-zzz] --no-clean Skip micromamba/conda cache cleanup --no-ext Skip external tools (AutoTST, KinBot, …) + --no-rmg Skip RMG-Py entirely --rmg-path Forward '--path' to RMG installer --rmg-pip Forward '--pip' to RMG installer ... @@ -67,16 +71,15 @@ echo " EXT sub-flags : ${EXT_ARGS[*]:-(none)}" echo ">>> Beginning full ARC external repo installation…" pushd . >/dev/null -# 1) RMG -echo "=== Installing RMG ===" -run_devtool install_rmg.sh "${RMG_ARGS[@]}" - - - # 2) PyRDL - echo "=== Installing PyRDL ===" - bash devtools/install_pyrdl.sh +# 1) RMG (optional) +if [[ $SKIP_RMG == false ]]; then + echo "=== Installing RMG ===" + run_devtool install_rmg.sh "${RMG_ARGS[@]}" +else + echo "ℹ️ --no-rmg flag set. Skipping RMG installation." 
+fi -# 3) ARC itself (skip env creation in CI or if user requests it) +# 2) ARC itself (skip env creation in CI or if user requests it) if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then if [[ $SKIP_CLEAN == false ]]; then echo "=== Cleaning up old ARC build artifacts ===" @@ -88,10 +91,23 @@ if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then echo "=== Installing ARC ===" run_devtool install_arc.sh "${ARC_ARGS[@]}" + ARC_INSTALLED=true else + ARC_INSTALLED=false echo ":information_source: CI detected or --no-arc flag set. Skip cleaning ARC installation." fi +# 3) PyRDL (needs arc_env, but not ARC install) +if [[ "${CI:-false}" == "true" ]]; then + echo "=== Installing PyRDL (CI) ===" + bash devtools/install_pyrdl.sh +elif [[ $ARC_INSTALLED == true ]]; then + echo "=== Installing PyRDL ===" + bash devtools/install_pyrdl.sh +else + echo "ℹ️ Skipping PyRDL install because ARC installation was skipped." +fi + if [[ $SKIP_EXT == false ]]; then # map of friendly names → installer scripts declare -A EXT_INSTALLERS=( @@ -100,6 +116,7 @@ if [[ $SKIP_EXT == false ]]; then [KinBot]=install_kinbot.sh [OpenBabel]=install_ob.sh [xtb]=install_xtb.sh + [CREST]=install_crest.sh [Sella]=install_sella.sh [TorchANI]=install_torchani.sh ) diff --git a/devtools/install_autotst.sh b/devtools/install_autotst.sh index 5e3bc35288..e71e42d035 100644 --- a/devtools/install_autotst.sh +++ b/devtools/install_autotst.sh @@ -31,6 +31,8 @@ done # where "$(pwd)" is the path to the AutoTST repository. write_hook () { local env="$1" repo_path="$2" # repo_path="$(pwd)" in AutoTST + local repo_path_escaped + repo_path_escaped=$(printf '%q' "$repo_path") $COMMAND_PKG env list | awk '{print $1}' | grep -qx "$env" || return 0 # env prefix @@ -50,16 +52,37 @@ write_hook () { # --- activation -------------------------------------------------------- cat >"$act" <>"$act" <<'EOF' +# Remove RMG-Py from PATH/PYTHONPATH to avoid clashes while AutoTST is active. 
+if [[ -n "${RMG_PY_PATH:-}" ]]; then + export PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + export PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" +fi +EOF + fi + + cat >>"$act" <<'EOF' case ":\$PYTHONPATH:" in *":\$AUTOTST_ROOT:"*) ;; \ *) export PYTHONPATH="\$AUTOTST_ROOT:\${PYTHONPATH:-}" ;; esac EOF # --- de-activation ----------------------------------------------------- cat >"$deact" <<'EOF' -_strip () { local n=":$1:"; local s=":$2:"; echo "${s//$n/:}" | sed 's/^://;s/:$//'; } -export PYTHONPATH=$(_strip "$AUTOTST_ROOT" ":${PYTHONPATH:-}:") -unset AUTOTST_ROOT +export PATH="${AUTOTST_OLD_PATH:-$PATH}" +if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" +else + unset PYTHONPATH +fi +unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH EOF echo "🔗 AutoTST hook refreshed in $env" } @@ -115,12 +138,53 @@ fi if [[ $MODE == "path" ]]; then - AUTO_PATH_LINE="export PYTHONPATH=\"\$PYTHONPATH:$(pwd)\"" - if ! grep -Fqx "$AUTO_PATH_LINE" ~/.bashrc; then - echo "$AUTO_PATH_LINE" >> ~/.bashrc - echo "✔️ Added AutoTST path to ~/.bashrc" + HOOK_SENTINEL="# AutoTST path-mode hook" + if ! 
grep -Fqx "$HOOK_SENTINEL" ~/.bashrc; then + cat <<'EOF' >> ~/.bashrc +# AutoTST path-mode hook +_strip_path () { + local needle=":$1:" + local haystack=":$2:" + echo "${haystack//$needle/:}" | sed 's/^://;s/:$//' +} + +autotst_on () { + export AUTOTST_ROOT="__AUTOTST_PATH__" + export AUTOTST_OLD_PATH="$PATH" + export AUTOTST_OLD_PYTHONPATH="${PYTHONPATH:-}" + if [[ -n "${RMG_PY_PATH:-}" ]]; then + PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" + fi + + case ":$PYTHONPATH:" in *":$AUTOTST_ROOT:"*) ;; \ + *) PYTHONPATH="$AUTOTST_ROOT:${PYTHONPATH:-}" ;; esac + export PATH PYTHONPATH +} + +autotst_off () { + export PATH="${AUTOTST_OLD_PATH:-$PATH}" + if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" + else + unset PYTHONPATH + fi + unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH +} + +# Enable AutoTST by default in new shells and keep RMG-Py out of the way. +autotst_on +EOF + # replace placeholder with actual path (portable across GNU/BSD sed) + AUTOTST_ESCAPED_PATH="$(printf '%q' "$(pwd)" | sed 's#/#\\\\/#g')" + if sed --version >/dev/null 2>&1; then + sed -i "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + else + sed -i '' "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + fi + echo "✔️ Added AutoTST path-mode hook to ~/.bashrc" else - echo "ℹ️ AutoTST path already exists in ~/.bashrc" + echo "ℹ️ AutoTST path-mode hook already exists in ~/.bashrc" fi elif [[ $MODE == "conda" ]]; then write_hook tst_env "$(pwd)" diff --git a/devtools/install_crest.sh b/devtools/install_crest.sh new file mode 100644 index 0000000000..1086ec9db2 --- /dev/null +++ b/devtools/install_crest.sh @@ -0,0 +1,64 @@ +#!/bin/bash -l +set -eo pipefail + +if command -v micromamba &> /dev/null; then + echo "✔️ Micromamba is installed." + COMMAND_PKG=micromamba +elif command -v mamba &> /dev/null; then + echo "✔️ Mamba is installed." 
+  COMMAND_PKG=mamba
+elif command -v conda &> /dev/null; then
+  echo "✔️ Conda is installed."
+  COMMAND_PKG=conda
+else
+  echo "❌ Micromamba, Mamba, or Conda is required. Please install one."
+  exit 1
+fi
+
+if [ "$COMMAND_PKG" = "micromamba" ]; then
+  eval "$(micromamba shell hook --shell=bash)"
+else
+  BASE=$(conda info --base)
+  . "$BASE/etc/profile.d/conda.sh"
+fi
+
+ENV_FILE="devtools/crest_environment.yml"
+
+if [ ! -f "$ENV_FILE" ]; then
+  echo "❌ File not found: $ENV_FILE"
+  exit 1
+fi
+
+if $COMMAND_PKG env list | awk '{print $1}' | grep -qx crest_env; then
+  echo ">>> Updating existing crest_env..."
+  $COMMAND_PKG env update -n crest_env -f "$ENV_FILE" --prune
+else
+  echo ">>> Creating new crest_env..."
+  $COMMAND_PKG env create -n crest_env -f "$ENV_FILE" -y
+fi
+
+echo ">>> Checking CREST installation..."
+
+if [ "$COMMAND_PKG" = "micromamba" ]; then
+  CREST_RUNNER="micromamba run -n crest_env"
+  CREST_LISTER="micromamba list -n crest_env"
+else
+  CREST_RUNNER="conda run -n crest_env"
+  CREST_LISTER="conda list -n crest_env"
+fi
+
+if $CREST_RUNNER crest --version &> /dev/null; then
+  version_output=$($CREST_RUNNER crest --version 2>&1)
+  echo "$version_output"
+  installed_version=$(printf '%s' "$version_output" | tr '\n' ' ' | sed -n 's/.*Version[[:space:]][[:space:]]*\([0-9.][0-9.]*\).*/\1/p')
+  if [ "$installed_version" != "2.12" ]; then
+    echo "❌ CREST version mismatch (expected 2.12)."
+    exit 1
+  fi
+  echo "✔️ CREST 2.12 is successfully installed."
+else
+  echo "❌ CREST is not found in PATH. Please check the environment."
+  exit 1
+fi
+
+echo "✅ Done installing CREST (crest_env)." 
diff --git a/devtools/install_gcn.sh b/devtools/install_gcn.sh
index 8f83a2cda1..5273353d77 100644
--- a/devtools/install_gcn.sh
+++ b/devtools/install_gcn.sh
@@ -93,12 +93,12 @@ write_hook() { # env_name repo_path
     rm -f "$act" "$deact"
 
     # --- activation hook -----------------------------------------------------
-    cat <<'ACTHOOK' >"$act"
+    cat <<ACTHOOK >"$act"
 # TS-GCN hook – $(date +%F)
 export TSGCN_ROOT="$repo"
-case ":$PYTHONPATH:" in
-    *":$TSGCN_ROOT:") ;; \
-    *) export PYTHONPATH="$TSGCN_ROOT:\${PYTHONPATH:-}" ;;
+case ":\$PYTHONPATH:" in
+    *":\$TSGCN_ROOT:"*) ;; \
+    *) export PYTHONPATH="\$TSGCN_ROOT:\${PYTHONPATH:-}" ;;
 esac
 ACTHOOK
@@ -182,46 +182,43 @@ CORE_PKGS=(
 
 # ── inline env creation & unified PyTorch install --------------------------
 if $COMMAND_PKG env list | awk '{print $1}' | grep -qx ts_gcn; then
-    $COMMAND_PKG env update -n ts_gcn \
+    $COMMAND_PKG install -n ts_gcn \
         -c schrodinger -c conda-forge \
         --channel-priority flexible \
         "${CORE_PKGS[@]}" \
-        --prune -y
+        --yes
 else
-    $COMMAND_PKG env create -n ts_gcn \
+    $COMMAND_PKG create -n ts_gcn \
         -c schrodinger -c conda-forge \
         --channel-priority flexible \
         "${CORE_PKGS[@]}" \
-        -y
+        --yes
 fi
 
-    # 2) activate it - we set +u to avoid printing variable names
-    #    that are not set yet
-    set +u; $COMMAND_PKG activate ts_gcn; set -u
-
-    # 3) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index)
-    WHEEL=https://download.pytorch.org/whl/torch_stable.html
-    if [[ $CUDA_VERSION == cpu ]]; then
-pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL
-    else
-        pip install torch==1.7.1+${CUDA_VERSION} \
-            torchvision==0.8.2+${CUDA_VERSION} \
-            torchaudio==0.7.2+${CUDA_VERSION} \
-            -f $WHEEL
-    fi
-    # for PyG wheels use the official PyG index—with a real '+' in the URL
-    TORCH_VER=1.7.1
-    WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html"
-
-    # install ONLY the prebuilt binaries, never fall back to source
-    pip install torch-scatter -f "$WHEEL_URL" --only-binary torch-scatter
-    pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse
-    pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster
-    pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv
-
-    # finally the meta‐package (this one can install from PyPI)
-    pip install torch-geometric
-    echo "✅ ts_gcn environment ready"
+# 2) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index)
+PIP_RUN=("$COMMAND_PKG" run -n ts_gcn)
+WHEEL=https://download.pytorch.org/whl/torch_stable.html
+if [[ $CUDA_VERSION == cpu ]]; then
+    "${PIP_RUN[@]}" pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL
+else
+    "${PIP_RUN[@]}" pip install torch==1.7.1+${CUDA_VERSION} \
+        torchvision==0.8.2+${CUDA_VERSION} \
+        torchaudio==0.7.2+${CUDA_VERSION} \
+        -f $WHEEL
+fi
+# for PyG wheels use the official PyG index—with a real '+' in the URL
+TORCH_VER=1.7.1
+WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html"
+
+# install ONLY the prebuilt binaries, never fall back to source
+"${PIP_RUN[@]}" pip install torch-scatter -f "$WHEEL_URL" --only-binary torch-scatter
+"${PIP_RUN[@]}" pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse
+"${PIP_RUN[@]}" pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster
+"${PIP_RUN[@]}" pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv
+
+# finally the meta‐package (this one can install from PyPI)
+"${PIP_RUN[@]}" pip install torch-geometric
+echo "✅ ts_gcn environment ready"
 
 # ── write hooks into conda envs if required -------------------------------
 if [[ $MODE == conda ]]; then
diff --git a/devtools/install_pyrdl.sh b/devtools/install_pyrdl.sh
index 529d9d5dc3..edcb5ed9da 100644
--- a/devtools/install_pyrdl.sh
+++ b/devtools/install_pyrdl.sh
@@ -51,8 +51,8 @@ fi
 
 # Ensure CMake is installed in the environment
 if ! command -v cmake &> /dev/null; then
-    echo "Installing CMake..."
- "$COMMAND_PKG" install -y cmake + echo "Installing CMake into arc_env..." + "$COMMAND_PKG" install -n arc_env -c conda-forge -y cmake fi # Clone and build RingDecomposerLib diff --git a/devtools/install_torchani.sh b/devtools/install_torchani.sh index 5410e88658..992031d014 100644 --- a/devtools/install_torchani.sh +++ b/devtools/install_torchani.sh @@ -2,9 +2,10 @@ set -eo pipefail # Enable tracing of each command, but tee it to a logfile +LOGFILE="tani_env_setup.log" exec 3>&1 4>&2 trap 'exec 2>&4 1>&3' EXIT -exec 1> >(tee .log) 2>&1 +exec 1> >(tee "$LOGFILE") 2>&1 set -x echo ">>> Starting TANI environment setup at $(date)" @@ -53,7 +54,7 @@ fi echo ">>> Creating conda env from $ENV_YAML (name=$ENV_NAME)" if ! $COMMAND_PKG env create -n "$ENV_NAME" -f "$ENV_YAML" -v; then echo "❌ Environment creation failed. Dumping last 200 lines of log:" - tail -n 200 tani_env_setup.log + tail -n 200 "$LOGFILE" echo "---- Disk usage at failure ----" df -h . exit 1 diff --git a/docs/source/TS_search.rst b/docs/source/TS_search.rst index 73513c9afb..21c6315ddb 100644 --- a/docs/source/TS_search.rst +++ b/docs/source/TS_search.rst @@ -54,4 +54,90 @@ A detailed description of the methodology, design choices, and validation benchm L. Fahoum, A. Grinberg Dana, *“Automated Reaction Transition State Search for Neutral Hydrolysis Reactions”*, Digital Discovery, 2026. +CREST +^^^^^ + +CREST is an external conformational sampling tool used by ARC as a TS-search wrapper stage. +In ARC's current flow, CREST is applied to TS seeds generated by base TS search methods and uses +family-specific constraints from ARC. + +Current ARC family support for CREST: + +- ``H_Abstraction`` only (RMG family reference: + `H_Abstraction `_). 
+
+External references:
+
+- `CREST documentation <https://crest-lab.github.io/crest-docs/>`_
+- `CREST constrained sampling example <https://crest-lab.github.io/crest-docs/page/examples.html>`_
+
+Wrapper Extension Guide
+"""""""""""""""""""""""
+
+Use this guide when extending CREST-based TS workflows in ARC (for example, adding hydrolysis support to CREST,
+or allowing CREST to wrap a new TS seed source adapter).
+
+ARC uses a neutral wrapper hub API for TS seed generation and wrapper-specific constraints:
+
+- ``arc.job.adapters.ts.seed_hub.get_ts_seeds(...)``
+- ``arc.job.adapters.ts.seed_hub.get_wrapper_constraints(...)``
+
+Current status
+""""""""""""""
+
+- ``CrestAdapter`` requests seeds using ``base_adapter='heuristics'``.
+- ``CrestAdapter`` requests constraints using ``wrapper='crest'``.
+- CREST constraints are currently implemented for ``H_Abstraction`` only.
+- Hydrolysis seeds can be generated by heuristics, but CREST constraints for hydrolysis are not implemented yet.
+
+Seed schema contract
+""""""""""""""""""""
+
+``get_ts_seeds(...)`` returns a list of seed dictionaries with the following fields:
+
+- ``xyz``: Cartesian coordinates dictionary.
+- ``family``: Reaction family associated with the seed.
+- ``method``: Method label for provenance.
+- ``source_adapter``: TS-search adapter id that generated the seed.
+- ``metadata``: Optional adapter-specific metadata dictionary.
+
+Extension instructions: Add a new family to CREST
+"""""""""""""""""""""""""""""""""""""""""""""""""
+
+1. Update ``get_ts_seeds(...)`` logic in ``arc/job/adapters/ts/seed_hub.py`` only if the seed generation path changes.
+2. Add family-specific CREST constraints in ``_get_crest_constraints(...)`` (or the family helper it calls) in
+   ``arc/job/adapters/ts/seed_hub.py``.
+3. Add/update tests in ``arc/job/adapters/ts/heuristics_test.py`` (``TestHeuristicsHub``).
+4. Update ``ts_adapters_by_rmg_family`` mapping if CREST should be enabled for that family. 
+ +Extension instructions: Let CREST wrap a new TS seed adapter +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +1. Add a ``base_adapter`` branch in ``get_ts_seeds(...)``. +2. Ensure the returned seed objects satisfy the seed schema contract. +3. Reuse ``get_wrapper_constraints(wrapper='crest', ...)`` with those seeds. +4. Add tests for the new adapter branch and constraints compatibility. + +Minimal usage pattern +""""""""""""""""""""" + +.. code-block:: python + + from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints + + seeds = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=30, + ) + for seed in seeds: + crest_constraints = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=seed, + ) + if crest_constraints is None: + continue + # run CREST with crest_constraints["A"], crest_constraints["H"], crest_constraints["B"] + .. include:: links.txt diff --git a/functional/restart_test.py b/functional/restart_test.py index 35594910b8..5be2f476c2 100644 --- a/functional/restart_test.py +++ b/functional/restart_test.py @@ -12,7 +12,7 @@ from arc.molecule.molecule import Molecule -from arc.common import ARC_PATH, read_yaml_file +from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file from arc.main import ARC @@ -42,7 +42,7 @@ def test_restart_thermo(self): Test restarting ARC through the ARC class in main.py via the input_dict argument of the API Rather than through ARC.py. Check that all files are in place and the log file content. 
""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '1_restart_thermo') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '1_restart_thermo') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('arc_project_for_testing_delete_after_usage_restart_thermo') project_directory = os.path.join(ARC_PATH, 'Projects', project) @@ -139,7 +139,7 @@ def test_restart_thermo(self): def test_restart_rate_1(self): """Test restarting ARC and attaining a reaction rate coefficient""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '2_restart_rate') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '2_restart_rate') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_1') project_directory = os.path.join(ARC_PATH, 'Projects', project) @@ -164,7 +164,7 @@ def test_restart_rate_2(self): """Test restarting ARC and attaining a reaction rate coefficient""" project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_2') project_directory = os.path.join(ARC_PATH, 'Projects', project) - base_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '5_TS1') + base_path = os.path.join(ARC_TESTING_PATH, 'restart', '5_TS1') restart_path = os.path.join(base_path, 'restart.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['output']['TS0']['paths']['freq'] = os.path.join(ARC_PATH, input_dict['output']['TS0']['paths']['freq']) @@ -189,7 +189,7 @@ def test_restart_rate_2(self): def test_restart_bde (self): """Test restarting ARC and attaining a BDE for anilino_radical.""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '3_restart_bde') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '3_restart_bde') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('test_restart_bde') project_directory = 
os.path.join(ARC_PATH, 'Projects', project) @@ -208,7 +208,7 @@ def test_restart_bde (self): def test_globalize_paths(self): """Test modifying a YAML file's contents to correct absolute file paths""" - project_directory = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths') + project_directory = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths') restart_path = os.path.join(project_directory, 'restart_paths.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['project_directory'] = project_directory @@ -235,16 +235,16 @@ def tearDownClass(cls): project_directory = os.path.join(ARC_PATH, 'Projects', project) shutil.rmtree(project_directory, ignore_errors=True) - shutil.rmtree(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', + shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', 'log_and_restart_archive'), ignore_errors=True) for file_name in ['arc.log', 'restart_paths_globalized.yml']: - file_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', file_name) + file_path = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', file_name) if os.path.isfile(file_path): os.remove(file_path) file_paths = [os.path.join(ARC_PATH, 'functional', 'nul'), os.path.join(ARC_PATH, 'functional', 'run.out')] project_names = ['1_restart_thermo', '2_restart_rate', '3_restart_bde', '5_TS1'] for project_name in project_names: - file_paths.append(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', project_name, 'restart_globalized.yml')) + file_paths.append(os.path.join(ARC_TESTING_PATH, 'restart', project_name, 'restart_globalized.yml')) for file_path in file_paths: if os.path.isfile(file_path): os.remove(file_path)