From 971620440617c8e478c08520327007e45361344d Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:41:17 +0800
Subject: [PATCH 1/2] feat: support rxn39

---
 lambench/metrics/downstream_tasks_metrics.yml |   4 +
 lambench/metrics/post_process.py              |   1 +
 lambench/models/ase_models.py                 |  12 ++
 .../tasks/calculator/calculator_tasks.yml     |   3 +
 .../tasks/calculator/rxn_path39/rxn_path39.py | 105 ++++++++++++++++++
 5 files changed, 125 insertions(+)
 create mode 100644 lambench/tasks/calculator/rxn_path39/rxn_path39.py

diff --git a/lambench/metrics/downstream_tasks_metrics.yml b/lambench/metrics/downstream_tasks_metrics.yml
index caf8cab..eb9b361 100644
--- a/lambench/metrics/downstream_tasks_metrics.yml
+++ b/lambench/metrics/downstream_tasks_metrics.yml
@@ -32,6 +32,10 @@ rxn_barrier:
   domain: Molecules
   metrics: [MAE]
   dummy: {"MAE": 20.975}
+rxn_path39:
+  domain: Molecules
+  metrics: [MAE] # RMSE is returned by the task but is not used for calculating metrics
+  dummy: {"MAE": 34.109} # "RMSE": 43.150
 pressure:
   domain: Inorganic Materials
   metrics: [MAE]
diff --git a/lambench/metrics/post_process.py b/lambench/metrics/post_process.py
index 7a8e769..f72092e 100644
--- a/lambench/metrics/post_process.py
+++ b/lambench/metrics/post_process.py
@@ -120,6 +120,7 @@ def process_domain_specific_for_one_model(model: BaseLargeAtomModel):
             "vacancy",
             "binding_energy",
             "rxn_barrier",
+            "rxn_path39",
             "pressure",
             "stacking_fault",
             "interface",
diff --git a/lambench/models/ase_models.py b/lambench/models/ase_models.py
index e1f20b4..bcfdd39 100644
--- a/lambench/models/ase_models.py
+++ b/lambench/models/ase_models.py
@@ -280,6 +280,18 @@ def evaluate(
             elif task.task_name == "wiggle150":
                 from lambench.tasks.calculator.wiggle150.wiggle150 import run_inference
 
+                assert task.test_data is not None
+                return {
+                    "metrics": run_inference(
+                        self,
+                        task.test_data,
+                    )
+                }
+            elif task.task_name == "rxn_path39":
+                from lambench.tasks.calculator.rxn_path39.rxn_path39 import (
+                    run_inference,
+                )
+
                 assert task.test_data is not None
                 return {
                     "metrics": run_inference(
diff --git a/lambench/tasks/calculator/calculator_tasks.yml b/lambench/tasks/calculator/calculator_tasks.yml
index 24bcdb5..39272d9 100644
--- a/lambench/tasks/calculator/calculator_tasks.yml
+++ b/lambench/tasks/calculator/calculator_tasks.yml
@@ -23,6 +23,9 @@ neb:
 wiggle150:
   test_data: /bohr/lambench-wiggle150-yazy/v1/Wiggle150.traj
   calculator_params: null
+rxn_path39:
+  test_data: /bohr/lambench-rxn39-755z/v2/trajs
+  calculator_params: null
 elastic:
   test_data: /bohr/lambench-elastic-9qdt/v1/elastic.json
   calculator_params:
diff --git a/lambench/tasks/calculator/rxn_path39/rxn_path39.py b/lambench/tasks/calculator/rxn_path39/rxn_path39.py
new file mode 100644
index 0000000..6f94174
--- /dev/null
+++ b/lambench/tasks/calculator/rxn_path39/rxn_path39.py
@@ -0,0 +1,105 @@
+"""
+RXN-Path-39: 13 organic reactions (wB97M-V/def2-TZVPD), 3 path-sampling
+trajectories each, 11 arc-length-equidistant frames per trajectory.
+
+For each trajectory the first frame (index 0) is chosen as the reference.
+The task measures how accurately a LAM reproduces the relative energies of all
+other frames with respect to that reference, i.e.
+
+    ΔE_DFT(i)  = E_DFT(i)  − E_DFT(frame 0)   [kcal/mol]
+    ΔE_LAM(i)  = E_LAM(i)  − E_LAM(frame 0)   [kcal/mol]
+
+and reports MAE and RMSE over all 39 × 10 = 390 (trajectory, frame) pairs.
+"""
+
+from pathlib import Path
+import logging
+
+import numpy as np
+from ase.io import Trajectory
+from sklearn.metrics import mean_absolute_error, root_mean_squared_error
+
+from lambench.models.ase_models import ASEModel
+
+EV_TO_KCAL = 23.0609  # 1 eV = 23.0609 kcal/mol
+
+
+def run_inference(model: ASEModel, test_data: Path) -> dict[str, float]:
+    """
+    Parameters
+    ----------
+    model : ASEModel
+    test_data : Path
+        Root of the trajectory tree.  Expected layout::
+
+            test_data/
+              <reaction_id>/
+                traj_0.traj
+                traj_1.traj
+                traj_2.traj
+              ...
+
+    Returns
+    -------
+    dict with keys "MAE" and "RMSE" in kcal/mol (both NaN if no prediction succeeds).
+    """
+    calc = model.calc
+    label_diffs: list[float] = []
+    pred_diffs: list[float] = []
+
+    traj_files = sorted(test_data.rglob("traj_*.traj"))
+    if not traj_files:
+        raise FileNotFoundError(f"No traj_*.traj files found under {test_data}")
+
+    for traj_path in traj_files:
+        frames = list(Trajectory(traj_path))
+
+        # DFT reference energies (eV, stored by SinglePointCalculator)
+        dft_energies = np.array([a.get_potential_energy() for a in frames])
+        ref_dft_kcal = dft_energies[0] * EV_TO_KCAL
+
+        # LAM energy for the first frame (reference)
+        frames[0].calc = calc
+        try:
+            ref_pred_kcal = frames[0].get_potential_energy() * EV_TO_KCAL
+        except Exception as e:
+            logging.error(
+                f"Failed predicting reference frame (idx=0) in {traj_path}: {e}"
+            )
+            continue  # skip this trajectory entirely
+
+        # Relative energies for every non-reference frame
+        for i, atoms in enumerate(frames):
+            if i == 0:
+                continue
+
+            label_diffs.append(dft_energies[i] * EV_TO_KCAL - ref_dft_kcal)
+
+            atoms.calc = calc
+            try:
+                pred_kcal = atoms.get_potential_energy() * EV_TO_KCAL
+            except Exception as e:
+                logging.error(
+                    f"Failed predicting frame {i} of {traj_path}: {e}"
+                )
+                pred_kcal = np.nan
+            pred_diffs.append(pred_kcal - ref_pred_kcal)
+
+    label_arr = np.array(label_diffs)
+    pred_arr = np.array(pred_diffs)
+    valid = np.isfinite(pred_arr)
+
+    if not valid.any():
+        logging.error("All predictions failed; returning NaN metrics.")
+        return {"MAE": np.nan, "RMSE": np.nan}
+
+    if not valid.all():
+        n_failed = int((~valid).sum())
+        logging.warning(
+            f"{n_failed} frame(s) failed inference and were excluded from metrics."
+        )
+
+    return {
+        "MAE": float(mean_absolute_error(label_arr[valid], pred_arr[valid])),
+        "RMSE": float(root_mean_squared_error(label_arr[valid], pred_arr[valid])),
+    }

From 6cbf4b5d53b385b79b15e9920f5c4fa229ef4cf5 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:42:58 +0800
Subject: [PATCH 2/2] chore: lint

---
 lambench/tasks/calculator/rxn_path39/rxn_path39.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lambench/tasks/calculator/rxn_path39/rxn_path39.py b/lambench/tasks/calculator/rxn_path39/rxn_path39.py
index 6f94174..bba8834 100644
--- a/lambench/tasks/calculator/rxn_path39/rxn_path39.py
+++ b/lambench/tasks/calculator/rxn_path39/rxn_path39.py
@@ -79,9 +79,7 @@ def run_inference(model: ASEModel, test_data: Path) -> dict[str, float]:
             try:
                 pred_kcal = atoms.get_potential_energy() * EV_TO_KCAL
             except Exception as e:
-                logging.error(
-                    f"Failed predicting frame {i} of {traj_path}: {e}"
-                )
+                logging.error(f"Failed predicting frame {i} of {traj_path}: {e}")
                 pred_kcal = np.nan
             pred_diffs.append(pred_kcal - ref_pred_kcal)