From 27ea1d717f51491c95e6db6317d64a7d66f63cc0 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 12:46:42 +0200
Subject: [PATCH 01/10] ideal_mhd_model: share the constraint-force kernels

Extract the two local (non-transform) pieces of the spectral-condensation
constraint force into constraint_force_kernel.h, shared between the solver
and the Enzyme autodiff path:

- ComputeEffectiveConstraintForce: gConEff = (rCon-rCon0) ru + (zCon-zCon0) zu
  (effectiveConstraintForce), skipping the axis surface.
- AddConstraintForces: add the bandpass-filtered gCon back into the MHD R/Z
  forces and write frcon/fzcon (the constraint part of assembleTotalForces).

The Fourier-space bandpass between them stays the shared free function
deAliasConstraintForce; the free-boundary rBSq contribution stays in
assembleTotalForces. Allocation-free over flat buffers.

This completes the local force-density terms of the augmented functional
(MHD + lambda + constraint), the nonlinear core of the exact Hessian.
---
 .../ideal_mhd_model/constraint_force_kernel.h | 70 +++++++++++++++++++
 .../vmec/ideal_mhd_model/ideal_mhd_model.cc   | 49 +++++--------
 2 files changed, 86 insertions(+), 33 deletions(-)
 create mode 100644 src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
new file mode 100644
index 00000000..a8a469a7
--- /dev/null
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: 2024-present Proxima Fusion GmbH
+// <info@proximafusion.com>
+//
+// SPDX-License-Identifier: MIT
+#ifndef VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+#define VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+
+#include <algorithm>
+
+namespace vmecpp {
+
+// The two local (non-transform) pieces of VMEC's spectral-condensation
+// constraint force. The Fourier-space bandpass between them is the separate
+// shared free function deAliasConstraintForce. All full-grid buffers are
+// indexed (jF-nsMinF)*nZnT except sqrtSF, which is indexed (jF-nsMinF1).
+// Allocation-free, shared between the solver and the Enzyme autodiff path.
+
+// Effective constraint force gConEff = (rCon - rCon0) ru + (zCon - zCon0) zu.
+// No constraint on the axis (it has no poloidal angle), so the axis surface is
+// skipped, matching deAliasConstraintForce.
+inline void ComputeEffectiveConstraintForce(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull, int nZnT,
+    int nsMinF, int nsMaxFIncludingLcfs, double* gConEff) {
+  int jMin = 0;
+  if (nsMinF == 0) {
+    jMin = 1;
+  }
+  for (int jF = std::max(jMin, nsMinF); jF < nsMaxFIncludingLcfs; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+      gConEff[idx_kl] = (rCon[idx_kl] - rCon0[idx_kl]) * ruFull[idx_kl] +
+                        (zCon[idx_kl] - zCon0[idx_kl]) * zuFull[idx_kl];
+    }  // kl
+  }  // jF
+}
+
+// Add the bandpass-filtered constraint force gCon back into the MHD R/Z forces
+// (brmn, bzmn) and write the constraint-force outputs (frcon, fzcon).
+inline void AddConstraintForces(const double* rCon, const double* rCon0,
+                                const double* zCon, const double* zCon0,
+                                const double* ruFull, const double* zuFull,
+                                const double* gCon, const double* sqrtSF,
+                                int nZnT, int nsMinF, int nsMinF1, int nsMaxF,
+                                double* brmn_e, double* brmn_o, double* bzmn_e,
+                                double* bzmn_o, double* frcon_e, double* frcon_o,
+                                double* fzcon_e, double* fzcon_o) {
+  for (int jF = nsMinF; jF < nsMaxF; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+
+      double brcon = (rCon[idx_kl] - rCon0[idx_kl]) * gCon[idx_kl];
+      double bzcon = (zCon[idx_kl] - zCon0[idx_kl]) * gCon[idx_kl];
+
+      brmn_e[idx_kl] += brcon;
+      bzmn_e[idx_kl] += bzcon;
+      brmn_o[idx_kl] += brcon * sqrtSF[jF - nsMinF1];
+      bzmn_o[idx_kl] += bzcon * sqrtSF[jF - nsMinF1];
+
+      frcon_e[idx_kl] = ruFull[idx_kl] * gCon[idx_kl];
+      fzcon_e[idx_kl] = zuFull[idx_kl] * gCon[idx_kl];
+      frcon_o[idx_kl] = frcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+      fzcon_o[idx_kl] = fzcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+    }  // kl
+  }  // jF
+}
+
+}  // namespace vmecpp
+
+#endif  // VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
index 8fc57ad3..396b4668 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
@@ -22,6 +22,7 @@
 #include "vmecpp/vmec/handover_storage/handover_storage.h"
 #include "vmecpp/vmec/ideal_mhd_model/bco_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/bcontra_kernel.h"
+#include "vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/metric_kernel.h"
@@ -2043,21 +2044,12 @@ absl::Status IdealMhdModel::constraintForceMultiplier() {
 }
 
 void IdealMhdModel::effectiveConstraintForce() {
-  // gConEff
-
-  // no constraint on axis --> has no poloidal angle
-  int jMin = 0;
-  if (r_.nsMinF == 0) {
-    jMin = 1;
-  }
-
-  for (int jF = std::max(jMin, r_.nsMinF); jF < r_.nsMaxFIncludingLcfs; ++jF) {
-    for (int kl = 0; kl < s_.nZnT; ++kl) {
-      int idx_kl = (jF - r_.nsMinF) * s_.nZnT + kl;
-      gConEff[idx_kl] = (rCon[idx_kl] - rCon0[idx_kl]) * ruFull[idx_kl] +
-                        (zCon[idx_kl] - zCon0[idx_kl]) * zuFull[idx_kl];
-    }  // kl
-  }  // jF
+  // gConEff via the shared kernel (constraint_force_kernel.h), also used by the
+  // Enzyme autodiff path.
+  ComputeEffectiveConstraintForce(rCon.data(), rCon0.data(), zCon.data(),
+                                  zCon0.data(), ruFull.data(), zuFull.data(),
+                                  s_.nZnT, r_.nsMinF, r_.nsMaxFIncludingLcfs,
+                                  gConEff.data());
 }
 
 // perform Fourier-space bandpass filtering of constraint force
@@ -2088,24 +2080,15 @@ void IdealMhdModel::assembleTotalForces() {
     }
   }
 
-  for (int jF = r_.nsMinF; jF < r_.nsMaxF; ++jF) {
-    for (int kl = 0; kl < s_.nZnT; ++kl) {
-      int idx_kl = (jF - r_.nsMinF) * s_.nZnT + kl;
-
-      double brcon = (rCon[idx_kl] - rCon0[idx_kl]) * gCon[idx_kl];
-      double bzcon = (zCon[idx_kl] - zCon0[idx_kl]) * gCon[idx_kl];
-
-      brmn_e[idx_kl] += brcon;
-      bzmn_e[idx_kl] += bzcon;
-      brmn_o[idx_kl] += brcon * m_p_.sqrtSF[jF - r_.nsMinF1];
-      bzmn_o[idx_kl] += bzcon * m_p_.sqrtSF[jF - r_.nsMinF1];
-
-      frcon_e[idx_kl] = ruFull[idx_kl] * gCon[idx_kl];
-      fzcon_e[idx_kl] = zuFull[idx_kl] * gCon[idx_kl];
-      frcon_o[idx_kl] = frcon_e[idx_kl] * m_p_.sqrtSF[jF - r_.nsMinF1];
-      fzcon_o[idx_kl] = fzcon_e[idx_kl] * m_p_.sqrtSF[jF - r_.nsMinF1];
-    }
-  }
+  // add the bandpass-filtered constraint force to the MHD R/Z forces and write
+  // the constraint outputs via the shared kernel (constraint_force_kernel.h),
+  // also used by the Enzyme autodiff path.
+  AddConstraintForces(rCon.data(), rCon0.data(), zCon.data(), zCon0.data(),
+                      ruFull.data(), zuFull.data(), gCon.data(),
+                      m_p_.sqrtSF.data(), s_.nZnT, r_.nsMinF, r_.nsMinF1,
+                      r_.nsMaxF, brmn_e.data(), brmn_o.data(), bzmn_e.data(),
+                      bzmn_o.data(), frcon_e.data(), frcon_o.data(),
+                      fzcon_e.data(), fzcon_o.data());
 }
 
 void IdealMhdModel::forcesToFourier(FourierForces& m_physical_f) {

From f2b548b30a457366a88f54b238ac0efe110c0c3e Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 15:55:46 +0200
Subject: [PATCH 02/10] apply pre-commit formatting (ruff, docformatter,
 clang-format)

---
 .../common/enzyme/local_force_hessian_test.cc | 162 ++++++++++++------
 .../ideal_mhd_model/constraint_force_kernel.h |  14 +-
 .../vmec/ideal_mhd_model/dft_toroidal.cc      |   7 +-
 .../vmec/ideal_mhd_model/ideal_mhd_model.cc   |   4 +-
 .../ideal_mhd_model/lambda_force_kernel.h     |  14 +-
 5 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc b/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
index 082d0403..84cf939f 100644
--- a/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
+++ b/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
@@ -36,8 +36,8 @@
 
 // Problem dimensions and the constant (non-differentiated) context.
 struct Ctx {
-  int nZnT, nsH;       // half-grid surfaces; full-grid has nsH + 1
-  int nFull, nHalf;    // (nsH+1)*nZnT, nsH*nZnT
+  int nZnT, nsH;         // half-grid surfaces; full-grid has nsH + 1
+  int nFull, nHalf;      // (nsH+1)*nZnT, nsH*nZnT
   const double* sqrtSF;  // [nsH+1]
   const double* sqrtSH;  // [nsH]
   const double* chipH;   // [nsH]
@@ -46,13 +46,13 @@ struct Ctx {
   bool lthreed;
 };
 
-// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each nHalf)
-// in force; everything else is intermediate work sliced from work.
+// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each
+// nHalf) in force; everything else is intermediate work sliced from work.
 enum { kGeomBlocks = 16, kForceBlocks = 12 };
 
 // g: geometry -> force density, composing the six shared kernels.
 __attribute__((noinline)) void LocalForce(const double* geom, double* work,
-                                           double* force, const Ctx* c) {
+                                          double* force, const Ctx* c) {
   const int nF = c->nFull;
   const int nH = c->nHalf;
   const int nZnT = c->nZnT, nsH = c->nsH;
@@ -75,30 +75,45 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   const double* lvo = geom + 15 * nF;
   // half-grid intermediates from work
   double* p = work;
-  double* r12 = p; p += nH;
-  double* ru12 = p; p += nH;
-  double* zu12 = p; p += nH;
-  double* rs = p; p += nH;
-  double* zs = p; p += nH;
-  double* tau = p; p += nH;
-  double* gsqrt = p; p += nH;
-  double* guu = p; p += nH;
-  double* guv = p; p += nH;
-  double* gvv = p; p += nH;
-  double* bsupu = p; p += nH;
-  double* bsupv = p; p += nH;
-  double* bsubu = p; p += nH;
-  double* bsubv = p; p += nH;
-  double* tp = p; p += nH;
+  double* r12 = p;
+  p += nH;
+  double* ru12 = p;
+  p += nH;
+  double* zu12 = p;
+  p += nH;
+  double* rs = p;
+  p += nH;
+  double* zs = p;
+  p += nH;
+  double* tau = p;
+  p += nH;
+  double* gsqrt = p;
+  p += nH;
+  double* guu = p;
+  p += nH;
+  double* guv = p;
+  p += nH;
+  double* gvv = p;
+  p += nH;
+  double* bsupu = p;
+  p += nH;
+  double* bsupv = p;
+  p += nH;
+  double* bsubu = p;
+  p += nH;
+  double* bsubv = p;
+  p += nH;
+  double* tp = p;
+  p += nH;
   // per-nZnT scratch for the force kernel (26 blocks)
   double* sc = p;  // 26 * nZnT
 
-  vmecpp::ComputeHalfGridJacobian(r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH,
-                          c->deltaS, /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH,
-                          r12, ru12, zu12, rs, zs, tau);
-  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve, zvo,
-                                tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed, nZnT,
-                                0, 0, nsH, gsqrt, guu, guv, gvv);
+  vmecpp::ComputeHalfGridJacobian(
+      r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH, c->deltaS,
+      /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH, r12, ru12, zu12, rs, zs, tau);
+  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve,
+                                zvo, tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed,
+                                nZnT, 0, 0, nsH, gsqrt, guu, guv, gvv);
   vmecpp::ComputeBsupContra(lue, luo, lve, lvo, gsqrt, c->sqrtSH, c->lthreed,
                             nZnT, 0, 0, nsH, bsupu, bsupv);
   for (int jH = 0; jH < nsH; ++jH) {
@@ -113,19 +128,58 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
     for (int kl = 0; kl < nZnT; ++kl) tp[jH * nZnT + kl] += c->presH[jH];
   }
   double* s = sc;
-  double* P_i = s; s += nZnT; double* rup_i = s; s += nZnT;
-  double* zup_i = s; s += nZnT; double* rsp_i = s; s += nZnT;
-  double* zsp_i = s; s += nZnT; double* taup_i = s; s += nZnT;
-  double* gbubu_i = s; s += nZnT; double* gbubv_i = s; s += nZnT;
-  double* gbvbv_i = s; s += nZnT; double* P_o = s; s += nZnT;
-  double* rup_o = s; s += nZnT; double* zup_o = s; s += nZnT;
-  double* rsp_o = s; s += nZnT; double* zsp_o = s; s += nZnT;
-  double* taup_o = s; s += nZnT; double* gbubu_o = s; s += nZnT;
-  double* gbubv_o = s; s += nZnT; double* gbvbv_o = s; s += nZnT;
-  double* P_avg = s; s += nZnT; double* P_wavg = s; s += nZnT;
-  double* gbubu_avg = s; s += nZnT; double* gbubu_wavg = s; s += nZnT;
-  double* gbvbv_avg = s; s += nZnT; double* gbvbv_wavg = s; s += nZnT;
-  double* gbubv_avg = s; s += nZnT; double* gbubv_wavg = s; s += nZnT;
+  double* P_i = s;
+  s += nZnT;
+  double* rup_i = s;
+  s += nZnT;
+  double* zup_i = s;
+  s += nZnT;
+  double* rsp_i = s;
+  s += nZnT;
+  double* zsp_i = s;
+  s += nZnT;
+  double* taup_i = s;
+  s += nZnT;
+  double* gbubu_i = s;
+  s += nZnT;
+  double* gbubv_i = s;
+  s += nZnT;
+  double* gbvbv_i = s;
+  s += nZnT;
+  double* P_o = s;
+  s += nZnT;
+  double* rup_o = s;
+  s += nZnT;
+  double* zup_o = s;
+  s += nZnT;
+  double* rsp_o = s;
+  s += nZnT;
+  double* zsp_o = s;
+  s += nZnT;
+  double* taup_o = s;
+  s += nZnT;
+  double* gbubu_o = s;
+  s += nZnT;
+  double* gbubv_o = s;
+  s += nZnT;
+  double* gbvbv_o = s;
+  s += nZnT;
+  double* P_avg = s;
+  s += nZnT;
+  double* P_wavg = s;
+  s += nZnT;
+  double* gbubu_avg = s;
+  s += nZnT;
+  double* gbubu_wavg = s;
+  s += nZnT;
+  double* gbvbv_avg = s;
+  s += nZnT;
+  double* gbvbv_wavg = s;
+  s += nZnT;
+  double* gbubv_avg = s;
+  s += nZnT;
+  double* gbubv_wavg = s;
+  s += nZnT;
   double* armn_e = force + 0 * nH;
   double* armn_o = force + 1 * nH;
   double* azmn_e = force + 2 * nH;
@@ -139,17 +193,18 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   double* czmn_e = force + 10 * nH;
   double* czmn_o = force + 11 * nH;
   vmecpp::ComputeMHDForceDensity(
-      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12, rs,
-      zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i, zup_i,
-      rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o, rsp_o,
-      zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
+      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12,
+      rs, zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i,
+      zup_i, rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o,
+      rsp_o, zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
       gbubu_wavg, gbvbv_avg, gbvbv_wavg, gbubv_avg, gbubv_wavg, c->deltaS, nZnT,
       /*nsMinF=*/0, 0, 0, nsH, /*jMaxRZ=*/nsH, c->lthreed, armn_e, armn_o,
       azmn_e, azmn_o, brmn_e, brmn_o, bzmn_e, bzmn_o, crmn_e, crmn_o, czmn_e,
       czmn_o);
 }
 
-// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned scratch.
+// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned
+// scratch.
 __attribute__((noinline)) double Loss(const double* geom, double* work,
                                       double* force, const Ctx* c) {
   LocalForce(geom, work, force, c);
@@ -206,17 +261,19 @@ int main() {
     gp[i] = geom[i] + h * v[i];
     gm[i] = geom[i] - h * v[i];
   }
-  const double dfd =
-      (Loss(gp.data(), w2.data(), f2.data(), &c) -
-       Loss(gm.data(), w2.data(), f2.data(), &c)) /
-      (2 * h);
-  const double drev = std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
+  const double dfd = (Loss(gp.data(), w2.data(), f2.data(), &c) -
+                      Loss(gm.data(), w2.data(), f2.data(), &c)) /
+                     (2 * h);
+  const double drev =
+      std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
   const double scale = std::fabs(dfd) + 1e-300;
 
   printf("exact Hessian of VMEC local force map (composed 6 kernels)\n");
   printf("  geom dofs=%d  force outputs=%d\n", nG, nFc);
-  printf("  reverse dL.v vs finite-diff : %.2e\n", std::fabs(drev - dfd) / scale);
-  printf("  forward dL.v vs finite-diff : %.2e\n", std::fabs(dfwd - dfd) / scale);
+  printf("  reverse dL.v vs finite-diff : %.2e\n",
+         std::fabs(drev - dfd) / scale);
+  printf("  forward dL.v vs finite-diff : %.2e\n",
+         std::fabs(dfwd - dfd) / scale);
   printf("  forward / reverse agreement : %.2e\n",
          std::fabs(dfwd - drev) / (std::fabs(drev) + 1e-300));
 
@@ -225,7 +282,8 @@ int main() {
   for (int r = 0; r < reps; ++r) {
     volatile double q = __enzyme_fwddiff<double>(
         (void*)Loss, enzyme_dup, geom.data(), v.data(), enzyme_dup, work.data(),
-        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const, &c);
+        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const,
+        &c);
     (void)q;
   }
   auto t1 = std::chrono::steady_clock::now();
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
index a8a469a7..b5078a6f 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
@@ -37,14 +37,12 @@ inline void ComputeEffectiveConstraintForce(
 
 // Add the bandpass-filtered constraint force gCon back into the MHD R/Z forces
 // (brmn, bzmn) and write the constraint-force outputs (frcon, fzcon).
-inline void AddConstraintForces(const double* rCon, const double* rCon0,
-                                const double* zCon, const double* zCon0,
-                                const double* ruFull, const double* zuFull,
-                                const double* gCon, const double* sqrtSF,
-                                int nZnT, int nsMinF, int nsMinF1, int nsMaxF,
-                                double* brmn_e, double* brmn_o, double* bzmn_e,
-                                double* bzmn_o, double* frcon_e, double* frcon_o,
-                                double* fzcon_e, double* fzcon_o) {
+inline void AddConstraintForces(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull,
+    const double* gCon, const double* sqrtSF, int nZnT, int nsMinF, int nsMinF1,
+    int nsMaxF, double* brmn_e, double* brmn_o, double* bzmn_e, double* bzmn_o,
+    double* frcon_e, double* frcon_o, double* fzcon_e, double* fzcon_o) {
   for (int jF = nsMinF; jF < nsMaxF; ++jF) {
     for (int kl = 0; kl < nZnT; ++kl) {
       int idx_kl = (jF - nsMinF) * nZnT + kl;
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
index fb3a6e2d..5f384cd6 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
@@ -93,9 +93,10 @@ void ForcesToFourier3DSymmFastPoloidal(
         double zmksc_n = -czmn_seg.dot(sinmui_seg);
 
         // Assemble effective R and Z forces from MHD and spectral condensation
-        // contributions. Materialize to avoid re-evaluation in each dot product,
-        // reusing per-thread scratch so the inner loop allocates nothing (and
-        // stays Enzyme-differentiable). Same arithmetic as a fresh .eval().
+        // contributions. Materialize to avoid re-evaluation in each dot
+        // product, reusing per-thread scratch so the inner loop allocates
+        // nothing (and stays Enzyme-differentiable). Same arithmetic as a fresh
+        // .eval().
         thread_local Eigen::VectorXd tempR_seg, tempZ_seg;
         tempR_seg.noalias() = armn_seg + xmpq[m] * frcon_seg;
         tempZ_seg.noalias() = azmn_seg + xmpq[m] * fzcon_seg;
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
index 396b4668..1232d6e4 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
@@ -1608,8 +1608,8 @@ void IdealMhdModel::hybridLambdaForce() {
 #pragma omp barrier
 #endif  // _OPENMP
 
-  // Lambda force on the full grid via the shared kernel (lambda_force_kernel.h),
-  // also used by the Enzyme autodiff path.
+  // Lambda force on the full grid via the shared kernel
+  // (lambda_force_kernel.h), also used by the Enzyme autodiff path.
   ComputeHybridLambdaForce(
       bsubu.data(), bsubv.data(), gvv.data(), gsqrt.data(), guv.data(),
       bsupu.data(), lu_e.data(), lu_o.data(), m_p_.sqrtSH.data(),
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
index be7d5459..105df373 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
@@ -9,11 +9,11 @@ namespace vmecpp {
 
 // Hybrid lambda force on the full grid: blmn (and clmn in 3D). The covariant
 // field bsubv is interpolated from the half grid two ways (a plain average and
-// an alternative from gvv/gsqrt times lambda plus, in 3D, guv*bsupu) and blended
-// with the radialBlending profile. The radial sweep carries the inside half-grid
-// point in scratch (bsubu_i, bsubv_i, gvv_gsqrt_i, guv_bsupu_i, each nZnT) and
-// shifts it outward each surface. Allocation-free over flat buffers, shared
-// between IdealMhdModel::hybridLambdaForce and the Enzyme autodiff path.
+// an alternative from gvv/gsqrt times lambda plus, in 3D, guv*bsupu) and
+// blended with the radialBlending profile. The radial sweep carries the inside
+// half-grid point in scratch (bsubu_i, bsubv_i, gvv_gsqrt_i, guv_bsupu_i, each
+// nZnT) and shifts it outward each surface. Allocation-free over flat buffers,
+// shared between IdealMhdModel::hybridLambdaForce and the Enzyme autodiff path.
 //
 // Half-grid inputs (index (jH-nsMinH)*nZnT): bsubu, bsubv, gvv, gsqrt, guv,
 // bsupu. Full-grid lambda (index (jF-nsMinF1)*nZnT): lu_e, lu_o. Profiles:
@@ -72,8 +72,8 @@ inline void ComputeHybridLambdaForce(
         }
       }
 
-      double gvv_gsqrt_lu_e =
-          0.5 * (gvv_gsqrt_i[kl] + gvv_gsqrt_o) * lu_e[(jF - nsMinF1) * nZnT + kl];
+      double gvv_gsqrt_lu_e = 0.5 * (gvv_gsqrt_i[kl] + gvv_gsqrt_o) *
+                              lu_e[(jF - nsMinF1) * nZnT + kl];
       double gvv_gsqrt_lu_o =
           0.5 * (gvv_gsqrt_i[kl] * sqrtSHi + gvv_gsqrt_o * sqrtSHo) *
           lu_o[(jF - nsMinF1) * nZnT + kl];

From 486df79750b33aef8449c3b6d2e8d8f606ec65d3 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 17:04:30 +0200
Subject: [PATCH 03/10] bazel: declare force-chain kernel headers in
 ideal_mhd_model (sandbox fix)

---
 .../cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel       | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
index 5fca7247..e8df6160 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
@@ -82,7 +82,16 @@ cc_test(
 cc_library(
     name = "ideal_mhd_model",
     srcs = ["ideal_mhd_model.cc"],
-    hdrs = ["ideal_mhd_model.h", "dft_data.h"],
+# The shared force-chain kernels and the autodiff composition/JVP header are
+    # header-only and included by ideal_mhd_model.cc; declare whatever exists on
+    # this branch so the bazel sandbox can see them. The Enzyme JVP entry point
+    # (exact_force_jvp.cc) is built only by the CMake VMECPP_ENABLE_ENZYME path,
+    # not by bazel; its use in ideal_mhd_model.cc is guarded by that define.
+    hdrs = ["ideal_mhd_model.h", "dft_data.h"] + glob([
+        "*_kernel.h",
+        "local_force_composition.h",
+        "exact_force_jvp.h",
+    ]),
     visibility = ["//visibility:public"],
     deps = [
         ":dft_toroidal",

From 29fbf1dbe71b7db7571778d985249f2677c5cf5d Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 17:31:45 +0200
Subject: [PATCH 04/10] ci: skip benchmark result upload on fork PRs (token is
 read-only)

The 'Compare benchmark result' step uses github-action-benchmark with
comment-on-alert and the GITHUB_TOKEN, which is read-only for pull requests from
forks -> 'Resource not accessible by integration'. Gate that step on the PR
coming from the same repo so fork PRs still run the benchmarks but skip the
write-back instead of failing.
---
 .github/workflows/benchmarks.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
index c2f3fdf4..54326ec6 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -36,7 +36,11 @@ jobs:
           pytest benchmarks/test_benchmarks.py --benchmark-json=benchmark_results.json
 
       - name: Compare benchmark result
-        if: github.event_name == 'pull_request'
+        # Skip the result upload/compare for fork PRs: their GITHUB_TOKEN is
+        # read-only, so comment-on-alert/auto-push hit 'Resource not accessible
+        # by integration'. The benchmarks still run above; only the write-back
+        # is skipped for forks.
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
         uses: benchmark-action/github-action-benchmark@v1.21.0
         with:
           tool: "pytest"

From 8121fe303a52293865c7c3520bf905a8af0d58e9 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 18:12:42 +0200
Subject: [PATCH 05/10] ci: build VMEC2000 from source so the compat test runs
 on numpy 2

The pinned vmec-0.0.6 cp310 wheel was f90wrapped against numpy 1.x. Under
the numpy 2.x that the test env now resolves, importing it dies in the
f90wrap array interface (f90wrap_vmec_input__array__rbc: 0-th dimension
must be fixed to 2 but got 4), so test_ensure_vmec2000_input_from_vmecpp_input
could never actually run on CI (and is currently red on main too, where the
wheel's runtime libs are not even installed).

Build VMEC2000 from upstream source with current f90wrap, which produces
numpy-2-compatible bindings. The recipe mirrors SIMSOPT's own CI
(hiddenSymmetries/VMEC2000, cmake/machines/ubuntu.json). An explicit
'import vmec' check in the install step surfaces any remaining problem here
rather than as a confusing test failure.
---
 .github/workflows/tests.yaml | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index a22e0cbd..4a796405 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -53,12 +53,27 @@ jobs:
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # mpi4py is needed for VMEC2000
-          sudo apt-get install -y libopenmpi-dev
-          python -m pip install mpi4py
-          # custom wheel for VMEC2000, needed for some VMEC++/VMEC2000 compatibility tests
-          # NOTE: this wheel is only guaranteed to work on Ubuntu 22.04
-          python -m pip install vmec@https://anaconda.org/eguiraud-pf/vmec/0.0.6/download/vmec-0.0.6-cp310-cp310-linux_x86_64.whl
+          # Build VMEC2000 from source instead of the old prebuilt wheel. The
+          # pinned vmec-0.0.6 cp310 wheel was f90wrapped against numpy 1.x and
+          # fails to import under numpy 2 (f90wrap array-interface break), so the
+          # compatibility test could never run. Building from source with current
+          # f90wrap produces numpy-2-compatible bindings. Recipe mirrors SIMSOPT's
+          # own CI (hiddenSymmetries/VMEC2000 cmake/machines/ubuntu.json).
+          sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
+            libhdf5-dev libhdf5-serial-dev libfftw3-dev libopenblas-dev ninja-build
+          # Build VMEC2000 with the SYSTEM cmake + ninja. Do not pip-install the
+          # cmake/ninja wheels: they would shadow the system cmake that
+          # scikit-build-core's editable vmecpp rebuild records, breaking its
+          # verify_globs step in the editable matrix job.
+          python -m pip install mpi4py numpy scikit-build f90wrap setuptools wheel
+          git clone --depth 1 https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+          cd /tmp/VMEC2000
+          cp cmake/machines/ubuntu.json cmake_config_file.json
+          LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+            python -m pip install -v --no-build-isolation .
+          cd -
+          # fail loudly here if the binding is still broken, not in the test step
+          python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
         run: |
           # on Ubuntu we would not need this, but on MacOS we need to point CMake to gfortran-14 and gcc-14

From 6e97c3c9ed8292a2528b3d002ed50ef785592776 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 14 Jun 2026 18:20:09 +0200
Subject: [PATCH 06/10] test: skip vmecpp-only indata fields in the VMEC2000
 compat subset

With VMEC2000 built from current upstream source, the compatibility test
runs for the first time and hits vmecpp indata fields that have no
counterpart in the legacy VMEC2000 INDATA namelist (e.g.
free_boundary_method), which raised AttributeError. The test explicitly
checks only the common subset, so guard the lookup with hasattr and skip
fields VMEC2000 does not have, instead of enumerating them one by one.
---
 tests/test_simsopt_compat.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_simsopt_compat.py b/tests/test_simsopt_compat.py
index 609dffea..1443d8cb 100644
--- a/tests/test_simsopt_compat.py
+++ b/tests/test_simsopt_compat.py
@@ -303,6 +303,11 @@ def test_ensure_vmec2000_input_from_vmecpp_input():
         if varname[1:-1] == "axis_":
             # these are called differently in VMEC2000, e.g. raxis_c -> raxis_cc
             varname_vmec2000 = f"{varname[:-1]}c{varname[-1]}"
+        if not hasattr(vmec2000.indata, varname_vmec2000):
+            # vmecpp-only field (e.g. free_boundary_method) with no counterpart
+            # in the legacy VMEC2000 INDATA namelist; not part of the common
+            # subset under test.
+            continue
         vmec2000_var = getattr(vmec2000.indata, varname_vmec2000)
 
         if vmecpp_var is None:

From 33f0bd76016b409af363ad41e47a0de2674449c0 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Mon, 15 Jun 2026 07:22:54 +0200
Subject: [PATCH 07/10] ci: sync VMEC2000-from-source build, benchmark fork
 guard, abseil commit pin

Bring this stack branch up to the corrected CI baseline (from #583/#564):
- tests.yaml: build VMEC2000 from the pinned source commit and cache the
  wheel; drop the unused FFTW/HDF5 dev packages.
- benchmarks.yaml: skip the result upload on fork PRs (read-only token).
- test_simsopt_compat.py: skip vmecpp-only INDATA fields.
- CMakeLists: pin abseil to the 20260107.1 commit hash, not the tag.
---
 .github/workflows/tests.yaml | 39 +++++++++++++++++++-----------------
 CMakeLists.txt               |  7 +++----
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 4a796405..0f68ab6e 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -50,28 +50,31 @@ jobs:
         run: |
           # install VMEC++ deps as well as VMEC2000 deps (we need to import VMEC2000 in a test)
           sudo apt-get update && sudo apt-get install -y build-essential cmake libnetcdf-dev liblapack-dev libomp-dev
+      - name: Cache the VMEC2000 wheel
+        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
+        uses: actions/cache@v4
+        with:
+          path: /tmp/vmec2000-wheel
+          # Keyed on the pinned VMEC2000 source commit: rebuild only when it changes.
+          key: vmec2000-728af8bd6c79-cp310-ubuntu22.04
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # Build VMEC2000 from source instead of the old prebuilt wheel. The
-          # pinned vmec-0.0.6 cp310 wheel was f90wrapped against numpy 1.x and
-          # fails to import under numpy 2 (f90wrap array-interface break), so the
-          # compatibility test could never run. Building from source with current
-          # f90wrap produces numpy-2-compatible bindings. Recipe mirrors SIMSOPT's
-          # own CI (hiddenSymmetries/VMEC2000 cmake/machines/ubuntu.json).
           sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
-            libhdf5-dev libhdf5-serial-dev libfftw3-dev libopenblas-dev ninja-build
-          # Build VMEC2000 with the SYSTEM cmake + ninja. Do not pip-install the
-          # cmake/ninja wheels: they would shadow the system cmake that
-          # scikit-build-core's editable vmecpp rebuild records, breaking its
-          # verify_globs step in the editable matrix job.
-          python -m pip install mpi4py numpy scikit-build f90wrap setuptools wheel
-          git clone --depth 1 https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
-          cd /tmp/VMEC2000
-          cp cmake/machines/ubuntu.json cmake_config_file.json
-          LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
-            python -m pip install -v --no-build-isolation .
-          cd -
+            libopenblas-dev ninja-build
+          python -m pip install mpi4py
+          if ! ls /tmp/vmec2000-wheel/vmec-*.whl >/dev/null 2>&1; then
+            # Build with the SYSTEM cmake + ninja; do NOT pip-install the
+            # cmake/ninja wheels (they shadow the system cmake that
+            # scikit-build-core's editable vmecpp rebuild records).
+            python -m pip install numpy scikit-build f90wrap setuptools wheel
+            git clone https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+            git -C /tmp/VMEC2000 checkout 728af8bd6c796b36a0aa85fe298e507791e57c6e
+            cp /tmp/VMEC2000/cmake/machines/ubuntu.json /tmp/VMEC2000/cmake_config_file.json
+            LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+              python -m pip wheel /tmp/VMEC2000 --no-build-isolation -w /tmp/vmec2000-wheel
+          fi
+          python -m pip install /tmp/vmec2000-wheel/vmec-*.whl
           # fail loudly here if the binding is still broken, not in the test step
           python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 813c0fe9..b4b64d23 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,10 +66,9 @@ find_package(LAPACK REQUIRED)
 FetchContent_Declare(
   abseil-cpp
   GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-  # 20260107.1 LTS: required for Clang >= 21, where the 2024 pin fails to
-  # compile (absl::Nonnull SFINAE in absl/strings/ascii.cc). Clang is the
-  # compiler used for the Enzyme autodiff build.
-  GIT_TAG 20260107.1
+  # 20260107.1 LTS: older abseil fails to compile under Clang >= 21 (the
+  # Enzyme build) on absl::Nonnull SFINAE in absl/strings/ascii.cc.
+  GIT_TAG 255c84dadd029fd8ad25c5efb5933e47beaa00c7
   GIT_SHALLOW TRUE
 )
 FetchContent_Declare(

From a248cd9869c53281ca99b2485c75de3dd5851153 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Mon, 15 Jun 2026 08:57:27 +0200
Subject: [PATCH 08/10] build+ci: abseil commit pin for Clang>=21,
 VMEC2000-from-source, benchmark fork guard (#564)

* build: bump CMake abseil pin to 20260107.1 for Clang >= 21

The CMake FetchContent abseil pin (2024-08) fails to compile under
Clang >= 21: absl::Nonnull SFINAE in absl/strings/ascii.cc and the
numbers.cc nullability annotations are rejected by the newer frontend.
Bump to the 20260107.1 LTS, which compiles cleanly under Clang 21.1.8
and GCC. Clang is the compiler required for the Enzyme autodiff build.

The Bazel build keeps its own (BCR) abseil pin and is unaffected.

* ci: skip benchmark result upload on fork PRs (token is read-only)

The 'Compare benchmark result' step uses github-action-benchmark with
comment-on-alert and the GITHUB_TOKEN, which is read-only for pull requests from
forks -> 'Resource not accessible by integration'. Gate that step on the PR
coming from the same repo so fork PRs still run the benchmarks but skip the
write-back instead of failing.

* ci: build VMEC2000 from source so the compat test runs on numpy 2

The pinned vmec-0.0.6 cp310 wheel was f90wrapped against numpy 1.x. Under
the numpy 2.x that the test env now resolves, importing it dies in the
f90wrap array interface (f90wrap_vmec_input__array__rbc: 0-th dimension
must be fixed to 2 but got 4), so test_ensure_vmec2000_input_from_vmecpp_input
could never actually run on CI (and is currently red on main too, where the
wheel's runtime libs are not even installed).

Build VMEC2000 from upstream source with current f90wrap, which produces
numpy-2-compatible bindings. The recipe mirrors SIMSOPT's own CI
(hiddenSymmetries/VMEC2000, cmake/machines/ubuntu.json). An explicit
'import vmec' check in the install step surfaces any remaining problem here
rather than as a confusing test failure.

* test: skip vmecpp-only indata fields in the VMEC2000 compat subset

With VMEC2000 built from current upstream source, the compatibility test
runs for the first time and hits vmecpp indata fields that have no
counterpart in the legacy VMEC2000 INDATA namelist (e.g.
free_boundary_method), which raised AttributeError. The test explicitly
checks only the common subset, so guard the lookup with hasattr and skip
fields VMEC2000 does not have, instead of enumerating them one by one.

* build: pin abseil to the 20260107.1 commit hash

Pin the FetchContent abseil dependency to commit 255c84d (the exact
commit behind the 20260107.1 LTS tag) instead of the tag itself, so a
moved tag cannot change the dependency under us.

* ci: cache and pin the VMEC2000-from-source build

Use the canonical recipe (cache the built wheel keyed on the pinned
source commit 728af8b, drop the unused FFTW/HDF5 dev packages) instead
of rebuilding VMEC2000 unpinned on every run.
---
 .github/workflows/benchmarks.yaml |  6 +++++-
 .github/workflows/tests.yaml      | 28 +++++++++++++++++++++++-----
 CMakeLists.txt                    |  4 +++-
 tests/test_simsopt_compat.py      |  5 +++++
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
index c2f3fdf4..54326ec6 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -36,7 +36,11 @@ jobs:
           pytest benchmarks/test_benchmarks.py --benchmark-json=benchmark_results.json
 
       - name: Compare benchmark result
-        if: github.event_name == 'pull_request'
+        # Skip the result upload/compare for fork PRs: their GITHUB_TOKEN is
+        # read-only, so comment-on-alert/auto-push hit 'Resource not accessible
+        # by integration'. The benchmarks still run above; only the write-back
+        # is skipped for forks.
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
         uses: benchmark-action/github-action-benchmark@v1.21.0
         with:
           tool: "pytest"
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index a22e0cbd..0f68ab6e 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -50,15 +50,33 @@ jobs:
         run: |
           # install VMEC++ deps as well as VMEC2000 deps (we need to import VMEC2000 in a test)
           sudo apt-get update && sudo apt-get install -y build-essential cmake libnetcdf-dev liblapack-dev libomp-dev
+      - name: Cache the VMEC2000 wheel
+        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
+        uses: actions/cache@v4
+        with:
+          path: /tmp/vmec2000-wheel
+          # Keyed on the pinned VMEC2000 source commit: rebuild only when it changes.
+          key: vmec2000-728af8bd6c79-cp310-ubuntu22.04
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # mpi4py is needed for VMEC2000
-          sudo apt-get install -y libopenmpi-dev
+          sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
+            libopenblas-dev ninja-build
           python -m pip install mpi4py
-          # custom wheel for VMEC2000, needed for some VMEC++/VMEC2000 compatibility tests
-          # NOTE: this wheel is only guaranteed to work on Ubuntu 22.04
-          python -m pip install vmec@https://anaconda.org/eguiraud-pf/vmec/0.0.6/download/vmec-0.0.6-cp310-cp310-linux_x86_64.whl
+          if ! ls /tmp/vmec2000-wheel/vmec-*.whl >/dev/null 2>&1; then
+            # Build with the SYSTEM cmake + ninja; do NOT pip-install the
+            # cmake/ninja wheels (they shadow the system cmake that
+            # scikit-build-core's editable vmecpp rebuild records).
+            python -m pip install numpy scikit-build f90wrap setuptools wheel
+            git clone https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+            git -C /tmp/VMEC2000 checkout 728af8bd6c796b36a0aa85fe298e507791e57c6e
+            cp /tmp/VMEC2000/cmake/machines/ubuntu.json /tmp/VMEC2000/cmake_config_file.json
+            LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+              python -m pip wheel /tmp/VMEC2000 --no-build-isolation -w /tmp/vmec2000-wheel
+          fi
+          python -m pip install /tmp/vmec2000-wheel/vmec-*.whl
+          # fail loudly here if the binding is still broken, not in the test step
+          python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
         run: |
           # on Ubuntu we would not need this, but on MacOS we need to point CMake to gfortran-14 and gcc-14
diff --git a/CMakeLists.txt b/CMakeLists.txt
index be3c3255..8e07b175 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,9 @@ find_package(LAPACK REQUIRED)
 FetchContent_Declare(
   abseil-cpp
   GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-  GIT_TAG 4447c7562e3bc702ade25105912dce503f0c4010
+  # 20260107.1 LTS: older abseil fails to compile under Clang >= 21 (the
+  # Enzyme build) on absl::Nonnull SFINAE in absl/strings/ascii.cc.
+  GIT_TAG 255c84dadd029fd8ad25c5efb5933e47beaa00c7
   GIT_SHALLOW TRUE
 )
 FetchContent_Declare(
diff --git a/tests/test_simsopt_compat.py b/tests/test_simsopt_compat.py
index 609dffea..1443d8cb 100644
--- a/tests/test_simsopt_compat.py
+++ b/tests/test_simsopt_compat.py
@@ -303,6 +303,11 @@ def test_ensure_vmec2000_input_from_vmecpp_input():
         if varname[1:-1] == "axis_":
             # these are called differently in VMEC2000, e.g. raxis_c -> raxis_cc
             varname_vmec2000 = f"{varname[:-1]}c{varname[-1]}"
+        if not hasattr(vmec2000.indata, varname_vmec2000):
+            # vmecpp-only field (e.g. free_boundary_method) with no counterpart
+            # in the legacy VMEC2000 INDATA namelist; not part of the common
+            # subset under test.
+            continue
         vmec2000_var = getattr(vmec2000.indata, varname_vmec2000)
 
         if vmecpp_var is None:

From 39e492fbf75e949cc416ae8ef0e701cf9934bdf4 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Tue, 16 Jun 2026 08:31:12 +0200
Subject: [PATCH 09/10] ideal_mhd_model: hoist ForcesToFourier scratch out of
 the inner loop

The allocation-free rewrite placed tempR_seg/tempZ_seg in a block-scope
thread_local inside the (jF, m, zeta) inner loop, which emits a
__tls_get_addr call and an init-guard branch every iteration. Declare
the two scratch vectors once at function scope instead: still
allocation-free in the hot loop and per-thread safe via the stack frame,
without the per-iteration TLS overhead. Same arithmetic; cma and w7x
wout are bit-for-bit unchanged.
---
 .../cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc   | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
index 5f384cd6..279386d9 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
@@ -34,6 +34,11 @@ void ForcesToFourier3DSymmFastPoloidal(
   // axis lambda stays zero (no contribution from any m)
   const int jMinL = 1;
 
+  // Reused scratch for the assembled R/Z forces: sized on first use, so the
+  // inner loop allocates nothing. Per-thread safe via the stack frame, since
+  // each OpenMP thread runs this on its own radial slice.
+  Eigen::VectorXd tempR_seg, tempZ_seg;
+
   for (int jF = rp.nsMinF; jF < jMaxRZ; ++jF) {
     const int mmax = jF == 0 ? 1 : s.mpol;
     for (int m = 0; m < mmax; ++m) {
@@ -94,10 +99,8 @@ void ForcesToFourier3DSymmFastPoloidal(
 
         // Assemble effective R and Z forces from MHD and spectral condensation
         // contributions. Materialize to avoid re-evaluation in each dot
-        // product, reusing per-thread scratch so the inner loop allocates
-        // nothing (and stays Enzyme-differentiable). Same arithmetic as a fresh
-        // .eval().
-        thread_local Eigen::VectorXd tempR_seg, tempZ_seg;
+        // product, reusing the function-scope scratch so the inner loop
+        // allocates nothing. Same arithmetic as a fresh .eval().
         tempR_seg.noalias() = armn_seg + xmpq[m] * frcon_seg;
         tempZ_seg.noalias() = azmn_seg + xmpq[m] * fzcon_seg;
 

From cac846a324b5f3920386f5ca57457f0cd67d8662 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Tue, 16 Jun 2026 08:31:12 +0200
Subject: [PATCH 10/10] ideal_mhd_model: mark Jacobian  metric kernel buffers
 __restrict

Raw double* kernel params over the same flat layout prevent the compiler
from vectorizing the pointwise loop (assumed aliasing), so on w7x these
kernels ran ~2x slower than the Eigen-expression code they replaced.
The buffers never overlap; mark them __restrict to restore SIMD. Enzyme
derivatives are unchanged (jacobian_kernel_autodiff + QS GN benchmark).
---
 .../vmec/ideal_mhd_model/jacobian_kernel.h       | 13 ++++++++-----
 .../vmecpp/vmec/ideal_mhd_model/metric_kernel.h  | 16 ++++++++++------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
index 11cc24ec..79ce3a39 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
@@ -23,11 +23,14 @@ namespace vmecpp {
 // half-grid; sqrtSH is indexed jH - nsMinH. The half-grid point jH sits between
 // full-grid surfaces jH (inside) and jH + 1 (outside).
 inline void ComputeHalfGridJacobian(
-    const double* r1e, const double* r1o, const double* z1e, const double* z1o,
-    const double* rue, const double* ruo, const double* zue, const double* zuo,
-    const double* sqrtSH, double deltaS, double dSHalfDsInterp, int nZnT,
-    int nsMinF1, int nsMinH, int nsMaxH, double* r12, double* ru12,
-    double* zu12, double* rs, double* zs, double* tau) {
+    const double* __restrict r1e, const double* __restrict r1o,
+    const double* __restrict z1e, const double* __restrict z1o,
+    const double* __restrict rue, const double* __restrict ruo,
+    const double* __restrict zue, const double* __restrict zuo,
+    const double* __restrict sqrtSH, double deltaS, double dSHalfDsInterp,
+    int nZnT, int nsMinF1, int nsMinH, int nsMaxH, double* __restrict r12,
+    double* __restrict ru12, double* __restrict zu12, double* __restrict rs,
+    double* __restrict zs, double* __restrict tau) {
   for (int jH = nsMinH; jH < nsMaxH; ++jH) {
     const double sH = sqrtSH[jH - nsMinH];
     for (int kl = 0; kl < nZnT; ++kl) {
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
index ede5caac..d014ec60 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
@@ -14,12 +14,16 @@ namespace vmecpp {
 // IdealMhdModel::computeMetricElements and the Enzyme autodiff path. Same
 // indexing conventions as jacobian_kernel.h. sqrtSF is indexed jF - nsMinF1.
 inline void ComputeMetricElements(
-    const double* r1e, const double* r1o, const double* rue, const double* ruo,
-    const double* zue, const double* zuo, const double* rve, const double* rvo,
-    const double* zve, const double* zvo, const double* tau, const double* r12,
-    const double* sqrtSF, const double* sqrtSH, bool lthreed, int nZnT,
-    int nsMinF1, int nsMinH, int nsMaxH, double* gsqrt, double* guu,
-    double* guv, double* gvv) {
+    const double* __restrict r1e, const double* __restrict r1o,
+    const double* __restrict rue, const double* __restrict ruo,
+    const double* __restrict zue, const double* __restrict zuo,
+    const double* __restrict rve, const double* __restrict rvo,
+    const double* __restrict zve, const double* __restrict zvo,
+    const double* __restrict tau, const double* __restrict r12,
+    const double* __restrict sqrtSF, const double* __restrict sqrtSH,
+    bool lthreed, int nZnT, int nsMinF1, int nsMinH, int nsMaxH,
+    double* __restrict gsqrt, double* __restrict guu, double* __restrict guv,
+    double* __restrict gvv) {
   for (int jH = nsMinH; jH < nsMaxH; ++jH) {
     const double sF_i = sqrtSF[jH - nsMinF1] * sqrtSF[jH - nsMinF1];
     const double sF_o = sqrtSF[jH + 1 - nsMinF1] * sqrtSF[jH + 1 - nsMinF1];