itpplasma · krystophny · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
@@ -36,7 +36,11 @@ jobs:
           pytest benchmarks/test_benchmarks.py --benchmark-json=benchmark_results.json
 
       - name: Compare benchmark result
-        if: github.event_name == 'pull_request'
+        # Skip the result upload/compare for fork PRs: their GITHUB_TOKEN is
+        # read-only, so comment-on-alert/auto-push hit 'Resource not accessible
+        # by integration'. The benchmarks still run above; only the write-back
+        # is skipped for forks.
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
         uses: benchmark-action/github-action-benchmark@v1.21.0
         with:
           tool: "pytest"

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -50,15 +50,33 @@ jobs:
         run: |
           # install VMEC++ deps as well as VMEC2000 deps (we need to import VMEC2000 in a test)
           sudo apt-get update && sudo apt-get install -y build-essential cmake libnetcdf-dev liblapack-dev libomp-dev
+      - name: Cache the VMEC2000 wheel
+        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
+        uses: actions/cache@v4
+        with:
+          path: /tmp/vmec2000-wheel
+          # Keyed on the pinned VMEC2000 source commit: rebuild only when it changes.
+          key: vmec2000-728af8bd6c79-cp310-ubuntu22.04
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # mpi4py is needed for VMEC2000
-          sudo apt-get install -y libopenmpi-dev
+          sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
+            libopenblas-dev ninja-build
           python -m pip install mpi4py
-          # custom wheel for VMEC2000, needed for some VMEC++/VMEC2000 compatibility tests
-          # NOTE: this wheel is only guaranteed to work on Ubuntu 22.04
-          python -m pip install vmec@https://anaconda.org/eguiraud-pf/vmec/0.0.6/download/vmec-0.0.6-cp310-cp310-linux_x86_64.whl
+          if ! ls /tmp/vmec2000-wheel/vmec-*.whl >/dev/null 2>&1; then
+            # Build with the SYSTEM cmake + ninja; do NOT pip-install the
+            # cmake/ninja wheels (they shadow the system cmake that
+            # scikit-build-core's editable vmecpp rebuild records).
+            python -m pip install numpy scikit-build f90wrap setuptools wheel
+            git clone https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+            git -C /tmp/VMEC2000 checkout 728af8bd6c796b36a0aa85fe298e507791e57c6e
+            cp /tmp/VMEC2000/cmake/machines/ubuntu.json /tmp/VMEC2000/cmake_config_file.json
+            LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+              python -m pip wheel /tmp/VMEC2000 --no-build-isolation -w /tmp/vmec2000-wheel
+          fi
+          python -m pip install /tmp/vmec2000-wheel/vmec-*.whl
+          # fail loudly here if the binding is still broken, not in the test step
+          python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
         run: |
           # on Ubuntu we would not need this, but on MacOS we need to point CMake to gfortran-14 and gcc-14

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -66,10 +66,9 @@ find_package(LAPACK REQUIRED)
 FetchContent_Declare(
   abseil-cpp
   GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-  # 20260107.1 LTS: required for Clang >= 21, where the 2024 pin fails to
-  # compile (absl::Nonnull SFINAE in absl/strings/ascii.cc). Clang is the
-  # compiler used for the Enzyme autodiff build.
-  GIT_TAG 20260107.1
+  # 20260107.1 LTS: older abseil fails to compile under Clang >= 21 (the
+  # Enzyme build) on absl::Nonnull SFINAE in absl/strings/ascii.cc.
+  GIT_TAG 255c84dadd029fd8ad25c5efb5933e47beaa00c7
   GIT_SHALLOW TRUE
 )
 FetchContent_Declare(

diff --git a/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc b/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
@@ -36,8 +36,8 @@
 
 // Problem dimensions and the constant (non-differentiated) context.
 struct Ctx {
-  int nZnT, nsH;       // half-grid surfaces; full-grid has nsH + 1
-  int nFull, nHalf;    // (nsH+1)*nZnT, nsH*nZnT
+  int nZnT, nsH;         // half-grid surfaces; full-grid has nsH + 1
+  int nFull, nHalf;      // (nsH+1)*nZnT, nsH*nZnT
   const double* sqrtSF;  // [nsH+1]
   const double* sqrtSH;  // [nsH]
   const double* chipH;   // [nsH]
@@ -46,13 +46,13 @@ struct Ctx {
   bool lthreed;
 };
 
-// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each nHalf)
-// in force; everything else is intermediate work sliced from work.
+// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each
+// nHalf) in force; everything else is intermediate work sliced from work.
 enum { kGeomBlocks = 16, kForceBlocks = 12 };
 
 // g: geometry -> force density, composing the six shared kernels.
 __attribute__((noinline)) void LocalForce(const double* geom, double* work,
-                                           double* force, const Ctx* c) {
+                                          double* force, const Ctx* c) {
   const int nF = c->nFull;
   const int nH = c->nHalf;
   const int nZnT = c->nZnT, nsH = c->nsH;
@@ -75,30 +75,45 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   const double* lvo = geom + 15 * nF;
   // half-grid intermediates from work
   double* p = work;
-  double* r12 = p; p += nH;
-  double* ru12 = p; p += nH;
-  double* zu12 = p; p += nH;
-  double* rs = p; p += nH;
-  double* zs = p; p += nH;
-  double* tau = p; p += nH;
-  double* gsqrt = p; p += nH;
-  double* guu = p; p += nH;
-  double* guv = p; p += nH;
-  double* gvv = p; p += nH;
-  double* bsupu = p; p += nH;
-  double* bsupv = p; p += nH;
-  double* bsubu = p; p += nH;
-  double* bsubv = p; p += nH;
-  double* tp = p; p += nH;
+  double* r12 = p;
+  p += nH;
+  double* ru12 = p;
+  p += nH;
+  double* zu12 = p;
+  p += nH;
+  double* rs = p;
+  p += nH;
+  double* zs = p;
+  p += nH;
+  double* tau = p;
+  p += nH;
+  double* gsqrt = p;
+  p += nH;
+  double* guu = p;
+  p += nH;
+  double* guv = p;
+  p += nH;
+  double* gvv = p;
+  p += nH;
+  double* bsupu = p;
+  p += nH;
+  double* bsupv = p;
+  p += nH;
+  double* bsubu = p;
+  p += nH;
+  double* bsubv = p;
+  p += nH;
+  double* tp = p;
+  p += nH;
   // per-nZnT scratch for the force kernel (26 blocks)
   double* sc = p;  // 26 * nZnT
 
-  vmecpp::ComputeHalfGridJacobian(r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH,
-                          c->deltaS, /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH,
-                          r12, ru12, zu12, rs, zs, tau);
-  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve, zvo,
-                                tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed, nZnT,
-                                0, 0, nsH, gsqrt, guu, guv, gvv);
+  vmecpp::ComputeHalfGridJacobian(
+      r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH, c->deltaS,
+      /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH, r12, ru12, zu12, rs, zs, tau);
+  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve,
+                                zvo, tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed,
+                                nZnT, 0, 0, nsH, gsqrt, guu, guv, gvv);
   vmecpp::ComputeBsupContra(lue, luo, lve, lvo, gsqrt, c->sqrtSH, c->lthreed,
                             nZnT, 0, 0, nsH, bsupu, bsupv);
   for (int jH = 0; jH < nsH; ++jH) {
@@ -113,19 +128,58 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
     for (int kl = 0; kl < nZnT; ++kl) tp[jH * nZnT + kl] += c->presH[jH];
   }
   double* s = sc;
-  double* P_i = s; s += nZnT; double* rup_i = s; s += nZnT;
-  double* zup_i = s; s += nZnT; double* rsp_i = s; s += nZnT;
-  double* zsp_i = s; s += nZnT; double* taup_i = s; s += nZnT;
-  double* gbubu_i = s; s += nZnT; double* gbubv_i = s; s += nZnT;
-  double* gbvbv_i = s; s += nZnT; double* P_o = s; s += nZnT;
-  double* rup_o = s; s += nZnT; double* zup_o = s; s += nZnT;
-  double* rsp_o = s; s += nZnT; double* zsp_o = s; s += nZnT;
-  double* taup_o = s; s += nZnT; double* gbubu_o = s; s += nZnT;
-  double* gbubv_o = s; s += nZnT; double* gbvbv_o = s; s += nZnT;
-  double* P_avg = s; s += nZnT; double* P_wavg = s; s += nZnT;
-  double* gbubu_avg = s; s += nZnT; double* gbubu_wavg = s; s += nZnT;
-  double* gbvbv_avg = s; s += nZnT; double* gbvbv_wavg = s; s += nZnT;
-  double* gbubv_avg = s; s += nZnT; double* gbubv_wavg = s; s += nZnT;
+  double* P_i = s;
+  s += nZnT;
+  double* rup_i = s;
+  s += nZnT;
+  double* zup_i = s;
+  s += nZnT;
+  double* rsp_i = s;
+  s += nZnT;
+  double* zsp_i = s;
+  s += nZnT;
+  double* taup_i = s;
+  s += nZnT;
+  double* gbubu_i = s;
+  s += nZnT;
+  double* gbubv_i = s;
+  s += nZnT;
+  double* gbvbv_i = s;
+  s += nZnT;
+  double* P_o = s;
+  s += nZnT;
+  double* rup_o = s;
+  s += nZnT;
+  double* zup_o = s;
+  s += nZnT;
+  double* rsp_o = s;
+  s += nZnT;
+  double* zsp_o = s;
+  s += nZnT;
+  double* taup_o = s;
+  s += nZnT;
+  double* gbubu_o = s;
+  s += nZnT;
+  double* gbubv_o = s;
+  s += nZnT;
+  double* gbvbv_o = s;
+  s += nZnT;
+  double* P_avg = s;
+  s += nZnT;
+  double* P_wavg = s;
+  s += nZnT;
+  double* gbubu_avg = s;
+  s += nZnT;
+  double* gbubu_wavg = s;
+  s += nZnT;
+  double* gbvbv_avg = s;
+  s += nZnT;
+  double* gbvbv_wavg = s;
+  s += nZnT;
+  double* gbubv_avg = s;
+  s += nZnT;
+  double* gbubv_wavg = s;
+  s += nZnT;
   double* armn_e = force + 0 * nH;
   double* armn_o = force + 1 * nH;
   double* azmn_e = force + 2 * nH;
@@ -139,17 +193,18 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   double* czmn_e = force + 10 * nH;
   double* czmn_o = force + 11 * nH;
   vmecpp::ComputeMHDForceDensity(
-      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12, rs,
-      zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i, zup_i,
-      rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o, rsp_o,
-      zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
+      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12,
+      rs, zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i,
+      zup_i, rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o,
+      rsp_o, zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
       gbubu_wavg, gbvbv_avg, gbvbv_wavg, gbubv_avg, gbubv_wavg, c->deltaS, nZnT,
       /*nsMinF=*/0, 0, 0, nsH, /*jMaxRZ=*/nsH, c->lthreed, armn_e, armn_o,
       azmn_e, azmn_o, brmn_e, brmn_o, bzmn_e, bzmn_o, crmn_e, crmn_o, czmn_e,
       czmn_o);
 }
 
-// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned scratch.
+// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned
+// scratch.
 __attribute__((noinline)) double Loss(const double* geom, double* work,
                                       double* force, const Ctx* c) {
   LocalForce(geom, work, force, c);
@@ -206,17 +261,19 @@ int main() {
     gp[i] = geom[i] + h * v[i];
     gm[i] = geom[i] - h * v[i];
   }
-  const double dfd =
-      (Loss(gp.data(), w2.data(), f2.data(), &c) -
-       Loss(gm.data(), w2.data(), f2.data(), &c)) /
-      (2 * h);
-  const double drev = std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
+  const double dfd = (Loss(gp.data(), w2.data(), f2.data(), &c) -
+                      Loss(gm.data(), w2.data(), f2.data(), &c)) /
+                     (2 * h);
+  const double drev =
+      std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
   const double scale = std::fabs(dfd) + 1e-300;
 
   printf("exact Hessian of VMEC local force map (composed 6 kernels)\n");
   printf("  geom dofs=%d  force outputs=%d\n", nG, nFc);
-  printf("  reverse dL.v vs finite-diff : %.2e\n", std::fabs(drev - dfd) / scale);
-  printf("  forward dL.v vs finite-diff : %.2e\n", std::fabs(dfwd - dfd) / scale);
+  printf("  reverse dL.v vs finite-diff : %.2e\n",
+         std::fabs(drev - dfd) / scale);
+  printf("  forward dL.v vs finite-diff : %.2e\n",
+         std::fabs(dfwd - dfd) / scale);
   printf("  forward / reverse agreement : %.2e\n",
          std::fabs(dfwd - drev) / (std::fabs(drev) + 1e-300));
 
@@ -225,7 +282,8 @@ int main() {
   for (int r = 0; r < reps; ++r) {
     volatile double q = __enzyme_fwddiff<double>(
         (void*)Loss, enzyme_dup, geom.data(), v.data(), enzyme_dup, work.data(),
-        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const, &c);
+        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const,
+        &c);
     (void)q;
   }
   auto t1 = std::chrono::steady_clock::now();

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
@@ -82,7 +82,16 @@ cc_test(
 cc_library(
     name = "ideal_mhd_model",
     srcs = ["ideal_mhd_model.cc"],
-    hdrs = ["ideal_mhd_model.h", "dft_data.h"],
+# The shared force-chain kernels and the autodiff composition/JVP header are
+    # header-only and included by ideal_mhd_model.cc; declare whatever exists on
+    # this branch so the bazel sandbox can see them. The Enzyme JVP entry point
+    # (exact_force_jvp.cc) is built only by the CMake VMECPP_ENABLE_ENZYME path,
+    # not by bazel; its use in ideal_mhd_model.cc is guarded by that define.
+    hdrs = ["ideal_mhd_model.h", "dft_data.h"] + glob([
+        "*_kernel.h",
+        "local_force_composition.h",
+        "exact_force_jvp.h",
+    ]),
     visibility = ["//visibility:public"],
     deps = [
         ":dft_toroidal",

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: 2024-present Proxima Fusion GmbH
+// <info@proximafusion.com>
+//
+// SPDX-License-Identifier: MIT
+#ifndef VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+#define VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+
+#include <algorithm>
+
+namespace vmecpp {
+
+// The two local (non-transform) pieces of VMEC's spectral-condensation
+// constraint force. The Fourier-space bandpass between them is the separate
+// shared free function deAliasConstraintForce. All full-grid buffers are
+// indexed (jF-nsMinF)*nZnT except sqrtSF, which is indexed (jF-nsMinF1).
+// Allocation-free, shared between the solver and the Enzyme autodiff path.
+
+// Effective constraint force gConEff = (rCon - rCon0) ru + (zCon - zCon0) zu.
+// No constraint on the axis (it has no poloidal angle), so the axis surface is
+// skipped, matching deAliasConstraintForce.
+inline void ComputeEffectiveConstraintForce(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull, int nZnT,
+    int nsMinF, int nsMaxFIncludingLcfs, double* gConEff) {
+  int jMin = 0;
+  if (nsMinF == 0) {
+    jMin = 1;
+  }
+  for (int jF = std::max(jMin, nsMinF); jF < nsMaxFIncludingLcfs; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+      gConEff[idx_kl] = (rCon[idx_kl] - rCon0[idx_kl]) * ruFull[idx_kl] +
+                        (zCon[idx_kl] - zCon0[idx_kl]) * zuFull[idx_kl];
+    }  // kl
+  }  // jF
+}
+
+// Add the bandpass-filtered constraint force gCon back into the MHD R/Z forces
+// (brmn, bzmn) and write the constraint-force outputs (frcon, fzcon).
+inline void AddConstraintForces(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull,
+    const double* gCon, const double* sqrtSF, int nZnT, int nsMinF, int nsMinF1,
+    int nsMaxF, double* brmn_e, double* brmn_o, double* bzmn_e, double* bzmn_o,
+    double* frcon_e, double* frcon_o, double* fzcon_e, double* fzcon_o) {
+  for (int jF = nsMinF; jF < nsMaxF; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+
+      double brcon = (rCon[idx_kl] - rCon0[idx_kl]) * gCon[idx_kl];
+      double bzcon = (zCon[idx_kl] - zCon0[idx_kl]) * gCon[idx_kl];
+
+      brmn_e[idx_kl] += brcon;
+      bzmn_e[idx_kl] += bzcon;
+      brmn_o[idx_kl] += brcon * sqrtSF[jF - nsMinF1];
+      bzmn_o[idx_kl] += bzcon * sqrtSF[jF - nsMinF1];
+
+      frcon_e[idx_kl] = ruFull[idx_kl] * gCon[idx_kl];
+      fzcon_e[idx_kl] = zuFull[idx_kl] * gCon[idx_kl];
+      frcon_o[idx_kl] = frcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+      fzcon_o[idx_kl] = fzcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+    }  // kl
+  }  // jF
+}
+
+}  // namespace vmecpp
+
+#endif  // VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
@@ -93,9 +93,10 @@ void ForcesToFourier3DSymmFastPoloidal(
         double zmksc_n = -czmn_seg.dot(sinmui_seg);
 
         // Assemble effective R and Z forces from MHD and spectral condensation
-        // contributions. Materialize to avoid re-evaluation in each dot product,
-        // reusing per-thread scratch so the inner loop allocates nothing (and
-        // stays Enzyme-differentiable). Same arithmetic as a fresh .eval().
+        // contributions. Materialize to avoid re-evaluation in each dot
+        // product, reusing per-thread scratch so the inner loop allocates
+        // nothing (and stays Enzyme-differentiable). Same arithmetic as a fresh
+        // .eval().
         thread_local Eigen::VectorXd tempR_seg, tempZ_seg;
         tempR_seg.noalias() = armn_seg + xmpq[m] * frcon_seg;
         tempZ_seg.noalias() = azmn_seg + xmpq[m] * fzcon_seg;