itpplasma · krystophny · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
@@ -36,7 +36,11 @@ jobs:
           pytest benchmarks/test_benchmarks.py --benchmark-json=benchmark_results.json
 
       - name: Compare benchmark result
-        if: github.event_name == 'pull_request'
+        # Skip the result upload/compare for fork PRs: their GITHUB_TOKEN is
+        # read-only, so comment-on-alert/auto-push hit 'Resource not accessible
+        # by integration'. The benchmarks still run above; only the write-back
+        # is skipped for forks.
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
         uses: benchmark-action/github-action-benchmark@v1.21.0
         with:
           tool: "pytest"

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -50,15 +50,33 @@ jobs:
         run: |
           # install VMEC++ deps as well as VMEC2000 deps (we need to import VMEC2000 in a test)
           sudo apt-get update && sudo apt-get install -y build-essential cmake libnetcdf-dev liblapack-dev libomp-dev
+      - name: Cache the VMEC2000 wheel
+        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
+        uses: actions/cache@v4
+        with:
+          path: /tmp/vmec2000-wheel
+          # Keyed on the pinned VMEC2000 source commit: rebuild only when it changes.
+          key: vmec2000-728af8bd6c79-cp310-ubuntu22.04
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # mpi4py is needed for VMEC2000
-          sudo apt-get install -y libopenmpi-dev
+          sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
+            libopenblas-dev ninja-build
           python -m pip install mpi4py
-          # custom wheel for VMEC2000, needed for some VMEC++/VMEC2000 compatibility tests
-          # NOTE: this wheel is only guaranteed to work on Ubuntu 22.04
-          python -m pip install vmec@https://anaconda.org/eguiraud-pf/vmec/0.0.6/download/vmec-0.0.6-cp310-cp310-linux_x86_64.whl
+          if ! ls /tmp/vmec2000-wheel/vmec-*.whl >/dev/null 2>&1; then
+            # Build with the SYSTEM cmake + ninja; do NOT pip-install the
+            # cmake/ninja wheels (they shadow the system cmake that
+            # scikit-build-core's editable vmecpp rebuild records).
+            python -m pip install numpy scikit-build f90wrap setuptools wheel
+            git clone https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+            git -C /tmp/VMEC2000 checkout 728af8bd6c796b36a0aa85fe298e507791e57c6e
+            cp /tmp/VMEC2000/cmake/machines/ubuntu.json /tmp/VMEC2000/cmake_config_file.json
+            LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+              python -m pip wheel /tmp/VMEC2000 --no-build-isolation -w /tmp/vmec2000-wheel
+          fi
+          python -m pip install /tmp/vmec2000-wheel/vmec-*.whl
+          # fail loudly here if the binding is still broken, not in the test step
+          python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
         run: |
           # on Ubuntu we would not need this, but on MacOS we need to point CMake to gfortran-14 and gcc-14

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -66,10 +66,9 @@ find_package(LAPACK REQUIRED)
 FetchContent_Declare(
   abseil-cpp
   GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-  # 20260107.1 LTS: required for Clang >= 21, where the 2024 pin fails to
-  # compile (absl::Nonnull SFINAE in absl/strings/ascii.cc). Clang is the
-  # compiler used for the Enzyme autodiff build.
-  GIT_TAG 20260107.1
+  # 20260107.1 LTS: older abseil fails to compile under Clang >= 21 (the
+  # Enzyme build) on absl::Nonnull SFINAE in absl/strings/ascii.cc.
+  GIT_TAG 255c84dadd029fd8ad25c5efb5933e47beaa00c7
   GIT_SHALLOW TRUE
 )
 FetchContent_Declare(
@@ -205,4 +204,13 @@ if(VMECPP_ENABLE_ENZYME)
   target_compile_options(enzyme_smoke_test PRIVATE
     -O2 -fplugin=${VMECPP_ENZYME_PLUGIN})
   add_test(NAME enzyme_smoke COMMAND enzyme_smoke_test)
+
+  # Exact autodiff (forward and reverse) of a real VMEC nonlinear kernel: the
+  # half-grid Jacobian. Self-contained over flat buffers; checks Jv and J^T u
+  # against finite differences and against each other (adjoint identity).
+  add_executable(jacobian_kernel_autodiff_test
+    ${PROJECT_SOURCE_DIR}/src/vmecpp/cpp/vmecpp/common/enzyme/jacobian_kernel_autodiff_test.cc)
+  target_compile_options(jacobian_kernel_autodiff_test PRIVATE
+    -O2 -fplugin=${VMECPP_ENZYME_PLUGIN})
+  add_test(NAME jacobian_kernel_autodiff COMMAND jacobian_kernel_autodiff_test)
 endif()
diff --git a/src/vmecpp/cpp/vmecpp/common/enzyme/jacobian_kernel_autodiff_test.cc b/src/vmecpp/cpp/vmecpp/common/enzyme/jacobian_kernel_autodiff_test.cc
@@ -0,0 +1,172 @@
+// SPDX-FileCopyrightText: 2024-present Proxima Fusion GmbH
+// <info@proximafusion.com>
+//
+// SPDX-License-Identifier: MIT
+
+// Exact autodiff of a real VMEC nonlinear kernel, forward vs reverse mode.
+//
+// JacobianKernel reproduces IdealMhdModel::computeJacobian: it maps the
+// full-grid geometry (R, Z and their poloidal derivatives, even/odd parity) to
+// the half-grid metric quantities r12, ru12, zu12, rs, zs and the Jacobian tau.
+// tau is nonlinear in the geometry (products ru12*zs - rs*zu12, ...), so its
+// Jacobian is a genuine building block of the MHD force Hessian (the force is
+// the gradient of VMEC's functional; chain rule composes this kernel's Jacobian
+// with the linear spectral transforms to give the Hessian-vector product).
+//
+// The kernel is written allocation-free over flat buffers (scalar locals,
+// Eigen-free), which is the form Enzyme differentiates. We take:
+//   * a Jacobian-vector product  J v   by forward mode  (__enzyme_fwddiff), and
+//   * a vector-Jacobian product  J^T u  by reverse mode (__enzyme_autodiff),
+// check both against central finite differences, and time them so the
+// forward/reverse trade-off is explicit.
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "vmecpp/common/enzyme/enzyme.h"
+#include "vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h"
+
+// Types used as Enzyme intrinsic arguments need external linkage (an anonymous
+// namespace type cannot instantiate the __enzyme_* templates).
+
+// JacobianKernel is the shared production kernel ComputeHalfGridJacobian
+// (jacobian_kernel.h); differentiated here exactly as the solver uses it.
+
+constexpr int kNgeom = 8;  // r1e r1o z1e z1o rue ruo zue zuo
+constexpr int kNout = 6;   // r12 ru12 zu12 rs zs tau
+
+struct Problem {
+  int nZnT, nsH, n_in, n_out;
+  std::vector<double> geom;  // kNgeom blocks of (nsH+1)*nZnT
+  std::vector<double> sqrtSH;
+  double deltaS, dSHalfDsInterp;
+};
+
+Problem MakeProblem(int nZnT, int nsH) {
+  Problem p{nZnT, nsH, kNgeom * (nsH + 1) * nZnT, kNout * nsH * nZnT, {}, {},
+            0.0,  0.0};
+  std::mt19937 rng(7);
+  std::uniform_real_distribution<double> d(0.5, 1.5);
+  p.geom.resize(p.n_in);
+  for (double& x : p.geom) x = d(rng);
+  p.sqrtSH.resize(nsH);
+  for (int j = 0; j < nsH; ++j) p.sqrtSH[j] = std::sqrt(0.05 + 0.9 * j / nsH);
+  p.deltaS = 0.1;
+  p.dSHalfDsInterp = 0.25;
+  return p;
+}
+
+// Evaluate the kernel from a flat geometry buffer into a flat output buffer.
+__attribute__((noinline)) void Eval(const double* geom, const Problem& p,
+                                    double* out) {
+  const int s = (p.nsH + 1) * p.nZnT;
+  const int o = p.nsH * p.nZnT;
+  vmecpp::ComputeHalfGridJacobian(
+      geom, geom + s, geom + 2 * s, geom + 3 * s, geom + 4 * s, geom + 5 * s,
+      geom + 6 * s, geom + 7 * s, p.sqrtSH.data(), p.deltaS, p.dSHalfDsInterp,
+      p.nZnT, /*nsMinF1=*/0, /*nsMinH=*/0,
+      /*nsMaxH=*/p.nsH, out, out + o, out + 2 * o, out + 3 * o, out + 4 * o,
+      out + 5 * o);
+}
+
+double MaxAbs(const std::vector<double>& a, const std::vector<double>& b) {
+  double m = 0.0;
+  for (size_t i = 0; i < a.size(); ++i)
+    m = std::fmax(m, std::fabs(a[i] - b[i]));
+  return m;
+}
+
+// Scalar objective L(geom) = 0.5 * sum of squares of all kernel outputs. out is
+// caller-owned scratch (an intermediate of the differentiated call). This is
+// the scalar-over-buffers form Enzyme differentiates in both modes.
+__attribute__((noinline)) double Loss(const double* geom, double* out,
+                                      const Problem* p) {
+  Eval(geom, *p, out);
+  double s = 0.0;
+  for (int i = 0; i < p->n_out; ++i) s += 0.5 * out[i] * out[i];
+  return s;
+}
+
+int main() {
+  const Problem p = MakeProblem(/*nZnT=*/32, /*nsH=*/12);
+  std::mt19937 rng(11);
+  std::uniform_real_distribution<double> dist(-1.0, 1.0);
+  std::vector<double> v(p.n_in);
+  for (double& x : v) x = dist(rng);
+
+  // Reverse mode: full gradient dL/dgeom in one pass.
+  std::vector<double> g_rev(p.n_in, 0.0), out_s(p.n_out, 0.0),
+      out_sh(p.n_out, 0.0);
+  __enzyme_autodiff((void*)Loss, enzyme_dup, p.geom.data(), g_rev.data(),
+                    enzyme_dup, out_s.data(), out_sh.data(), enzyme_const, &p);
+
+  // Forward mode: directional derivative dL . v in one pass.
+  std::vector<double> out_t(p.n_out, 0.0);
+  const double d_fwd = __enzyme_fwddiff<double>(
+      (void*)Loss, enzyme_dup, p.geom.data(), v.data(), enzyme_dup,
+      out_s.data(), out_t.data(), enzyme_const, &p);
+
+  // Central finite differences of the directional derivative.
+  const double h = 1e-6;
+  std::vector<double> gp(p.geom), gm(p.geom), os(p.n_out);
+  for (int i = 0; i < p.n_in; ++i) {
+    gp[i] = p.geom[i] + h * v[i];
+    gm[i] = p.geom[i] - h * v[i];
+  }
+  const double d_fd =
+      (Loss(gp.data(), os.data(), &p) - Loss(gm.data(), os.data(), &p)) /
+      (2 * h);
+  const double d_rev =
+      std::inner_product(g_rev.begin(), g_rev.end(), v.begin(), 0.0);
+
+  const double scale = std::fabs(d_fd) + 1e-300;
+  printf("exact autodiff of VMEC Jacobian kernel (n_in=%d n_out=%d)\n", p.n_in,
+         p.n_out);
+  printf("  reverse  dL.v = %.10e   (rel err vs FD %.2e)\n", d_rev,
+         std::fabs(d_rev - d_fd) / scale);
+  printf("  forward  dL.v = %.10e   (rel err vs FD %.2e)\n", d_fwd,
+         std::fabs(d_fwd - d_fd) / scale);
+  printf("  forward/reverse agreement: %.2e\n",
+         std::fabs(d_fwd - d_rev) / (std::fabs(d_rev) + 1e-300));
+
+  // Performance: reverse returns the whole gradient (n_in) in one pass; forward
+  // returns one directional derivative per pass. For a scalar objective reverse
+  // wins; for a single Jacobian/Hessian-vector product forward is the cheaper
+  // primitive. Time both.
+  const int reps = 2000;
+  auto t0 = std::chrono::steady_clock::now();
+  for (int r = 0; r < reps; ++r) {
+    std::fill(g_rev.begin(), g_rev.end(), 0.0);
+    __enzyme_autodiff((void*)Loss, enzyme_dup, p.geom.data(), g_rev.data(),
+                      enzyme_dup, out_s.data(), out_sh.data(), enzyme_const,
+                      &p);
+  }
+  auto t1 = std::chrono::steady_clock::now();
+  for (int r = 0; r < reps; ++r) {
+    out_t.assign(p.n_out, 0.0);
+    volatile double s = __enzyme_fwddiff<double>(
+        (void*)Loss, enzyme_dup, p.geom.data(), v.data(), enzyme_dup,
+        out_s.data(), out_t.data(), enzyme_const, &p);
+    (void)s;
+  }
+  auto t2 = std::chrono::steady_clock::now();
+  const double us_rev =
+      std::chrono::duration<double, std::micro>(t1 - t0).count() / reps;
+  const double us_fwd =
+      std::chrono::duration<double, std::micro>(t2 - t1).count() / reps;
+  printf(
+      "  performance: reverse %.2f us/pass (full gradient), forward %.2f "
+      "us/pass (one direction)\n",
+      us_rev, us_fwd);
+
+  const bool ok = std::fabs(d_rev - d_fd) < 1e-5 * scale &&
+                  std::fabs(d_fwd - d_fd) < 1e-5 * scale &&
+                  std::fabs(d_fwd - d_rev) < 1e-9 * (std::fabs(d_rev) + 1e-300);
+  printf("%s\n", ok ? "PASS" : "FAIL");
+  return ok ? 0 : 1;
+}
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
@@ -82,7 +82,16 @@ cc_test(
 cc_library(
     name = "ideal_mhd_model",
     srcs = ["ideal_mhd_model.cc"],
-    hdrs = ["ideal_mhd_model.h", "dft_data.h"],
+# The shared force-chain kernels and the autodiff composition/JVP header are
+    # header-only and included by ideal_mhd_model.cc; declare whatever exists on
+    # this branch so the bazel sandbox can see them. The Enzyme JVP entry point
+    # (exact_force_jvp.cc) is built only by the CMake VMECPP_ENABLE_ENZYME path,
+    # not by bazel; its use in ideal_mhd_model.cc is guarded by that define.
+    hdrs = ["ideal_mhd_model.h", "dft_data.h"] + glob([
+        "*_kernel.h",
+        "local_force_composition.h",
+        "exact_force_jvp.h",
+    ]),
     visibility = ["//visibility:public"],
     deps = [
         ":dft_toroidal",

diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
@@ -20,6 +20,7 @@
 #include "vmecpp/common/util/util.h"
 #include "vmecpp/vmec/fourier_geometry/fourier_geometry.h"
 #include "vmecpp/vmec/handover_storage/handover_storage.h"
+#include "vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h"
 #include "vmecpp/vmec/radial_partitioning/radial_partitioning.h"
 #include "vmecpp/vmec/radial_profiles/radial_profiles.h"
 #include "vmecpp/vmec/vmec_constants/vmec_algorithm_constants.h"
@@ -1194,95 +1195,30 @@ void IdealMhdModel::rzConIntoVolume() {
 }
 
 void IdealMhdModel::computeJacobian() {
-  // r12, ru12, zu12, rs, zs, tau
-
+  // Half-grid r12, ru12, zu12, rs, zs and the Jacobian tau. The arithmetic
+  // lives in the shared, allocation-free kernel (jacobian_kernel.h) so the
+  // solver and the Enzyme autodiff test use one implementation.
+  ComputeHalfGridJacobian(
+      r1_e.data(), r1_o.data(), z1_e.data(), z1_o.data(), ru_e.data(),
+      ru_o.data(), zu_e.data(), zu_o.data(), m_p_.sqrtSH.data(), m_fc_.deltaS,
+      dSHalfDsInterp, s_.nZnT, r_.nsMinF1, r_.nsMinH, r_.nsMaxH, r12.data(),
+      ru12.data(), zu12.data(), rs.data(), zs.data(), tau.data());
+
+  // Jacobian sign check: same running min/max over tau as before, now scanned
+  // after the kernel (identical values, hence identical verdict).
   double minTau = 0.0;
   double maxTau = 0.0;
-
-  // contributions from full-grid surface _i_nside j-th half-grid surface
-  int j0 = r_.nsMinF1;
-  for (int kl = 0; kl < s_.nZnT; ++kl) {
-    m_ls_.r1e_i[kl] = r1_e[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.r1o_i[kl] = r1_o[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.z1e_i[kl] = z1_e[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.z1o_i[kl] = z1_o[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.rue_i[kl] = ru_e[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.ruo_i[kl] = ru_o[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.zue_i[kl] = zu_e[(j0 - r_.nsMinF1) * s_.nZnT + kl];
-    m_ls_.zuo_i[kl] = zu_o[(j0 - r_.nsMinF1) * s_.nZnT + kl];
+  const int nTau = (r_.nsMaxH - r_.nsMinH) * s_.nZnT;
+  for (int i = 0; i < nTau; ++i) {
+    const double t = tau[i];
+    if (t < minTau || minTau == 0.0) {
+      minTau = t;
+    }
+    if (t > maxTau || maxTau == 0.0) {
+      maxTau = t;
+    }
   }
 
-  for (int jH = r_.nsMinH; jH < r_.nsMaxH; ++jH) {
-    // sqrt(s) on j-th half-grid pos
-    double sqrtSH = m_p_.sqrtSH[jH - r_.nsMinH];
-
-    for (int kl = 0; kl < s_.nZnT; ++kl) {
-      // contributions from full-grid surface _o_utside j-th half-grid surface
-      double r1e_o = r1_e[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double r1o_o = r1_o[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double z1e_o = z1_e[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double z1o_o = z1_o[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double rue_o = ru_e[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double ruo_o = ru_o[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double zue_o = zu_e[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-      double zuo_o = zu_o[(jH + 1 - r_.nsMinF1) * s_.nZnT + kl];
-
-      int iHalf = (jH - r_.nsMinH) * s_.nZnT + kl;
-
-      // R on half-grid
-      r12[iHalf] = 0.5 * ((m_ls_.r1e_i[kl] + r1e_o) +
-                          sqrtSH * (m_ls_.r1o_i[kl] + r1o_o));
-
-      // dRdTheta on half-grid
-      ru12[iHalf] = 0.5 * ((m_ls_.rue_i[kl] + rue_o) +
-                           sqrtSH * (m_ls_.ruo_i[kl] + ruo_o));
-
-      // dZdTheta on half-grid
-      zu12[iHalf] = 0.5 * ((m_ls_.zue_i[kl] + zue_o) +
-                           sqrtSH * (m_ls_.zuo_i[kl] + zuo_o));
-
-      // \tilde{dRds} on half-grid
-      rs[iHalf] =
-          ((r1e_o - m_ls_.r1e_i[kl]) + sqrtSH * (r1o_o - m_ls_.r1o_i[kl])) /
-          m_fc_.deltaS;
-
-      // \tilde{dZds} on half-grid
-      zs[iHalf] =
-          ((z1e_o - m_ls_.z1e_i[kl]) + sqrtSH * (z1o_o - m_ls_.z1o_i[kl])) /
-          m_fc_.deltaS;
-
-      // sqrt(g)/R on half-grid: assemble as governed by product rule
-      double tau1 = ru12[iHalf] * zs[iHalf] - rs[iHalf] * zu12[iHalf];
-      double tau2 = ruo_o * z1o_o + m_ls_.ruo_i[kl] * m_ls_.z1o_i[kl] -
-                    zuo_o * r1o_o - m_ls_.zuo_i[kl] * m_ls_.r1o_i[kl] +
-                    (rue_o * z1o_o + m_ls_.rue_i[kl] * m_ls_.z1o_i[kl] -
-                     zue_o * r1o_o - m_ls_.zue_i[kl] * m_ls_.r1o_i[kl]) /
-                        sqrtSH;
-      double tau_val = tau1 + dSHalfDsInterp * tau2;
-
-      if (tau_val < minTau || minTau == 0.0) {
-        minTau = tau_val;
-      }
-      if (tau_val > maxTau || maxTau == 0.0) {
-        maxTau = tau_val;
-      }
-
-      tau[iHalf] = tau_val;
-
-      // hand over to next iteration of radial loop
-      // --> what was outside in this loop iteration will be inside for next
-      // half-grid location
-      m_ls_.r1e_i[kl] = r1e_o;
-      m_ls_.r1o_i[kl] = r1o_o;
-      m_ls_.z1e_i[kl] = z1e_o;
-      m_ls_.z1o_i[kl] = z1o_o;
-      m_ls_.rue_i[kl] = rue_o;
-      m_ls_.ruo_i[kl] = ruo_o;
-      m_ls_.zue_i[kl] = zue_o;
-      m_ls_.zuo_i[kl] = zuo_o;
-    }  // kl
-  }  // j
-
   bool localBadJacobian =
       (minTau * maxTau < 0.0) || !std::isfinite(minTau * maxTau);