diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml
index c2f3fdf4e..54326ec6a 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -36,7 +36,11 @@ jobs:
           pytest benchmarks/test_benchmarks.py --benchmark-json=benchmark_results.json
 
       - name: Compare benchmark result
-        if: github.event_name == 'pull_request'
+        # Skip the result upload/compare for fork PRs: their GITHUB_TOKEN is
+        # read-only, so comment-on-alert/auto-push hit 'Resource not accessible
+        # by integration'. The benchmarks still run above; only the write-back
+        # is skipped for forks.
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
         uses: benchmark-action/github-action-benchmark@v1.21.0
         with:
           tool: "pytest"
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index a22e0cbdc..0f68ab6e9 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -50,15 +50,33 @@ jobs:
         run: |
           # install VMEC++ deps as well as VMEC2000 deps (we need to import VMEC2000 in a test)
           sudo apt-get update && sudo apt-get install -y build-essential cmake libnetcdf-dev liblapack-dev libomp-dev
+      - name: Cache the VMEC2000 wheel
+        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
+        uses: actions/cache@v4
+        with:
+          path: /tmp/vmec2000-wheel
+          # Keyed on the pinned VMEC2000 source commit: rebuild only when it changes.
+          key: vmec2000-728af8bd6c79-cp310-ubuntu22.04
       - name: Also install VMEC2000 (only on Ubuntu 22.04)
         if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10' }}
         run: |
-          # mpi4py is needed for VMEC2000
-          sudo apt-get install -y libopenmpi-dev
+          sudo apt-get install -y libopenmpi-dev libnetcdff-dev libscalapack-mpi-dev \
+            libopenblas-dev ninja-build
           python -m pip install mpi4py
-          # custom wheel for VMEC2000, needed for some VMEC++/VMEC2000 compatibility tests
-          # NOTE: this wheel is only guaranteed to work on Ubuntu 22.04
-          python -m pip install vmec@https://anaconda.org/eguiraud-pf/vmec/0.0.6/download/vmec-0.0.6-cp310-cp310-linux_x86_64.whl
+          if ! ls /tmp/vmec2000-wheel/vmec-*.whl >/dev/null 2>&1; then
+            # Build with the SYSTEM cmake + ninja; do NOT pip-install the
+            # cmake/ninja wheels (they shadow the system cmake that
+            # scikit-build-core's editable vmecpp rebuild records).
+            python -m pip install numpy scikit-build f90wrap setuptools wheel
+            git clone https://github.com/hiddenSymmetries/VMEC2000.git /tmp/VMEC2000
+            git -C /tmp/VMEC2000 checkout 728af8bd6c796b36a0aa85fe298e507791e57c6e
+            cp /tmp/VMEC2000/cmake/machines/ubuntu.json /tmp/VMEC2000/cmake_config_file.json
+            LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
+              python -m pip wheel /tmp/VMEC2000 --no-build-isolation -w /tmp/vmec2000-wheel
+          fi
+          python -m pip install /tmp/vmec2000-wheel/vmec-*.whl
+          # fail loudly here if the binding is still broken, not in the test step
+          python -c "import vmec; print('VMEC2000 import OK')"
       - name: Install package
         run: |
           # on Ubuntu we would not need this, but on MacOS we need to point CMake to gfortran-14 and gcc-14
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 813c0fe9c..b4b64d23f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,10 +66,9 @@ find_package(LAPACK REQUIRED)
 FetchContent_Declare(
   abseil-cpp
   GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-  # 20260107.1 LTS: required for Clang >= 21, where the 2024 pin fails to
-  # compile (absl::Nonnull SFINAE in absl/strings/ascii.cc). Clang is the
-  # compiler used for the Enzyme autodiff build.
-  GIT_TAG 20260107.1
+  # 20260107.1 LTS: older abseil fails to compile under Clang >= 21 (the
+  # Enzyme build) on absl::Nonnull SFINAE in absl/strings/ascii.cc.
+  GIT_TAG 255c84dadd029fd8ad25c5efb5933e47beaa00c7
   GIT_SHALLOW TRUE
 )
 FetchContent_Declare(
diff --git a/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc b/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
index 082d04030..84cf939fb 100644
--- a/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
+++ b/src/vmecpp/cpp/vmecpp/common/enzyme/local_force_hessian_test.cc
@@ -36,8 +36,8 @@
 
 // Problem dimensions and the constant (non-differentiated) context.
 struct Ctx {
-  int nZnT, nsH;       // half-grid surfaces; full-grid has nsH + 1
-  int nFull, nHalf;    // (nsH+1)*nZnT, nsH*nZnT
+  int nZnT, nsH;         // half-grid surfaces; full-grid has nsH + 1
+  int nFull, nHalf;      // (nsH+1)*nZnT, nsH*nZnT
   const double* sqrtSF;  // [nsH+1]
   const double* sqrtSH;  // [nsH]
   const double* chipH;   // [nsH]
@@ -46,13 +46,13 @@ struct Ctx {
   bool lthreed;
 };
 
-// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each nHalf)
-// in force; everything else is intermediate work sliced from work.
+// 16 geometry blocks (each nFull) packed into geom; 12 force blocks (each
+// nHalf) in force; everything else is intermediate work sliced from work.
 enum { kGeomBlocks = 16, kForceBlocks = 12 };
 
 // g: geometry -> force density, composing the six shared kernels.
 __attribute__((noinline)) void LocalForce(const double* geom, double* work,
-                                           double* force, const Ctx* c) {
+                                          double* force, const Ctx* c) {
   const int nF = c->nFull;
   const int nH = c->nHalf;
   const int nZnT = c->nZnT, nsH = c->nsH;
@@ -75,30 +75,45 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   const double* lvo = geom + 15 * nF;
   // half-grid intermediates from work
   double* p = work;
-  double* r12 = p; p += nH;
-  double* ru12 = p; p += nH;
-  double* zu12 = p; p += nH;
-  double* rs = p; p += nH;
-  double* zs = p; p += nH;
-  double* tau = p; p += nH;
-  double* gsqrt = p; p += nH;
-  double* guu = p; p += nH;
-  double* guv = p; p += nH;
-  double* gvv = p; p += nH;
-  double* bsupu = p; p += nH;
-  double* bsupv = p; p += nH;
-  double* bsubu = p; p += nH;
-  double* bsubv = p; p += nH;
-  double* tp = p; p += nH;
+  double* r12 = p;
+  p += nH;
+  double* ru12 = p;
+  p += nH;
+  double* zu12 = p;
+  p += nH;
+  double* rs = p;
+  p += nH;
+  double* zs = p;
+  p += nH;
+  double* tau = p;
+  p += nH;
+  double* gsqrt = p;
+  p += nH;
+  double* guu = p;
+  p += nH;
+  double* guv = p;
+  p += nH;
+  double* gvv = p;
+  p += nH;
+  double* bsupu = p;
+  p += nH;
+  double* bsupv = p;
+  p += nH;
+  double* bsubu = p;
+  p += nH;
+  double* bsubv = p;
+  p += nH;
+  double* tp = p;
+  p += nH;
   // per-nZnT scratch for the force kernel (26 blocks)
   double* sc = p;  // 26 * nZnT
 
-  vmecpp::ComputeHalfGridJacobian(r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH,
-                          c->deltaS, /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH,
-                          r12, ru12, zu12, rs, zs, tau);
-  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve, zvo,
-                                tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed, nZnT,
-                                0, 0, nsH, gsqrt, guu, guv, gvv);
+  vmecpp::ComputeHalfGridJacobian(
+      r1e, r1o, z1e, z1o, rue, ruo, zue, zuo, c->sqrtSH, c->deltaS,
+      /*dSHalfDsInterp=*/0.25, nZnT, 0, 0, nsH, r12, ru12, zu12, rs, zs, tau);
+  vmecpp::ComputeMetricElements(r1e, r1o, rue, ruo, zue, zuo, rve, rvo, zve,
+                                zvo, tau, r12, c->sqrtSF, c->sqrtSH, c->lthreed,
+                                nZnT, 0, 0, nsH, gsqrt, guu, guv, gvv);
   vmecpp::ComputeBsupContra(lue, luo, lve, lvo, gsqrt, c->sqrtSH, c->lthreed,
                             nZnT, 0, 0, nsH, bsupu, bsupv);
   for (int jH = 0; jH < nsH; ++jH) {
@@ -113,19 +128,58 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
     for (int kl = 0; kl < nZnT; ++kl) tp[jH * nZnT + kl] += c->presH[jH];
   }
   double* s = sc;
-  double* P_i = s; s += nZnT; double* rup_i = s; s += nZnT;
-  double* zup_i = s; s += nZnT; double* rsp_i = s; s += nZnT;
-  double* zsp_i = s; s += nZnT; double* taup_i = s; s += nZnT;
-  double* gbubu_i = s; s += nZnT; double* gbubv_i = s; s += nZnT;
-  double* gbvbv_i = s; s += nZnT; double* P_o = s; s += nZnT;
-  double* rup_o = s; s += nZnT; double* zup_o = s; s += nZnT;
-  double* rsp_o = s; s += nZnT; double* zsp_o = s; s += nZnT;
-  double* taup_o = s; s += nZnT; double* gbubu_o = s; s += nZnT;
-  double* gbubv_o = s; s += nZnT; double* gbvbv_o = s; s += nZnT;
-  double* P_avg = s; s += nZnT; double* P_wavg = s; s += nZnT;
-  double* gbubu_avg = s; s += nZnT; double* gbubu_wavg = s; s += nZnT;
-  double* gbvbv_avg = s; s += nZnT; double* gbvbv_wavg = s; s += nZnT;
-  double* gbubv_avg = s; s += nZnT; double* gbubv_wavg = s; s += nZnT;
+  double* P_i = s;
+  s += nZnT;
+  double* rup_i = s;
+  s += nZnT;
+  double* zup_i = s;
+  s += nZnT;
+  double* rsp_i = s;
+  s += nZnT;
+  double* zsp_i = s;
+  s += nZnT;
+  double* taup_i = s;
+  s += nZnT;
+  double* gbubu_i = s;
+  s += nZnT;
+  double* gbubv_i = s;
+  s += nZnT;
+  double* gbvbv_i = s;
+  s += nZnT;
+  double* P_o = s;
+  s += nZnT;
+  double* rup_o = s;
+  s += nZnT;
+  double* zup_o = s;
+  s += nZnT;
+  double* rsp_o = s;
+  s += nZnT;
+  double* zsp_o = s;
+  s += nZnT;
+  double* taup_o = s;
+  s += nZnT;
+  double* gbubu_o = s;
+  s += nZnT;
+  double* gbubv_o = s;
+  s += nZnT;
+  double* gbvbv_o = s;
+  s += nZnT;
+  double* P_avg = s;
+  s += nZnT;
+  double* P_wavg = s;
+  s += nZnT;
+  double* gbubu_avg = s;
+  s += nZnT;
+  double* gbubu_wavg = s;
+  s += nZnT;
+  double* gbvbv_avg = s;
+  s += nZnT;
+  double* gbvbv_wavg = s;
+  s += nZnT;
+  double* gbubv_avg = s;
+  s += nZnT;
+  double* gbubv_wavg = s;
+  s += nZnT;
   double* armn_e = force + 0 * nH;
   double* armn_o = force + 1 * nH;
   double* azmn_e = force + 2 * nH;
@@ -139,17 +193,18 @@ __attribute__((noinline)) void LocalForce(const double* geom, double* work,
   double* czmn_e = force + 10 * nH;
   double* czmn_o = force + 11 * nH;
   vmecpp::ComputeMHDForceDensity(
-      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12, rs,
-      zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i, zup_i,
-      rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o, rsp_o,
-      zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
+      r1e, r1o, rue, ruo, zue, zuo, z1o, rve, rvo, zve, zvo, r12, ru12, zu12,
+      rs, zs, tau, tp, gsqrt, bsupu, bsupv, c->sqrtSF, c->sqrtSH, P_i, rup_i,
+      zup_i, rsp_i, zsp_i, taup_i, gbubu_i, gbubv_i, gbvbv_i, P_o, rup_o, zup_o,
+      rsp_o, zsp_o, taup_o, gbubu_o, gbubv_o, gbvbv_o, P_avg, P_wavg, gbubu_avg,
       gbubu_wavg, gbvbv_avg, gbvbv_wavg, gbubv_avg, gbubv_wavg, c->deltaS, nZnT,
       /*nsMinF=*/0, 0, 0, nsH, /*jMaxRZ=*/nsH, c->lthreed, armn_e, armn_o,
       azmn_e, azmn_o, brmn_e, brmn_o, bzmn_e, bzmn_o, crmn_e, crmn_o, czmn_e,
       czmn_o);
 }
 
-// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned scratch.
+// Scalar objective L = 0.5 ||force||^2; work and force are caller-owned
+// scratch.
 __attribute__((noinline)) double Loss(const double* geom, double* work,
                                       double* force, const Ctx* c) {
   LocalForce(geom, work, force, c);
@@ -206,17 +261,19 @@ int main() {
     gp[i] = geom[i] + h * v[i];
     gm[i] = geom[i] - h * v[i];
   }
-  const double dfd =
-      (Loss(gp.data(), w2.data(), f2.data(), &c) -
-       Loss(gm.data(), w2.data(), f2.data(), &c)) /
-      (2 * h);
-  const double drev = std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
+  const double dfd = (Loss(gp.data(), w2.data(), f2.data(), &c) -
+                      Loss(gm.data(), w2.data(), f2.data(), &c)) /
+                     (2 * h);
+  const double drev =
+      std::inner_product(grev.begin(), grev.end(), v.begin(), 0.0);
   const double scale = std::fabs(dfd) + 1e-300;
 
   printf("exact Hessian of VMEC local force map (composed 6 kernels)\n");
   printf("  geom dofs=%d  force outputs=%d\n", nG, nFc);
-  printf("  reverse dL.v vs finite-diff : %.2e\n", std::fabs(drev - dfd) / scale);
-  printf("  forward dL.v vs finite-diff : %.2e\n", std::fabs(dfwd - dfd) / scale);
+  printf("  reverse dL.v vs finite-diff : %.2e\n",
+         std::fabs(drev - dfd) / scale);
+  printf("  forward dL.v vs finite-diff : %.2e\n",
+         std::fabs(dfwd - dfd) / scale);
   printf("  forward / reverse agreement : %.2e\n",
          std::fabs(dfwd - drev) / (std::fabs(drev) + 1e-300));
 
@@ -225,7 +282,8 @@ int main() {
   for (int r = 0; r < reps; ++r) {
     volatile double q = __enzyme_fwddiff<double>(
         (void*)Loss, enzyme_dup, geom.data(), v.data(), enzyme_dup, work.data(),
-        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const, &c);
+        dwork.data(), enzyme_dup, force.data(), dforce.data(), enzyme_const,
+        &c);
     (void)q;
   }
   auto t1 = std::chrono::steady_clock::now();
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
index 5fca72477..e8df6160b 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/BUILD.bazel
@@ -82,7 +82,16 @@ cc_test(
 cc_library(
     name = "ideal_mhd_model",
     srcs = ["ideal_mhd_model.cc"],
-    hdrs = ["ideal_mhd_model.h", "dft_data.h"],
+# The shared force-chain kernels and the autodiff composition/JVP header are
+    # header-only and included by ideal_mhd_model.cc; declare whatever exists on
+    # this branch so the bazel sandbox can see them. The Enzyme JVP entry point
+    # (exact_force_jvp.cc) is built only by the CMake VMECPP_ENABLE_ENZYME path,
+    # not by bazel; its use in ideal_mhd_model.cc is guarded by that define.
+    hdrs = ["ideal_mhd_model.h", "dft_data.h"] + glob([
+        "*_kernel.h",
+        "local_force_composition.h",
+        "exact_force_jvp.h",
+    ]),
     visibility = ["//visibility:public"],
     deps = [
         ":dft_toroidal",
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
new file mode 100644
index 000000000..b5078a6f1
--- /dev/null
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: 2024-present Proxima Fusion GmbH
+// <info@proximafusion.com>
+//
+// SPDX-License-Identifier: MIT
+#ifndef VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+#define VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
+
+#include <algorithm>
+
+namespace vmecpp {
+
+// The two local (non-transform) pieces of VMEC's spectral-condensation
+// constraint force. The Fourier-space bandpass between them is the separate
+// shared free function deAliasConstraintForce. All full-grid buffers are
+// indexed (jF-nsMinF)*nZnT except sqrtSF, which is indexed (jF-nsMinF1).
+// Allocation-free, shared between the solver and the Enzyme autodiff path.
+
+// Effective constraint force gConEff = (rCon - rCon0) ru + (zCon - zCon0) zu.
+// No constraint on the axis (it has no poloidal angle), so the axis surface is
+// skipped, matching deAliasConstraintForce.
+inline void ComputeEffectiveConstraintForce(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull, int nZnT,
+    int nsMinF, int nsMaxFIncludingLcfs, double* gConEff) {
+  int jMin = 0;
+  if (nsMinF == 0) {
+    jMin = 1;
+  }
+  for (int jF = std::max(jMin, nsMinF); jF < nsMaxFIncludingLcfs; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+      gConEff[idx_kl] = (rCon[idx_kl] - rCon0[idx_kl]) * ruFull[idx_kl] +
+                        (zCon[idx_kl] - zCon0[idx_kl]) * zuFull[idx_kl];
+    }  // kl
+  }  // jF
+}
+
+// Add the bandpass-filtered constraint force gCon back into the MHD R/Z forces
+// (brmn, bzmn) and write the constraint-force outputs (frcon, fzcon).
+inline void AddConstraintForces(
+    const double* rCon, const double* rCon0, const double* zCon,
+    const double* zCon0, const double* ruFull, const double* zuFull,
+    const double* gCon, const double* sqrtSF, int nZnT, int nsMinF, int nsMinF1,
+    int nsMaxF, double* brmn_e, double* brmn_o, double* bzmn_e, double* bzmn_o,
+    double* frcon_e, double* frcon_o, double* fzcon_e, double* fzcon_o) {
+  for (int jF = nsMinF; jF < nsMaxF; ++jF) {
+    for (int kl = 0; kl < nZnT; ++kl) {
+      int idx_kl = (jF - nsMinF) * nZnT + kl;
+
+      double brcon = (rCon[idx_kl] - rCon0[idx_kl]) * gCon[idx_kl];
+      double bzcon = (zCon[idx_kl] - zCon0[idx_kl]) * gCon[idx_kl];
+
+      brmn_e[idx_kl] += brcon;
+      bzmn_e[idx_kl] += bzcon;
+      brmn_o[idx_kl] += brcon * sqrtSF[jF - nsMinF1];
+      bzmn_o[idx_kl] += bzcon * sqrtSF[jF - nsMinF1];
+
+      frcon_e[idx_kl] = ruFull[idx_kl] * gCon[idx_kl];
+      fzcon_e[idx_kl] = zuFull[idx_kl] * gCon[idx_kl];
+      frcon_o[idx_kl] = frcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+      fzcon_o[idx_kl] = fzcon_e[idx_kl] * sqrtSF[jF - nsMinF1];
+    }  // kl
+  }  // jF
+}
+
+}  // namespace vmecpp
+
+#endif  // VMECPP_VMEC_IDEAL_MHD_MODEL_CONSTRAINT_FORCE_KERNEL_H_
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
index fb3a6e2db..279386d9e 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/dft_toroidal.cc
@@ -34,6 +34,11 @@ void ForcesToFourier3DSymmFastPoloidal(
   // axis lambda stays zero (no contribution from any m)
   const int jMinL = 1;
 
+  // Reused scratch for the assembled R/Z forces: sized on first use, so the
+  // inner loop allocates nothing. Per-thread safe via the stack frame, since
+  // each OpenMP thread runs this on its own radial slice.
+  Eigen::VectorXd tempR_seg, tempZ_seg;
+
   for (int jF = rp.nsMinF; jF < jMaxRZ; ++jF) {
     const int mmax = jF == 0 ? 1 : s.mpol;
     for (int m = 0; m < mmax; ++m) {
@@ -93,10 +98,9 @@ void ForcesToFourier3DSymmFastPoloidal(
         double zmksc_n = -czmn_seg.dot(sinmui_seg);
 
         // Assemble effective R and Z forces from MHD and spectral condensation
-        // contributions. Materialize to avoid re-evaluation in each dot product,
-        // reusing per-thread scratch so the inner loop allocates nothing (and
-        // stays Enzyme-differentiable). Same arithmetic as a fresh .eval().
-        thread_local Eigen::VectorXd tempR_seg, tempZ_seg;
+        // contributions. Materialize to avoid re-evaluation in each dot
+        // product, reusing the function-scope scratch so the inner loop
+        // allocates nothing. Same arithmetic as a fresh .eval().
         tempR_seg.noalias() = armn_seg + xmpq[m] * frcon_seg;
         tempZ_seg.noalias() = azmn_seg + xmpq[m] * fzcon_seg;
 
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
index 8fc57ad30..1232d6e41 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/ideal_mhd_model.cc
@@ -22,6 +22,7 @@
 #include "vmecpp/vmec/handover_storage/handover_storage.h"
 #include "vmecpp/vmec/ideal_mhd_model/bco_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/bcontra_kernel.h"
+#include "vmecpp/vmec/ideal_mhd_model/constraint_force_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h"
 #include "vmecpp/vmec/ideal_mhd_model/metric_kernel.h"
@@ -1607,8 +1608,8 @@ void IdealMhdModel::hybridLambdaForce() {
 #pragma omp barrier
 #endif  // _OPENMP
 
-  // Lambda force on the full grid via the shared kernel (lambda_force_kernel.h),
-  // also used by the Enzyme autodiff path.
+  // Lambda force on the full grid via the shared kernel
+  // (lambda_force_kernel.h), also used by the Enzyme autodiff path.
   ComputeHybridLambdaForce(
       bsubu.data(), bsubv.data(), gvv.data(), gsqrt.data(), guv.data(),
       bsupu.data(), lu_e.data(), lu_o.data(), m_p_.sqrtSH.data(),
@@ -2043,21 +2044,12 @@ absl::Status IdealMhdModel::constraintForceMultiplier() {
 }
 
 void IdealMhdModel::effectiveConstraintForce() {
-  // gConEff
-
-  // no constraint on axis --> has no poloidal angle
-  int jMin = 0;
-  if (r_.nsMinF == 0) {
-    jMin = 1;
-  }
-
-  for (int jF = std::max(jMin, r_.nsMinF); jF < r_.nsMaxFIncludingLcfs; ++jF) {
-    for (int kl = 0; kl < s_.nZnT; ++kl) {
-      int idx_kl = (jF - r_.nsMinF) * s_.nZnT + kl;
-      gConEff[idx_kl] = (rCon[idx_kl] - rCon0[idx_kl]) * ruFull[idx_kl] +
-                        (zCon[idx_kl] - zCon0[idx_kl]) * zuFull[idx_kl];
-    }  // kl
-  }  // jF
+  // gConEff via the shared kernel (constraint_force_kernel.h), also used by the
+  // Enzyme autodiff path.
+  ComputeEffectiveConstraintForce(rCon.data(), rCon0.data(), zCon.data(),
+                                  zCon0.data(), ruFull.data(), zuFull.data(),
+                                  s_.nZnT, r_.nsMinF, r_.nsMaxFIncludingLcfs,
+                                  gConEff.data());
 }
 
 // perform Fourier-space bandpass filtering of constraint force
@@ -2088,24 +2080,15 @@ void IdealMhdModel::assembleTotalForces() {
     }
   }
 
-  for (int jF = r_.nsMinF; jF < r_.nsMaxF; ++jF) {
-    for (int kl = 0; kl < s_.nZnT; ++kl) {
-      int idx_kl = (jF - r_.nsMinF) * s_.nZnT + kl;
-
-      double brcon = (rCon[idx_kl] - rCon0[idx_kl]) * gCon[idx_kl];
-      double bzcon = (zCon[idx_kl] - zCon0[idx_kl]) * gCon[idx_kl];
-
-      brmn_e[idx_kl] += brcon;
-      bzmn_e[idx_kl] += bzcon;
-      brmn_o[idx_kl] += brcon * m_p_.sqrtSF[jF - r_.nsMinF1];
-      bzmn_o[idx_kl] += bzcon * m_p_.sqrtSF[jF - r_.nsMinF1];
-
-      frcon_e[idx_kl] = ruFull[idx_kl] * gCon[idx_kl];
-      fzcon_e[idx_kl] = zuFull[idx_kl] * gCon[idx_kl];
-      frcon_o[idx_kl] = frcon_e[idx_kl] * m_p_.sqrtSF[jF - r_.nsMinF1];
-      fzcon_o[idx_kl] = fzcon_e[idx_kl] * m_p_.sqrtSF[jF - r_.nsMinF1];
-    }
-  }
+  // add the bandpass-filtered constraint force to the MHD R/Z forces and write
+  // the constraint outputs via the shared kernel (constraint_force_kernel.h),
+  // also used by the Enzyme autodiff path.
+  AddConstraintForces(rCon.data(), rCon0.data(), zCon.data(), zCon0.data(),
+                      ruFull.data(), zuFull.data(), gCon.data(),
+                      m_p_.sqrtSF.data(), s_.nZnT, r_.nsMinF, r_.nsMinF1,
+                      r_.nsMaxF, brmn_e.data(), brmn_o.data(), bzmn_e.data(),
+                      bzmn_o.data(), frcon_e.data(), frcon_o.data(),
+                      fzcon_e.data(), fzcon_o.data());
 }
 
 void IdealMhdModel::forcesToFourier(FourierForces& m_physical_f) {
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
index 11cc24ecb..79ce3a39c 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/jacobian_kernel.h
@@ -23,11 +23,14 @@ namespace vmecpp {
 // half-grid; sqrtSH is indexed jH - nsMinH. The half-grid point jH sits between
 // full-grid surfaces jH (inside) and jH + 1 (outside).
 inline void ComputeHalfGridJacobian(
-    const double* r1e, const double* r1o, const double* z1e, const double* z1o,
-    const double* rue, const double* ruo, const double* zue, const double* zuo,
-    const double* sqrtSH, double deltaS, double dSHalfDsInterp, int nZnT,
-    int nsMinF1, int nsMinH, int nsMaxH, double* r12, double* ru12,
-    double* zu12, double* rs, double* zs, double* tau) {
+    const double* __restrict r1e, const double* __restrict r1o,
+    const double* __restrict z1e, const double* __restrict z1o,
+    const double* __restrict rue, const double* __restrict ruo,
+    const double* __restrict zue, const double* __restrict zuo,
+    const double* __restrict sqrtSH, double deltaS, double dSHalfDsInterp,
+    int nZnT, int nsMinF1, int nsMinH, int nsMaxH, double* __restrict r12,
+    double* __restrict ru12, double* __restrict zu12, double* __restrict rs,
+    double* __restrict zs, double* __restrict tau) {
   for (int jH = nsMinH; jH < nsMaxH; ++jH) {
     const double sH = sqrtSH[jH - nsMinH];
     for (int kl = 0; kl < nZnT; ++kl) {
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
index be7d54599..105df3730 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/lambda_force_kernel.h
@@ -9,11 +9,11 @@ namespace vmecpp {
 
 // Hybrid lambda force on the full grid: blmn (and clmn in 3D). The covariant
 // field bsubv is interpolated from the half grid two ways (a plain average and
-// an alternative from gvv/gsqrt times lambda plus, in 3D, guv*bsupu) and blended
-// with the radialBlending profile. The radial sweep carries the inside half-grid
-// point in scratch (bsubu_i, bsubv_i, gvv_gsqrt_i, guv_bsupu_i, each nZnT) and
-// shifts it outward each surface. Allocation-free over flat buffers, shared
-// between IdealMhdModel::hybridLambdaForce and the Enzyme autodiff path.
+// an alternative from gvv/gsqrt times lambda plus, in 3D, guv*bsupu) and
+// blended with the radialBlending profile. The radial sweep carries the inside
+// half-grid point in scratch (bsubu_i, bsubv_i, gvv_gsqrt_i, guv_bsupu_i, each
+// nZnT) and shifts it outward each surface. Allocation-free over flat buffers,
+// shared between IdealMhdModel::hybridLambdaForce and the Enzyme autodiff path.
 //
 // Half-grid inputs (index (jH-nsMinH)*nZnT): bsubu, bsubv, gvv, gsqrt, guv,
 // bsupu. Full-grid lambda (index (jF-nsMinF1)*nZnT): lu_e, lu_o. Profiles:
@@ -72,8 +72,8 @@ inline void ComputeHybridLambdaForce(
         }
       }
 
-      double gvv_gsqrt_lu_e =
-          0.5 * (gvv_gsqrt_i[kl] + gvv_gsqrt_o) * lu_e[(jF - nsMinF1) * nZnT + kl];
+      double gvv_gsqrt_lu_e = 0.5 * (gvv_gsqrt_i[kl] + gvv_gsqrt_o) *
+                              lu_e[(jF - nsMinF1) * nZnT + kl];
       double gvv_gsqrt_lu_o =
           0.5 * (gvv_gsqrt_i[kl] * sqrtSHi + gvv_gsqrt_o * sqrtSHo) *
           lu_o[(jF - nsMinF1) * nZnT + kl];
diff --git a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
index ede5caacd..d014ec606 100644
--- a/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
+++ b/src/vmecpp/cpp/vmecpp/vmec/ideal_mhd_model/metric_kernel.h
@@ -14,12 +14,16 @@ namespace vmecpp {
 // IdealMhdModel::computeMetricElements and the Enzyme autodiff path. Same
 // indexing conventions as jacobian_kernel.h. sqrtSF is indexed jF - nsMinF1.
 inline void ComputeMetricElements(
-    const double* r1e, const double* r1o, const double* rue, const double* ruo,
-    const double* zue, const double* zuo, const double* rve, const double* rvo,
-    const double* zve, const double* zvo, const double* tau, const double* r12,
-    const double* sqrtSF, const double* sqrtSH, bool lthreed, int nZnT,
-    int nsMinF1, int nsMinH, int nsMaxH, double* gsqrt, double* guu,
-    double* guv, double* gvv) {
+    const double* __restrict r1e, const double* __restrict r1o,
+    const double* __restrict rue, const double* __restrict ruo,
+    const double* __restrict zue, const double* __restrict zuo,
+    const double* __restrict rve, const double* __restrict rvo,
+    const double* __restrict zve, const double* __restrict zvo,
+    const double* __restrict tau, const double* __restrict r12,
+    const double* __restrict sqrtSF, const double* __restrict sqrtSH,
+    bool lthreed, int nZnT, int nsMinF1, int nsMinH, int nsMaxH,
+    double* __restrict gsqrt, double* __restrict guu, double* __restrict guv,
+    double* __restrict gvv) {
   for (int jH = nsMinH; jH < nsMaxH; ++jH) {
     const double sF_i = sqrtSF[jH - nsMinF1] * sqrtSF[jH - nsMinF1];
     const double sF_o = sqrtSF[jH + 1 - nsMinF1] * sqrtSF[jH + 1 - nsMinF1];
diff --git a/tests/test_simsopt_compat.py b/tests/test_simsopt_compat.py
index 609dffea2..1443d8cbc 100644
--- a/tests/test_simsopt_compat.py
+++ b/tests/test_simsopt_compat.py
@@ -303,6 +303,11 @@ def test_ensure_vmec2000_input_from_vmecpp_input():
         if varname[1:-1] == "axis_":
             # these are called differently in VMEC2000, e.g. raxis_c -> raxis_cc
             varname_vmec2000 = f"{varname[:-1]}c{varname[-1]}"
+        if not hasattr(vmec2000.indata, varname_vmec2000):
+            # vmecpp-only field (e.g. free_boundary_method) with no counterpart
+            # in the legacy VMEC2000 INDATA namelist; not part of the common
+            # subset under test.
+            continue
         vmec2000_var = getattr(vmec2000.indata, varname_vmec2000)
 
         if vmecpp_var is None: